diff options
| author | Cherry Mui <cherryyz@google.com> | 2025-11-24 16:02:01 -0500 |
|---|---|---|
| committer | Cherry Mui <cherryyz@google.com> | 2025-11-24 16:02:08 -0500 |
| commit | d4f5650cc5cdc5fa559491991208f8563bd6f3b8 (patch) | |
| tree | 723d49351a1fc8ee7f295cd69bfe8ef7501cfe60 /src/simd/internal | |
| parent | e704b0993b69b564d7a0f515cf206f2cc9f1a342 (diff) | |
| parent | 7d65463a5431063490a229a80f1d4c6cc19a2169 (diff) | |
| download | go-d4f5650cc5cdc5fa559491991208f8563bd6f3b8.tar.xz | |
all: REVERSE MERGE dev.simd (7d65463) into master
This commit is a REVERSE MERGE.
It merges dev.simd back into its parent branch, master.
The development of simd will continue on (only) dev.simd,
and it will be merged to the master branch when necessary.
Merge List:
+ 2025-11-24 7d65463a54 [dev.simd] all: merge master (e704b09) into dev.simd
+ 2025-11-24 afd1721fc5 [dev.simd] all: merge master (02d1f3a) into dev.simd
+ 2025-11-24 a9914886da [dev.simd] internal/buildcfg: don't enable SIMD experiment by default
+ 2025-11-24 61a5a6b016 [dev.simd] simd: add goexperiment tag to generate.go
+ 2025-11-24 f045ed4110 [dev.simd] go/doc/comment: don't include experimental packages in std list
+ 2025-11-24 220d73cc44 [dev.simd] all: merge master (8dd5b13) into dev.simd
+ 2025-11-24 0c69e77343 Revert "[dev.simd] internal/runtime/gc: add simd package based greentea kernels"
+ 2025-11-21 da92168ec8 [dev.simd] internal/runtime/gc: add simd package based greentea kernels
+ 2025-11-21 3fdd183aef [dev.simd] cmd/compile, simd: update conversion API names
+ 2025-11-21 d3a0321dba [dev.simd] cmd/compile: fix incorrect mapping of SHA256MSG2128
+ 2025-11-20 74ebdd28d1 [dev.simd] simd, cmd/compile: add more element types for Select128FromPair
+ 2025-11-20 4d26d66a49 [dev.simd] simd: fix signatures for PermuteConstant* methods
+ 2025-11-20 e3d4645693 [dev.simd] all: merge master (ca37d24) into dev.simd
+ 2025-11-20 95b4ad525f [dev.simd] simd: reorganize internal tests so that simd does not import testing
+ 2025-11-18 3fe246ae0f [dev.simd] simd: make 'go generate' generate everything
+ 2025-11-18 cf45adf140 [dev.simd] simd: move template code generator into _gen
+ 2025-11-18 19b4a30899 [dev.simd] simd/_gen/simdgen: remove outdated asm.yaml.toy
+ 2025-11-18 9461db5c59 [dev.simd] simd: fix comment in file generator
+ 2025-11-18 4004ff3523 [dev.simd] simd: remove FlattenedTranspose from exports
+ 2025-11-18 896f293a25 [dev.simd] cmd/compile, simd: change DotProductQuadruple and add peepholes
+ 2025-11-18 be9c50c6a0 [dev.simd] cmd/compile, simd: change SHA ops names and types
+ 2025-11-17 0978935a99 [dev.simd] cmd/compile, simd: change AES op names and add missing size
+ 2025-11-17 95871e4a00 [dev.simd] cmd/compile, simd: add VPALIGNR
+ 2025-11-17 934dbcea1a [dev.simd] simd: update CPU feature APIs
+ 2025-11-17 e4d9484220 [dev.simd] cmd/compile: fix unstable output
+ 2025-11-13 d7a0c45642 [dev.simd] all: merge master (57362e9) into dev.simd
+ 2025-11-11 86b4fe31d9 [dev.simd] cmd/compile: add masked merging ops and optimizations
+ 2025-11-10 771a1dc216 [dev.simd] cmd/compile: add peepholes for all masked ops and bug fixes
+ 2025-11-10 972732b245 [dev.simd] simd, cmd/compile: remove move from API
+ 2025-11-10 bf77323efa [dev.simd] simd: put unexported methods to another file
+ 2025-11-04 fe040658b2 [dev.simd] simd/_gen: fix sorting ops slices
+ 2025-10-29 e452f4ac7d [dev.simd] cmd/compile: enhance inlining for closure-of-SIMD
+ 2025-10-27 ca1264ac50 [dev.simd] test: add some trickier cases to ternary-boolean simd test
+ 2025-10-24 f6b4711095 [dev.simd] cmd/compile, simd: add rewrite to convert logical expression trees into TERNLOG instructions
+ 2025-10-24 cf7c1a4cbb [dev.simd] cmd/compile, simd: add SHA features
+ 2025-10-24 2b8eded4f4 [dev.simd] simd/_gen: parse SHA features from XED
+ 2025-10-24 c75965b666 [dev.simd] simd: added String() method to SIMD vectors.
+ 2025-10-22 d03634f807 [dev.simd] cmd/compile, simd: add definitions for VPTERNLOG[DQ]
+ 2025-10-20 20b3339542 [dev.simd] simd: add AES feature check
+ 2025-10-14 fc3bc49337 [dev.simd] simd: clean up mask load comments
+ 2025-10-14 416332dba2 [dev.simd] cmd/compile, simd: update DotProd to DotProduct
+ 2025-10-14 647c790143 [dev.simd] cmd/compile: peephole simd mask load/stores from bits
+ 2025-10-14 2e71cf1a2a [dev.simd] cmd/compile, simd: remove mask load and stores
+ 2025-10-13 c4fbf3b4cf [dev.simd] simd/_gen: add mem peephole with feat mismatches
+ 2025-10-13 ba72ee0f30 [dev.simd] cmd/compile: more support for cpufeatures
+ 2025-10-09 be57d94c4c [dev.simd] simd: add emulated Not method
+ 2025-10-07 d2270bccbd [dev.simd] cmd/compile: track which CPU features are in scope
+ 2025-10-03 48756abd3a [dev.simd] cmd/compile: inliner tweaks to favor simd-handling functions
+ 2025-10-03 fb1749a3fe [dev.simd] all: merge master (adce7f1) into dev.simd
+ 2025-09-30 703a5fbaad [dev.simd] cmd/compile, simd: add AES instructions
+ 2025-09-29 1c961c2fb2 [dev.simd] simd: use new data movement instructions to do "fast" transposes
+ 2025-09-26 fe4af1c067 [dev.simd] simd: repair broken comments in generated ops_amd64.go
+ 2025-09-26 ea3b2ecd28 [dev.simd] cmd/compile, simd: add 64-bit select-from-pair methods
+ 2025-09-26 25c36b95d1 [dev.simd] simd, cmd/compile: add 128 bit select-from-pair
+ 2025-09-26 f0e281e693 [dev.simd] cmd/compile: don't require single use for SIMD load/store folding
+ 2025-09-26 b4d1e018a8 [dev.simd] cmd/compile: remove unnecessary code from early simd prototype
+ 2025-09-26 578777bf7c [dev.simd] cmd/compile: make condtion of CanSSA smarter for SIMD fields
+ 2025-09-26 c28b2a0ca1 [dev.simd] simd: generalize select-float32-from-pair
+ 2025-09-25 a693ae1e9a [dev.simd] all: merge master (d70ad4e) into dev.simd
+ 2025-09-25 5a78e1a4a1 [dev.simd] simd, cmd/compile: mark simd vectors uncomparable
+ 2025-09-23 bf00f5dfd6 [dev.simd] simd, cmd/compile: added simd methods for VSHUFP[DS]
+ 2025-09-23 8e60feeb41 [dev.simd] cmd/compile: improve slicemask removal
+ 2025-09-23 2b50ffe172 [dev.simd] cmd/compile: remove stores to unread parameters
+ 2025-09-23 2d8cb80d7c [dev.simd] all: merge master (9b2d39b) into dev.simd
+ 2025-09-22 63a09d6d3d [dev.simd] cmd/compile: fix SIMD const rematerialization condition
+ 2025-09-20 2ca96d218d [dev.simd] cmd/compile: enhance prove to infer bounds in slice len/cap calculations
+ 2025-09-19 c0f031fcc3 [dev.simd] cmd/compile: spill the correct SIMD register for morestack
+ 2025-09-19 58fa1d023e [dev.simd] cmd/compile: enhance the chunked indexing case to include reslicing
+ 2025-09-18 7ae0eb2e80 [dev.simd] cmd/compile: remove Add32x4 generic op
+ 2025-09-18 31b664d40b [dev.simd] cmd/compile: widen index for simd intrinsics jumptable
+ 2025-09-18 e34ad6de42 [dev.simd] cmd/compile: optimize VPTEST for 2-operand cases
+ 2025-09-18 f1e3651c33 [dev.simd] cmd/compile, simd: add VPTEST
+ 2025-09-18 d9751166a6 [dev.simd] cmd/compile: handle rematerialized op for incompatible reg constraint
+ 2025-09-18 4eb5c6e07b [dev.simd] cmd/compile, simd/_gen: add rewrite for const load ops
+ 2025-09-18 443b7aeddb [dev.simd] cmd/compile, simd/_gen: make rewrite rules consistent on CPU Features
+ 2025-09-16 bdd30e25ca [dev.simd] all: merge master (ca0e035) into dev.simd
+ 2025-09-16 0e590a505d [dev.simd] cmd/compile: use the right type for spill slot
+ 2025-09-15 dabe2bb4fb [dev.simd] cmd/compile: fix holes in mask peepholes
+ 2025-09-12 3ec0b25ab7 [dev.simd] cmd/compile, simd/_gen/simdgen: add const load mops
+ 2025-09-12 1e5631d4e0 [dev.simd] cmd/compile: peephole simd load
+ 2025-09-11 48f366d826 [dev.simd] cmd/compile: add memop peephole rules
+ 2025-09-11 9a349f8e72 [dev.simd] all: merge master (cf5e993) into dev.simd
+ 2025-09-11 5a0446d449 [dev.simd] simd/_gen/simdgen, cmd/compile: add memory op machine ops
+ 2025-09-08 c39b2fdd1e [dev.simd] cmd/compile, simd: add VPLZCNT[DQ]
+ 2025-09-07 832c1f76dc [dev.simd] cmd/compile: enhance prove to deal with double-offset IsInBounds checks
+ 2025-09-06 0b323350a5 [dev.simd] simd/_gen/simdgen: merge memory ops
+ 2025-09-06 f42c9261d3 [dev.simd] simd/_gen/simdgen: parse memory operands
+ 2025-09-05 356c48d8e9 [dev.simd] cmd/compile, simd: add ClearAVXUpperBits
+ 2025-09-03 7c8b9115bc [dev.simd] all: merge master (4c4cefc) into dev.simd
+ 2025-09-02 9125351583 [dev.simd] internal/cpu: report AVX1 and 2 as supported on macOS 15 Rosetta 2
+ 2025-09-02 b509516b2e [dev.simd] simd, cmd/compile: add Interleave{Hi,Lo} (VPUNPCK*)
+ 2025-09-02 6890aa2e20 [dev.simd] cmd/compile: add instructions and rewrites for scalar-> vector moves
+ 2025-08-24 5ebe2d05d5 [dev.simd] simd: correct SumAbsDiff documentation
+ 2025-08-22 a5137ec92a [dev.simd] cmd/compile: sample peephole optimization for SIMD broadcast
+ 2025-08-22 83714616aa [dev.simd] cmd/compile: remove VPADDD4
+ 2025-08-22 4a3ea146ae [dev.simd] cmd/compile: correct register mask of some AVX512 ops
+ 2025-08-22 8d874834f1 [dev.simd] cmd/compile: use X15 for zero value in AVX context
+ 2025-08-22 4c311aa38f [dev.simd] cmd/compile: ensure the whole X15 register is zeroed
+ 2025-08-22 baea0c700b [dev.simd] cmd/compile, simd: complete AVX2? u?int shuffles
+ 2025-08-22 fa1e78c9ad [dev.simd] cmd/compile, simd: make Permute 128-bit use AVX VPSHUFB
+ 2025-08-22 bc217d4170 [dev.simd] cmd/compile, simd: add packed saturated u?int conversions
+ 2025-08-22 4fa23b0d29 [dev.simd] cmd/compile, simd: add saturated u?int conversions
+ 2025-08-21 3f6bab5791 [dev.simd] simd: move tests to a subdirectory to declutter "simd"
+ 2025-08-21 aea0a5e8d7 [dev.simd] simd/_gen/unify: improve envSet doc comment
+ 2025-08-21 7fdb1da6b0 [dev.simd] cmd/compile, simd: complete truncating u?int conversions.
+ 2025-08-21 f4c41d9922 [dev.simd] cmd/compile, simd: complete u?int widening conversions
+ 2025-08-21 6af8881adb [dev.simd] simd: reorganize cvt rules
+ 2025-08-21 58cfc2a5f6 [dev.simd] cmd/compile, simd: add VPSADBW
+ 2025-08-21 f7c6fa709e [dev.simd] simd/_gen/unify: fix some missing environments
+ 2025-08-20 7c84e984e6 [dev.simd] cmd/compile: rewrite to elide Slicemask from len==c>0 slicing
+ 2025-08-20 cf31b15635 [dev.simd] simd, cmd/compile: added .Masked() peephole opt for many operations.
+ 2025-08-20 1334285862 [dev.simd] simd: template field name cleanup in genfiles
+ 2025-08-20 af6475df73 [dev.simd] simd: add testing hooks for size-changing conversions
+ 2025-08-20 ede64cf0d8 [dev.simd] simd, cmd/compile: sample peephole optimization for .Masked()
+ 2025-08-20 103b6e39ca [dev.simd] all: merge master (9de69f6) into dev.simd
+ 2025-08-20 728ac3e050 [dev.simd] simd: tweaks to improve test disassembly
+ 2025-08-20 4fce49b86c [dev.simd] simd, cmd/compile: add widening unsigned converts 8->16->32
+ 2025-08-19 0f660d675f [dev.simd] simd: make OpMasked machine ops only
+ 2025-08-19 a034826e26 [dev.simd] simd, cmd/compile: implement ToMask, unexport asMask.
+ 2025-08-18 8ccd6c2034 [dev.simd] simd, cmd/compile: mark BLEND instructions as not-zero-mask
+ 2025-08-18 9a934d5080 [dev.simd] cmd/compile, simd: added methods for "float" GetElem
+ 2025-08-15 7380213a4e [dev.simd] cmd/compile: make move/load/store dependent only on reg and width
+ 2025-08-15 908e3e8166 [dev.simd] cmd/compile: make (most) move/load/store lowering use reg and width only
+ 2025-08-14 9783f86bc8 [dev.simd] cmd/compile: accounts rematerialize ops's output reginfo
+ 2025-08-14 a4ad41708d [dev.simd] all: merge master (924fe98) into dev.simd
+ 2025-08-13 8b90d48d8c [dev.simd] simd/_gen/simdgen: rewrite etetest.sh
+ 2025-08-13 b7c8698549 [dev.simd] simd/_gen: migrate simdgen from x/arch
+ 2025-08-13 257c1356ec [dev.simd] go/types: exclude simd/_gen module from TestStdlib
+ 2025-08-13 858a8d2276 [dev.simd] simd: reorganize/rename generated emulation files
+ 2025-08-13 2080415aa2 [dev.simd] simd: add emulations for missing AVX2 comparisons
+ 2025-08-13 ddb689c7bb [dev.simd] simd, cmd/compile: generated code for Broadcast
+ 2025-08-13 e001300cf2 [dev.simd] cmd/compile: fix LoadReg so it is aware of register target
+ 2025-08-13 d5dea86993 [dev.simd] cmd/compile: fix isIntrinsic for methods; fix fp <-> gp moves
+ 2025-08-13 08ab8e24a3 [dev.simd] cmd/compile: generated code from 'fix generated rules for shifts'
+ 2025-08-11 702ee2d51e [dev.simd] cmd/compile, simd: update generated files
+ 2025-08-11 e33eb1a7a5 [dev.simd] cmd/compile, simd: update generated files
+ 2025-08-11 667add4f1c [dev.simd] cmd/compile, simd: update generated files
+ 2025-08-11 1755c2909d [dev.simd] cmd/compile, simd: update generated files
+ 2025-08-11 2fd49d8f30 [dev.simd] simd: imm doc improve
+ 2025-08-11 ce0e803ab9 [dev.simd] cmd/compile: keep track of multiple rule file names in ssa/_gen
+ 2025-08-11 38b76bf2a3 [dev.simd] cmd/compile, simd: jump table for imm ops
+ 2025-08-08 94d72355f6 [dev.simd] simd: add emulations for bitwise ops and for mask/merge methods
+ 2025-08-07 8eb5f6020e [dev.simd] cmd/compile, simd: API interface fixes
+ 2025-08-07 b226bcc4a9 [dev.simd] cmd/compile, simd: add value conversion ToBits for mask
+ 2025-08-06 5b0ef7fcdc [dev.simd] cmd/compile, simd: add Expand
+ 2025-08-06 d3cf582f8a [dev.simd] cmd/compile, simd: (Set|Get)(Lo|Hi)
+ 2025-08-05 7ca34599ec [dev.simd] simd, cmd/compile: generated files to add 'blend' and 'blendMasked'
+ 2025-08-05 82d056ddd7 [dev.simd] cmd/compile: add ShiftAll immediate variant
+ 2025-08-04 775fb52745 [dev.simd] all: merge master (7a1679d) into dev.simd
+ 2025-08-04 6b9b59e144 [dev.simd] simd, cmd/compile: rename some methods
+ 2025-08-04 d375b95357 [dev.simd] simd: move lots of slice functions and methods to generated code
+ 2025-08-04 3f92aa1eca [dev.simd] cmd/compile, simd: make bitwise logic ops available to all u?int vectors
+ 2025-08-04 c2d775d401 [dev.simd] cmd/compile, simd: change PairDotProdAccumulate to AddDotProd
+ 2025-08-04 2c25f3e846 [dev.simd] cmd/compile, simd: change Shift*AndFillUpperFrom to Shift*Concat
+ 2025-08-01 c25e5c86b2 [dev.simd] cmd/compile: generated code for K-mask-register slice load/stores
+ 2025-08-01 1ac5f3533f [dev.simd] cmd/compile: opcodes and rules and code generation to enable AVX512 masked loads/stores
+ 2025-08-01 f39711a03d [dev.simd] cmd/compile: test for int-to-mask conversion
+ 2025-08-01 08bec02907 [dev.simd] cmd/compile: add register-to-mask moves, other simd glue
+ 2025-08-01 09ff25e350 [dev.simd] simd: add tests for simd conversions to Int32/Uint32.
+ 2025-08-01 a24ffe3379 [dev.simd] simd: modify test generation to make it more flexible
+ 2025-08-01 ec5c20ba5a [dev.simd] cmd/compile: generated simd code to add some conversions
+ 2025-08-01 e62e377ed6 [dev.simd] cmd/compile, simd: generated code from repaired simdgen sort
+ 2025-08-01 761894d4a5 [dev.simd] simd: add partial slice load/store for 32/64-bits on AVX2
+ 2025-08-01 acc1492b7d [dev.simd] cmd/compile: Generated code for AVX2 SIMD masked load/store
+ 2025-08-01 a0b87a7478 [dev.simd] cmd/compile: changes for AVX2 SIMD masked load/store
+ 2025-08-01 88568519b4 [dev.simd] simd: move test generation into Go repo
+ 2025-07-31 6f7a1164e7 [dev.simd] cmd/compile, simd: support store to bits for mask
+ 2025-07-21 41054cdb1c [dev.simd] simd, internal/cpu: support more AVX CPU Feature checks
+ 2025-07-21 957f06c410 [dev.simd] cmd/compile, simd: support load from bits for mask
+ 2025-07-21 f0e9dc0975 [dev.simd] cmd/compile: fix opLen(2|3)Imm8_2I intrinsic function
+ 2025-07-17 03a3887f31 [dev.simd] simd: clean up masked op doc
+ 2025-07-17 c61743e4f0 [dev.simd] cmd/compile, simd: reorder PairDotProdAccumulate
+ 2025-07-15 ef5f6cc921 [dev.simd] cmd/compile: adjust param order for AndNot
+ 2025-07-15 6d10680141 [dev.simd] cmd/compile, simd: add Compress
+ 2025-07-15 17baae72db [dev.simd] simd: default mask param's name to mask
+ 2025-07-15 01f7f57025 [dev.simd] cmd/compile, simd: add variable Permute
+ 2025-07-14 f5f42753ab [dev.simd] cmd/compile, simd: add VDPPS
+ 2025-07-14 08ffd66ab2 [dev.simd] simd: updates CPU Feature in doc
+ 2025-07-14 3f789721d6 [dev.simd] cmd/compile: mark SIMD types non-fat
+ 2025-07-11 b69622b83e [dev.simd] cmd/compile, simd: adjust Shift.* operations
+ 2025-07-11 4993a91ae1 [dev.simd] simd: change imm param name to constant
+ 2025-07-11 bbb6dccd84 [dev.simd] simd: fix documentations
+ 2025-07-11 1440ff7036 [dev.simd] cmd/compile: exclude simd vars from merge local
+ 2025-07-11 ccb43dcec7 [dev.simd] cmd/compile: add VZEROUPPER and VZEROALL inst
+ 2025-07-11 21596f2f75 [dev.simd] all: merge master (88cf0c5) into dev.simd
+ 2025-07-10 ab7f839280 [dev.simd] cmd/compile: fix maskreg/simdreg chaos
+ 2025-07-09 47b07a87a6 [dev.simd] cmd/compile, simd: fix Int64x2 Greater output type to mask
+ 2025-07-09 08cd62e9f5 [dev.simd] cmd/compile: remove X15 from register mask
+ 2025-07-09 9ea33ed538 [dev.simd] cmd/compile: output of simd generator, more ... rewrite rules
+ 2025-07-09 aab8b173a9 [dev.simd] cmd/compile, simd: Int64x2 Greater and Uint* Equal
+ 2025-07-09 8db7f41674 [dev.simd] cmd/compile: use upper registers for AVX512 simd ops
+ 2025-07-09 574854fd86 [dev.simd] runtime: save Z16-Z31 registers in async preempt
+ 2025-07-09 5429328b0c [dev.simd] cmd/compile: change register mask names for simd ops
+ 2025-07-09 029d7ec3e9 [dev.simd] cmd/compile, simd: rename Masked$OP to $(OP)Masked.
+ 2025-07-09 983e81ce57 [dev.simd] simd: rename stubs_amd64.go to ops_amd64.go
+ 2025-07-08 56ca67682b [dev.simd] cmd/compile, simd: remove FP bitwise logic operations.
+ 2025-07-08 0870ed04a3 [dev.simd] cmd/compile: make compares between NaNs all false.
+ 2025-07-08 24f2b8ae2e [dev.simd] simd: {Int,Uint}{8x{16,32},16x{8,16}} subvector loads/stores from slices.
+ 2025-07-08 2bb45cb8a5 [dev.simd] cmd/compile: minor tweak for race detector
+ 2025-07-07 43a61aef56 [dev.simd] cmd/compile: add EXTRACT[IF]128 instructions
+ 2025-07-07 292db9b676 [dev.simd] cmd/compile: add INSERT[IF]128 instructions
+ 2025-07-07 d8fa853b37 [dev.simd] cmd/compile: make regalloc simd aware on copy
+ 2025-07-07 dfd75f82d4 [dev.simd] cmd/compile: output of simdgen with invariant type order
+ 2025-07-04 72c39ef834 [dev.simd] cmd/compile: fix the "always panic" code to actually panic
+ 2025-07-01 1ee72a15a3 [dev.simd] internal/cpu: add GFNI feature check
+ 2025-06-30 0710cce6eb [dev.simd] runtime: remove write barrier in xRegRestore
+ 2025-06-30 59846af331 [dev.simd] cmd/compile, simd: cleanup operations and documentations
+ 2025-06-30 f849225b3b [dev.simd] all: merge master (740857f) into dev.simd
+ 2025-06-30 9eeb1e7a9a [dev.simd] runtime: save AVX2 and AVX-512 state on asynchronous preemption
+ 2025-06-30 426cf36b4d [dev.simd] runtime: save scalar registers off stack in amd64 async preemption
+ 2025-06-30 ead249a2e2 [dev.simd] cmd/compile: reorder operands for some simd operations
+ 2025-06-30 55665e1e37 [dev.simd] cmd/compile: undoes reorder transform in prior commit, changes names
+ 2025-06-26 10c9621936 [dev.simd] cmd/compile, simd: add galois field operations
+ 2025-06-26 e61ebfce56 [dev.simd] cmd/compile, simd: add shift operations
+ 2025-06-26 35b8cf7fed [dev.simd] cmd/compile: tweak sort order in generator
+ 2025-06-26 7fadfa9638 [dev.simd] cmd/compile: add simd VPEXTRA*
+ 2025-06-26 0d8cb89f5c [dev.simd] cmd/compile: support simd(imm,fp) returns gp
+ 2025-06-25 f4a7c124cc [dev.simd] all: merge master (f8ccda2) into dev.simd
+ 2025-06-25 4fda27c0cc [dev.simd] cmd/compile: glue codes for Shift and Rotate
+ 2025-06-24 61c1183342 [dev.simd] simd: add test wrappers
+ 2025-06-23 e32488003d [dev.simd] cmd/compile: make simd regmask naming more like existing conventions
+ 2025-06-23 1fa4bcfcda [dev.simd] simd, cmd/compile: generated code for VPINSR[BWDQ], and test
+ 2025-06-23 dd63b7aa0e [dev.simd] simd: add AVX512 aggregated check
+ 2025-06-23 0cdb2697d1 [dev.simd] simd: add tests for intrinsic used as a func value and via reflection
+ 2025-06-23 88c013d6ff [dev.simd] cmd/compile: generate function body for bodyless intrinsics
+ 2025-06-20 a8669c78f5 [dev.simd] sync: correct the type of runtime_StoreReluintptr
+ 2025-06-20 7c6ac35275 [dev.simd] cmd/compile: add simdFp1gp1fp1Imm8 helper to amd64 code generation
+ 2025-06-20 4150372a5d [dev.simd] cmd/compile: don't treat devel compiler as a released compiler
+ 2025-06-18 1b87d52549 [dev.simd] cmd/compile: add fp1gp1fp1 register mask for AMD64
+ 2025-06-18 1313521f75 [dev.simd] cmd/compile: remove fused mul/add/sub shapes.
+ 2025-06-17 1be5eb2686 [dev.simd] cmd/compile: fix signature error of PairDotProdAccumulate.
+ 2025-06-17 3a4d10bfca [dev.simd] cmd/compile: removed a map iteration from generator; tweaked type order
+ 2025-06-17 21d6573154 [dev.simd] cmd/compile: alphabetize SIMD intrinsics
+ 2025-06-16 ee1d9f3f85 [dev.simd] cmd/compile: reorder stubs
+ 2025-06-13 6c50c8b892 [dev.simd] cmd/compile: move simd helpers into compiler, out of generated code
+ 2025-06-13 7392dfd43e [dev.simd] cmd/compile: generated simd*ops files weren't up to date
+ 2025-06-13 00a8dacbe4 [dev.simd] cmd/compile: remove unused simd intrinsics "helpers"
+ 2025-06-13 b9a548775f cmd/compile: add up-to-date test for generated files
+ 2025-06-13 ca01eab9c7 [dev.simd] cmd/compile: add fused mul add sub ops
+ 2025-06-13 ded6e0ac71 [dev.simd] cmd/compile: add more dot products
+ 2025-06-13 3df41c856e [dev.simd] simd: update documentations
+ 2025-06-13 9ba7db36b5 [dev.simd] cmd/compile: add dot product ops
+ 2025-06-13 34a9cdef87 [dev.simd] cmd/compile: add round simd ops
+ 2025-06-13 5289e0f24e [dev.simd] cmd/compile: updates simd ordering and docs
+ 2025-06-13 c81cb05e3e [dev.simd] cmd/compile: add simdGen prog writer
+ 2025-06-13 9b9af3d638 [dev.simd] internal/cpu: add AVX-512-CD and DQ, and derived "basic AVX-512"
+ 2025-06-13 dfa6c74263 [dev.simd] runtime: eliminate global state in mkpreempt.go
+ 2025-06-10 b2e8ddba3c [dev.simd] all: merge master (773701a) into dev.simd
+ 2025-06-09 884f646966 [dev.simd] cmd/compile: add fp3m1fp1 shape to regalloc
+ 2025-06-09 6bc3505773 [dev.simd] cmd/compile: add fp3fp1 regsiter shape
+ 2025-06-05 2eaa5a0703 [dev.simd] simd: add functions+methods to load-from/store-to slices
+ 2025-06-05 8ecbd59ebb [dev.simd] cmd/compile: generated codes for amd64 SIMD
+ 2025-06-02 baa72c25f1 [dev.simd] all: merge master (711ff94) into dev.simd
+ 2025-05-30 0ff18a9cca [dev.simd] cmd/compile: disable intrinsics test for new simd stuff
+ 2025-05-30 7800f3813c [dev.simd] cmd/compile: flip sense of intrinsics test for SIMD
+ 2025-05-29 eba2430c16 [dev.simd] simd, cmd/compile, go build, go/doc: test tweaks
+ 2025-05-29 71c0e550cd [dev.simd] cmd/dist: disable API check on dev branch
+ 2025-05-29 62e1fccfb9 [dev.simd] internal: delete unused internal/simd directory
+ 2025-05-29 1161228bf1 [dev.simd] cmd/compile: add a fp1m1fp1 register shape to amd64
+ 2025-05-28 fdb067d946 [dev.simd] simd: initialize directory to make it suitable for testing SIMD
+ 2025-05-28 11d2b28bff [dev.simd] cmd/compile: add and fix k register supports
+ 2025-05-28 04b1030ae4 [dev.simd] cmd/compile: adapters for simd
+ 2025-05-27 2ef7106881 [dev.simd] internal/buildcfg: enable SIMD GOEXPERIMENT for amd64
+ 2025-05-22 4d2c71ebf9 [dev.simd] internal/goexperiment: add SIMD goexperiment
+ 2025-05-22 3ac5f2f962 [dev.simd] codereview.cfg: set up dev.simd branch
Change-Id: I60f2cd2ea055384a3788097738c6989630207871
Diffstat (limited to 'src/simd/internal')
| -rw-r--r-- | src/simd/internal/simd_test/binary_helpers_test.go | 464 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/binary_test.go | 361 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/compare_helpers_test.go | 464 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/compare_test.go | 265 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/comparemasked_helpers_test.go | 734 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/generate.go | 11 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/helpers_test.go | 239 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/simd_test.go | 1248 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/simulation_helpers_test.go | 274 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/slicepart_test.go | 390 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/ternary_helpers_test.go | 545 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/ternary_test.go | 23 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/transpose_test.go | 868 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/unary_helpers_test.go | 1439 | ||||
| -rw-r--r-- | src/simd/internal/simd_test/unary_test.go | 137 | ||||
| -rw-r--r-- | src/simd/internal/test_helpers/checkslices.go | 123 |
16 files changed, 7585 insertions, 0 deletions
diff --git a/src/simd/internal/simd_test/binary_helpers_test.go b/src/simd/internal/simd_test/binary_helpers_test.go new file mode 100644 index 0000000000..82cf784bca --- /dev/null +++ b/src/simd/internal/simd_test/binary_helpers_test.go @@ -0,0 +1,464 @@ +// Code generated by 'go run genfiles.go'; DO NOT EDIT. + +//go:build goexperiment.simd + +// This file contains functions testing binary simd methods. +// Each function in this file is specialized for a +// particular simd type <BaseType><Width>x<Count>. + +package simd_test + +import ( + "simd" + "testing" +) + +// testInt8x16Binary tests the simd binary method f against the expected behavior generated by want +func testInt8x16Binary(t *testing.T, f func(_, _ simd.Int8x16) simd.Int8x16, want func(_, _ []int8) []int8) { + n := 16 + t.Helper() + forSlicePair(t, int8s, n, func(x, y []int8) bool { + t.Helper() + a := simd.LoadInt8x16Slice(x) + b := simd.LoadInt8x16Slice(y) + g := make([]int8, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt16x8Binary tests the simd binary method f against the expected behavior generated by want +func testInt16x8Binary(t *testing.T, f func(_, _ simd.Int16x8) simd.Int16x8, want func(_, _ []int16) []int16) { + n := 8 + t.Helper() + forSlicePair(t, int16s, n, func(x, y []int16) bool { + t.Helper() + a := simd.LoadInt16x8Slice(x) + b := simd.LoadInt16x8Slice(y) + g := make([]int16, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt32x4Binary tests the simd binary method f against the expected behavior generated by want +func testInt32x4Binary(t *testing.T, f func(_, _ simd.Int32x4) simd.Int32x4, want func(_, _ []int32) []int32) { + n := 4 + t.Helper() + forSlicePair(t, int32s, n, func(x, y []int32) bool { + t.Helper() + a := simd.LoadInt32x4Slice(x) 
+ b := simd.LoadInt32x4Slice(y) + g := make([]int32, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt64x2Binary tests the simd binary method f against the expected behavior generated by want +func testInt64x2Binary(t *testing.T, f func(_, _ simd.Int64x2) simd.Int64x2, want func(_, _ []int64) []int64) { + n := 2 + t.Helper() + forSlicePair(t, int64s, n, func(x, y []int64) bool { + t.Helper() + a := simd.LoadInt64x2Slice(x) + b := simd.LoadInt64x2Slice(y) + g := make([]int64, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint8x16Binary tests the simd binary method f against the expected behavior generated by want +func testUint8x16Binary(t *testing.T, f func(_, _ simd.Uint8x16) simd.Uint8x16, want func(_, _ []uint8) []uint8) { + n := 16 + t.Helper() + forSlicePair(t, uint8s, n, func(x, y []uint8) bool { + t.Helper() + a := simd.LoadUint8x16Slice(x) + b := simd.LoadUint8x16Slice(y) + g := make([]uint8, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint16x8Binary tests the simd binary method f against the expected behavior generated by want +func testUint16x8Binary(t *testing.T, f func(_, _ simd.Uint16x8) simd.Uint16x8, want func(_, _ []uint16) []uint16) { + n := 8 + t.Helper() + forSlicePair(t, uint16s, n, func(x, y []uint16) bool { + t.Helper() + a := simd.LoadUint16x8Slice(x) + b := simd.LoadUint16x8Slice(y) + g := make([]uint16, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint32x4Binary tests the simd binary method f against the expected behavior generated by want +func 
testUint32x4Binary(t *testing.T, f func(_, _ simd.Uint32x4) simd.Uint32x4, want func(_, _ []uint32) []uint32) { + n := 4 + t.Helper() + forSlicePair(t, uint32s, n, func(x, y []uint32) bool { + t.Helper() + a := simd.LoadUint32x4Slice(x) + b := simd.LoadUint32x4Slice(y) + g := make([]uint32, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint64x2Binary tests the simd binary method f against the expected behavior generated by want +func testUint64x2Binary(t *testing.T, f func(_, _ simd.Uint64x2) simd.Uint64x2, want func(_, _ []uint64) []uint64) { + n := 2 + t.Helper() + forSlicePair(t, uint64s, n, func(x, y []uint64) bool { + t.Helper() + a := simd.LoadUint64x2Slice(x) + b := simd.LoadUint64x2Slice(y) + g := make([]uint64, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testFloat32x4Binary tests the simd binary method f against the expected behavior generated by want +func testFloat32x4Binary(t *testing.T, f func(_, _ simd.Float32x4) simd.Float32x4, want func(_, _ []float32) []float32) { + n := 4 + t.Helper() + forSlicePair(t, float32s, n, func(x, y []float32) bool { + t.Helper() + a := simd.LoadFloat32x4Slice(x) + b := simd.LoadFloat32x4Slice(y) + g := make([]float32, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testFloat64x2Binary tests the simd binary method f against the expected behavior generated by want +func testFloat64x2Binary(t *testing.T, f func(_, _ simd.Float64x2) simd.Float64x2, want func(_, _ []float64) []float64) { + n := 2 + t.Helper() + forSlicePair(t, float64s, n, func(x, y []float64) bool { + t.Helper() + a := simd.LoadFloat64x2Slice(x) + b := simd.LoadFloat64x2Slice(y) + g := make([]float64, n) + 
f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt8x32Binary tests the simd binary method f against the expected behavior generated by want +func testInt8x32Binary(t *testing.T, f func(_, _ simd.Int8x32) simd.Int8x32, want func(_, _ []int8) []int8) { + n := 32 + t.Helper() + forSlicePair(t, int8s, n, func(x, y []int8) bool { + t.Helper() + a := simd.LoadInt8x32Slice(x) + b := simd.LoadInt8x32Slice(y) + g := make([]int8, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt16x16Binary tests the simd binary method f against the expected behavior generated by want +func testInt16x16Binary(t *testing.T, f func(_, _ simd.Int16x16) simd.Int16x16, want func(_, _ []int16) []int16) { + n := 16 + t.Helper() + forSlicePair(t, int16s, n, func(x, y []int16) bool { + t.Helper() + a := simd.LoadInt16x16Slice(x) + b := simd.LoadInt16x16Slice(y) + g := make([]int16, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt32x8Binary tests the simd binary method f against the expected behavior generated by want +func testInt32x8Binary(t *testing.T, f func(_, _ simd.Int32x8) simd.Int32x8, want func(_, _ []int32) []int32) { + n := 8 + t.Helper() + forSlicePair(t, int32s, n, func(x, y []int32) bool { + t.Helper() + a := simd.LoadInt32x8Slice(x) + b := simd.LoadInt32x8Slice(y) + g := make([]int32, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt64x4Binary tests the simd binary method f against the expected behavior generated by want +func testInt64x4Binary(t *testing.T, f func(_, _ simd.Int64x4) simd.Int64x4, want func(_, _ 
[]int64) []int64) { + n := 4 + t.Helper() + forSlicePair(t, int64s, n, func(x, y []int64) bool { + t.Helper() + a := simd.LoadInt64x4Slice(x) + b := simd.LoadInt64x4Slice(y) + g := make([]int64, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint8x32Binary tests the simd binary method f against the expected behavior generated by want +func testUint8x32Binary(t *testing.T, f func(_, _ simd.Uint8x32) simd.Uint8x32, want func(_, _ []uint8) []uint8) { + n := 32 + t.Helper() + forSlicePair(t, uint8s, n, func(x, y []uint8) bool { + t.Helper() + a := simd.LoadUint8x32Slice(x) + b := simd.LoadUint8x32Slice(y) + g := make([]uint8, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint16x16Binary tests the simd binary method f against the expected behavior generated by want +func testUint16x16Binary(t *testing.T, f func(_, _ simd.Uint16x16) simd.Uint16x16, want func(_, _ []uint16) []uint16) { + n := 16 + t.Helper() + forSlicePair(t, uint16s, n, func(x, y []uint16) bool { + t.Helper() + a := simd.LoadUint16x16Slice(x) + b := simd.LoadUint16x16Slice(y) + g := make([]uint16, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint32x8Binary tests the simd binary method f against the expected behavior generated by want +func testUint32x8Binary(t *testing.T, f func(_, _ simd.Uint32x8) simd.Uint32x8, want func(_, _ []uint32) []uint32) { + n := 8 + t.Helper() + forSlicePair(t, uint32s, n, func(x, y []uint32) bool { + t.Helper() + a := simd.LoadUint32x8Slice(x) + b := simd.LoadUint32x8Slice(y) + g := make([]uint32, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", 
x); t.Logf("y=%v", y) }) + }) +} + +// testUint64x4Binary tests the simd binary method f against the expected behavior generated by want +func testUint64x4Binary(t *testing.T, f func(_, _ simd.Uint64x4) simd.Uint64x4, want func(_, _ []uint64) []uint64) { + n := 4 + t.Helper() + forSlicePair(t, uint64s, n, func(x, y []uint64) bool { + t.Helper() + a := simd.LoadUint64x4Slice(x) + b := simd.LoadUint64x4Slice(y) + g := make([]uint64, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testFloat32x8Binary tests the simd binary method f against the expected behavior generated by want +func testFloat32x8Binary(t *testing.T, f func(_, _ simd.Float32x8) simd.Float32x8, want func(_, _ []float32) []float32) { + n := 8 + t.Helper() + forSlicePair(t, float32s, n, func(x, y []float32) bool { + t.Helper() + a := simd.LoadFloat32x8Slice(x) + b := simd.LoadFloat32x8Slice(y) + g := make([]float32, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testFloat64x4Binary tests the simd binary method f against the expected behavior generated by want +func testFloat64x4Binary(t *testing.T, f func(_, _ simd.Float64x4) simd.Float64x4, want func(_, _ []float64) []float64) { + n := 4 + t.Helper() + forSlicePair(t, float64s, n, func(x, y []float64) bool { + t.Helper() + a := simd.LoadFloat64x4Slice(x) + b := simd.LoadFloat64x4Slice(y) + g := make([]float64, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt8x64Binary tests the simd binary method f against the expected behavior generated by want +func testInt8x64Binary(t *testing.T, f func(_, _ simd.Int8x64) simd.Int8x64, want func(_, _ []int8) []int8) { + n := 64 + t.Helper() + forSlicePair(t, int8s, n, 
func(x, y []int8) bool { + t.Helper() + a := simd.LoadInt8x64Slice(x) + b := simd.LoadInt8x64Slice(y) + g := make([]int8, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt16x32Binary tests the simd binary method f against the expected behavior generated by want +func testInt16x32Binary(t *testing.T, f func(_, _ simd.Int16x32) simd.Int16x32, want func(_, _ []int16) []int16) { + n := 32 + t.Helper() + forSlicePair(t, int16s, n, func(x, y []int16) bool { + t.Helper() + a := simd.LoadInt16x32Slice(x) + b := simd.LoadInt16x32Slice(y) + g := make([]int16, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt32x16Binary tests the simd binary method f against the expected behavior generated by want +func testInt32x16Binary(t *testing.T, f func(_, _ simd.Int32x16) simd.Int32x16, want func(_, _ []int32) []int32) { + n := 16 + t.Helper() + forSlicePair(t, int32s, n, func(x, y []int32) bool { + t.Helper() + a := simd.LoadInt32x16Slice(x) + b := simd.LoadInt32x16Slice(y) + g := make([]int32, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt64x8Binary tests the simd binary method f against the expected behavior generated by want +func testInt64x8Binary(t *testing.T, f func(_, _ simd.Int64x8) simd.Int64x8, want func(_, _ []int64) []int64) { + n := 8 + t.Helper() + forSlicePair(t, int64s, n, func(x, y []int64) bool { + t.Helper() + a := simd.LoadInt64x8Slice(x) + b := simd.LoadInt64x8Slice(y) + g := make([]int64, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint8x64Binary tests the simd binary method f 
against the expected behavior generated by want +func testUint8x64Binary(t *testing.T, f func(_, _ simd.Uint8x64) simd.Uint8x64, want func(_, _ []uint8) []uint8) { + n := 64 + t.Helper() + forSlicePair(t, uint8s, n, func(x, y []uint8) bool { + t.Helper() + a := simd.LoadUint8x64Slice(x) + b := simd.LoadUint8x64Slice(y) + g := make([]uint8, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint16x32Binary tests the simd binary method f against the expected behavior generated by want +func testUint16x32Binary(t *testing.T, f func(_, _ simd.Uint16x32) simd.Uint16x32, want func(_, _ []uint16) []uint16) { + n := 32 + t.Helper() + forSlicePair(t, uint16s, n, func(x, y []uint16) bool { + t.Helper() + a := simd.LoadUint16x32Slice(x) + b := simd.LoadUint16x32Slice(y) + g := make([]uint16, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint32x16Binary tests the simd binary method f against the expected behavior generated by want +func testUint32x16Binary(t *testing.T, f func(_, _ simd.Uint32x16) simd.Uint32x16, want func(_, _ []uint32) []uint32) { + n := 16 + t.Helper() + forSlicePair(t, uint32s, n, func(x, y []uint32) bool { + t.Helper() + a := simd.LoadUint32x16Slice(x) + b := simd.LoadUint32x16Slice(y) + g := make([]uint32, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint64x8Binary tests the simd binary method f against the expected behavior generated by want +func testUint64x8Binary(t *testing.T, f func(_, _ simd.Uint64x8) simd.Uint64x8, want func(_, _ []uint64) []uint64) { + n := 8 + t.Helper() + forSlicePair(t, uint64s, n, func(x, y []uint64) bool { + t.Helper() + a := simd.LoadUint64x8Slice(x) + b := 
simd.LoadUint64x8Slice(y) + g := make([]uint64, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testFloat32x16Binary tests the simd binary method f against the expected behavior generated by want +func testFloat32x16Binary(t *testing.T, f func(_, _ simd.Float32x16) simd.Float32x16, want func(_, _ []float32) []float32) { + n := 16 + t.Helper() + forSlicePair(t, float32s, n, func(x, y []float32) bool { + t.Helper() + a := simd.LoadFloat32x16Slice(x) + b := simd.LoadFloat32x16Slice(y) + g := make([]float32, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testFloat64x8Binary tests the simd binary method f against the expected behavior generated by want +func testFloat64x8Binary(t *testing.T, f func(_, _ simd.Float64x8) simd.Float64x8, want func(_, _ []float64) []float64) { + n := 8 + t.Helper() + forSlicePair(t, float64s, n, func(x, y []float64) bool { + t.Helper() + a := simd.LoadFloat64x8Slice(x) + b := simd.LoadFloat64x8Slice(y) + g := make([]float64, n) + f(a, b).StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} diff --git a/src/simd/internal/simd_test/binary_test.go b/src/simd/internal/simd_test/binary_test.go new file mode 100644 index 0000000000..04dca3e2e2 --- /dev/null +++ b/src/simd/internal/simd_test/binary_test.go @@ -0,0 +1,361 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +//go:build goexperiment.simd && amd64 + +package simd_test + +import ( + "simd" + "testing" +) + +func TestAdd(t *testing.T) { + testFloat32x4Binary(t, simd.Float32x4.Add, addSlice[float32]) + testFloat32x8Binary(t, simd.Float32x8.Add, addSlice[float32]) + testFloat64x2Binary(t, simd.Float64x2.Add, addSlice[float64]) + testFloat64x4Binary(t, simd.Float64x4.Add, addSlice[float64]) + + testInt16x16Binary(t, simd.Int16x16.Add, addSlice[int16]) + testInt16x8Binary(t, simd.Int16x8.Add, addSlice[int16]) + testInt32x4Binary(t, simd.Int32x4.Add, addSlice[int32]) + testInt32x8Binary(t, simd.Int32x8.Add, addSlice[int32]) + testInt64x2Binary(t, simd.Int64x2.Add, addSlice[int64]) + testInt64x4Binary(t, simd.Int64x4.Add, addSlice[int64]) + testInt8x16Binary(t, simd.Int8x16.Add, addSlice[int8]) + testInt8x32Binary(t, simd.Int8x32.Add, addSlice[int8]) + + testUint16x16Binary(t, simd.Uint16x16.Add, addSlice[uint16]) + testUint16x8Binary(t, simd.Uint16x8.Add, addSlice[uint16]) + testUint32x4Binary(t, simd.Uint32x4.Add, addSlice[uint32]) + testUint32x8Binary(t, simd.Uint32x8.Add, addSlice[uint32]) + testUint64x2Binary(t, simd.Uint64x2.Add, addSlice[uint64]) + testUint64x4Binary(t, simd.Uint64x4.Add, addSlice[uint64]) + testUint8x16Binary(t, simd.Uint8x16.Add, addSlice[uint8]) + testUint8x32Binary(t, simd.Uint8x32.Add, addSlice[uint8]) + + if simd.X86.AVX512() { + testFloat32x16Binary(t, simd.Float32x16.Add, addSlice[float32]) + testFloat64x8Binary(t, simd.Float64x8.Add, addSlice[float64]) + testInt8x64Binary(t, simd.Int8x64.Add, addSlice[int8]) + testInt16x32Binary(t, simd.Int16x32.Add, addSlice[int16]) + testInt32x16Binary(t, simd.Int32x16.Add, addSlice[int32]) + testInt64x8Binary(t, simd.Int64x8.Add, addSlice[int64]) + testUint8x64Binary(t, simd.Uint8x64.Add, addSlice[uint8]) + testUint16x32Binary(t, simd.Uint16x32.Add, addSlice[uint16]) + testUint32x16Binary(t, simd.Uint32x16.Add, addSlice[uint32]) + testUint64x8Binary(t, simd.Uint64x8.Add, addSlice[uint64]) + } +} + +func 
TestSub(t *testing.T) { + testFloat32x4Binary(t, simd.Float32x4.Sub, subSlice[float32]) + testFloat32x8Binary(t, simd.Float32x8.Sub, subSlice[float32]) + testFloat64x2Binary(t, simd.Float64x2.Sub, subSlice[float64]) + testFloat64x4Binary(t, simd.Float64x4.Sub, subSlice[float64]) + + testInt16x16Binary(t, simd.Int16x16.Sub, subSlice[int16]) + testInt16x8Binary(t, simd.Int16x8.Sub, subSlice[int16]) + testInt32x4Binary(t, simd.Int32x4.Sub, subSlice[int32]) + testInt32x8Binary(t, simd.Int32x8.Sub, subSlice[int32]) + testInt64x2Binary(t, simd.Int64x2.Sub, subSlice[int64]) + testInt64x4Binary(t, simd.Int64x4.Sub, subSlice[int64]) + testInt8x16Binary(t, simd.Int8x16.Sub, subSlice[int8]) + testInt8x32Binary(t, simd.Int8x32.Sub, subSlice[int8]) + + testUint16x16Binary(t, simd.Uint16x16.Sub, subSlice[uint16]) + testUint16x8Binary(t, simd.Uint16x8.Sub, subSlice[uint16]) + testUint32x4Binary(t, simd.Uint32x4.Sub, subSlice[uint32]) + testUint32x8Binary(t, simd.Uint32x8.Sub, subSlice[uint32]) + testUint64x2Binary(t, simd.Uint64x2.Sub, subSlice[uint64]) + testUint64x4Binary(t, simd.Uint64x4.Sub, subSlice[uint64]) + testUint8x16Binary(t, simd.Uint8x16.Sub, subSlice[uint8]) + testUint8x32Binary(t, simd.Uint8x32.Sub, subSlice[uint8]) + + if simd.X86.AVX512() { + testFloat32x16Binary(t, simd.Float32x16.Sub, subSlice[float32]) + testFloat64x8Binary(t, simd.Float64x8.Sub, subSlice[float64]) + testInt8x64Binary(t, simd.Int8x64.Sub, subSlice[int8]) + testInt16x32Binary(t, simd.Int16x32.Sub, subSlice[int16]) + testInt32x16Binary(t, simd.Int32x16.Sub, subSlice[int32]) + testInt64x8Binary(t, simd.Int64x8.Sub, subSlice[int64]) + testUint8x64Binary(t, simd.Uint8x64.Sub, subSlice[uint8]) + testUint16x32Binary(t, simd.Uint16x32.Sub, subSlice[uint16]) + testUint32x16Binary(t, simd.Uint32x16.Sub, subSlice[uint32]) + testUint64x8Binary(t, simd.Uint64x8.Sub, subSlice[uint64]) + } +} + +func TestMax(t *testing.T) { + // testFloat32x4Binary(t, simd.Float32x4.Max, maxSlice[float32]) // nan is wrong + 
// testFloat32x8Binary(t, simd.Float32x8.Max, maxSlice[float32]) // nan is wrong + // testFloat64x2Binary(t, simd.Float64x2.Max, maxSlice[float64]) // nan is wrong + // testFloat64x4Binary(t, simd.Float64x4.Max, maxSlice[float64]) // nan is wrong + + testInt16x16Binary(t, simd.Int16x16.Max, maxSlice[int16]) + testInt16x8Binary(t, simd.Int16x8.Max, maxSlice[int16]) + testInt32x4Binary(t, simd.Int32x4.Max, maxSlice[int32]) + testInt32x8Binary(t, simd.Int32x8.Max, maxSlice[int32]) + + if simd.X86.AVX512() { + testInt64x2Binary(t, simd.Int64x2.Max, maxSlice[int64]) + testInt64x4Binary(t, simd.Int64x4.Max, maxSlice[int64]) + } + + testInt8x16Binary(t, simd.Int8x16.Max, maxSlice[int8]) + testInt8x32Binary(t, simd.Int8x32.Max, maxSlice[int8]) + + testUint16x16Binary(t, simd.Uint16x16.Max, maxSlice[uint16]) + testUint16x8Binary(t, simd.Uint16x8.Max, maxSlice[uint16]) + testUint32x4Binary(t, simd.Uint32x4.Max, maxSlice[uint32]) + testUint32x8Binary(t, simd.Uint32x8.Max, maxSlice[uint32]) + + if simd.X86.AVX512() { + testUint64x2Binary(t, simd.Uint64x2.Max, maxSlice[uint64]) + testUint64x4Binary(t, simd.Uint64x4.Max, maxSlice[uint64]) + } + + testUint8x16Binary(t, simd.Uint8x16.Max, maxSlice[uint8]) + testUint8x32Binary(t, simd.Uint8x32.Max, maxSlice[uint8]) + + if simd.X86.AVX512() { + // testFloat32x16Binary(t, simd.Float32x16.Max, maxSlice[float32]) // nan is wrong + // testFloat64x8Binary(t, simd.Float64x8.Max, maxSlice[float64]) // nan is wrong + testInt8x64Binary(t, simd.Int8x64.Max, maxSlice[int8]) + testInt16x32Binary(t, simd.Int16x32.Max, maxSlice[int16]) + testInt32x16Binary(t, simd.Int32x16.Max, maxSlice[int32]) + testInt64x8Binary(t, simd.Int64x8.Max, maxSlice[int64]) + testUint8x64Binary(t, simd.Uint8x64.Max, maxSlice[uint8]) + testUint16x32Binary(t, simd.Uint16x32.Max, maxSlice[uint16]) + testUint32x16Binary(t, simd.Uint32x16.Max, maxSlice[uint32]) + testUint64x8Binary(t, simd.Uint64x8.Max, maxSlice[uint64]) + } +} + +func TestMin(t *testing.T) { + // 
testFloat32x4Binary(t, simd.Float32x4.Min, minSlice[float32]) // nan is wrong + // testFloat32x8Binary(t, simd.Float32x8.Min, minSlice[float32]) // nan is wrong + // testFloat64x2Binary(t, simd.Float64x2.Min, minSlice[float64]) // nan is wrong + // testFloat64x4Binary(t, simd.Float64x4.Min, minSlice[float64]) // nan is wrong + + testInt16x16Binary(t, simd.Int16x16.Min, minSlice[int16]) + testInt16x8Binary(t, simd.Int16x8.Min, minSlice[int16]) + testInt32x4Binary(t, simd.Int32x4.Min, minSlice[int32]) + testInt32x8Binary(t, simd.Int32x8.Min, minSlice[int32]) + + if simd.X86.AVX512() { + testInt64x2Binary(t, simd.Int64x2.Min, minSlice[int64]) + testInt64x4Binary(t, simd.Int64x4.Min, minSlice[int64]) + } + + testInt8x16Binary(t, simd.Int8x16.Min, minSlice[int8]) + testInt8x32Binary(t, simd.Int8x32.Min, minSlice[int8]) + + testUint16x16Binary(t, simd.Uint16x16.Min, minSlice[uint16]) + testUint16x8Binary(t, simd.Uint16x8.Min, minSlice[uint16]) + testUint32x4Binary(t, simd.Uint32x4.Min, minSlice[uint32]) + testUint32x8Binary(t, simd.Uint32x8.Min, minSlice[uint32]) + + if simd.X86.AVX512() { + testUint64x2Binary(t, simd.Uint64x2.Min, minSlice[uint64]) + testUint64x4Binary(t, simd.Uint64x4.Min, minSlice[uint64]) + } + + testUint8x16Binary(t, simd.Uint8x16.Min, minSlice[uint8]) + testUint8x32Binary(t, simd.Uint8x32.Min, minSlice[uint8]) + + if simd.X86.AVX512() { + // testFloat32x16Binary(t, simd.Float32x16.Min, minSlice[float32]) // nan is wrong + // testFloat64x8Binary(t, simd.Float64x8.Min, minSlice[float64]) // nan is wrong + testInt8x64Binary(t, simd.Int8x64.Min, minSlice[int8]) + testInt16x32Binary(t, simd.Int16x32.Min, minSlice[int16]) + testInt32x16Binary(t, simd.Int32x16.Min, minSlice[int32]) + testInt64x8Binary(t, simd.Int64x8.Min, minSlice[int64]) + testUint8x64Binary(t, simd.Uint8x64.Min, minSlice[uint8]) + testUint16x32Binary(t, simd.Uint16x32.Min, minSlice[uint16]) + testUint32x16Binary(t, simd.Uint32x16.Min, minSlice[uint32]) + testUint64x8Binary(t, 
simd.Uint64x8.Min, minSlice[uint64]) + } +} + +func TestAnd(t *testing.T) { + testInt16x16Binary(t, simd.Int16x16.And, andSlice[int16]) + testInt16x8Binary(t, simd.Int16x8.And, andSlice[int16]) + testInt32x4Binary(t, simd.Int32x4.And, andSlice[int32]) + testInt32x8Binary(t, simd.Int32x8.And, andSlice[int32]) + testInt64x2Binary(t, simd.Int64x2.And, andSlice[int64]) + testInt64x4Binary(t, simd.Int64x4.And, andSlice[int64]) + testInt8x16Binary(t, simd.Int8x16.And, andSlice[int8]) + testInt8x32Binary(t, simd.Int8x32.And, andSlice[int8]) + + testUint16x16Binary(t, simd.Uint16x16.And, andSlice[uint16]) + testUint16x8Binary(t, simd.Uint16x8.And, andSlice[uint16]) + testUint32x4Binary(t, simd.Uint32x4.And, andSlice[uint32]) + testUint32x8Binary(t, simd.Uint32x8.And, andSlice[uint32]) + testUint64x2Binary(t, simd.Uint64x2.And, andSlice[uint64]) + testUint64x4Binary(t, simd.Uint64x4.And, andSlice[uint64]) + testUint8x16Binary(t, simd.Uint8x16.And, andSlice[uint8]) + testUint8x32Binary(t, simd.Uint8x32.And, andSlice[uint8]) + + if simd.X86.AVX512() { + // testInt8x64Binary(t, simd.Int8x64.And, andISlice[int8]) // missing + // testInt16x32Binary(t, simd.Int16x32.And, andISlice[int16]) // missing + testInt32x16Binary(t, simd.Int32x16.And, andSlice[int32]) + testInt64x8Binary(t, simd.Int64x8.And, andSlice[int64]) + // testUint8x64Binary(t, simd.Uint8x64.And, andISlice[uint8]) // missing + // testUint16x32Binary(t, simd.Uint16x32.And, andISlice[uint16]) // missing + testUint32x16Binary(t, simd.Uint32x16.And, andSlice[uint32]) + testUint64x8Binary(t, simd.Uint64x8.And, andSlice[uint64]) + } +} + +func TestAndNot(t *testing.T) { + testInt16x16Binary(t, simd.Int16x16.AndNot, andNotSlice[int16]) + testInt16x8Binary(t, simd.Int16x8.AndNot, andNotSlice[int16]) + testInt32x4Binary(t, simd.Int32x4.AndNot, andNotSlice[int32]) + testInt32x8Binary(t, simd.Int32x8.AndNot, andNotSlice[int32]) + testInt64x2Binary(t, simd.Int64x2.AndNot, andNotSlice[int64]) + testInt64x4Binary(t, 
simd.Int64x4.AndNot, andNotSlice[int64]) + testInt8x16Binary(t, simd.Int8x16.AndNot, andNotSlice[int8]) + testInt8x32Binary(t, simd.Int8x32.AndNot, andNotSlice[int8]) + + testUint16x16Binary(t, simd.Uint16x16.AndNot, andNotSlice[uint16]) + testUint16x8Binary(t, simd.Uint16x8.AndNot, andNotSlice[uint16]) + testUint32x4Binary(t, simd.Uint32x4.AndNot, andNotSlice[uint32]) + testUint32x8Binary(t, simd.Uint32x8.AndNot, andNotSlice[uint32]) + testUint64x2Binary(t, simd.Uint64x2.AndNot, andNotSlice[uint64]) + testUint64x4Binary(t, simd.Uint64x4.AndNot, andNotSlice[uint64]) + testUint8x16Binary(t, simd.Uint8x16.AndNot, andNotSlice[uint8]) + testUint8x32Binary(t, simd.Uint8x32.AndNot, andNotSlice[uint8]) + + if simd.X86.AVX512() { + testInt8x64Binary(t, simd.Int8x64.AndNot, andNotSlice[int8]) + testInt16x32Binary(t, simd.Int16x32.AndNot, andNotSlice[int16]) + testInt32x16Binary(t, simd.Int32x16.AndNot, andNotSlice[int32]) + testInt64x8Binary(t, simd.Int64x8.AndNot, andNotSlice[int64]) + testUint8x64Binary(t, simd.Uint8x64.AndNot, andNotSlice[uint8]) + testUint16x32Binary(t, simd.Uint16x32.AndNot, andNotSlice[uint16]) + testUint32x16Binary(t, simd.Uint32x16.AndNot, andNotSlice[uint32]) + testUint64x8Binary(t, simd.Uint64x8.AndNot, andNotSlice[uint64]) + } +} + +func TestXor(t *testing.T) { + testInt16x16Binary(t, simd.Int16x16.Xor, xorSlice[int16]) + testInt16x8Binary(t, simd.Int16x8.Xor, xorSlice[int16]) + testInt32x4Binary(t, simd.Int32x4.Xor, xorSlice[int32]) + testInt32x8Binary(t, simd.Int32x8.Xor, xorSlice[int32]) + testInt64x2Binary(t, simd.Int64x2.Xor, xorSlice[int64]) + testInt64x4Binary(t, simd.Int64x4.Xor, xorSlice[int64]) + testInt8x16Binary(t, simd.Int8x16.Xor, xorSlice[int8]) + testInt8x32Binary(t, simd.Int8x32.Xor, xorSlice[int8]) + + testUint16x16Binary(t, simd.Uint16x16.Xor, xorSlice[uint16]) + testUint16x8Binary(t, simd.Uint16x8.Xor, xorSlice[uint16]) + testUint32x4Binary(t, simd.Uint32x4.Xor, xorSlice[uint32]) + testUint32x8Binary(t, simd.Uint32x8.Xor, 
xorSlice[uint32]) + testUint64x2Binary(t, simd.Uint64x2.Xor, xorSlice[uint64]) + testUint64x4Binary(t, simd.Uint64x4.Xor, xorSlice[uint64]) + testUint8x16Binary(t, simd.Uint8x16.Xor, xorSlice[uint8]) + testUint8x32Binary(t, simd.Uint8x32.Xor, xorSlice[uint8]) + + if simd.X86.AVX512() { + // testInt8x64Binary(t, simd.Int8x64.Xor, andISlice[int8]) // missing + // testInt16x32Binary(t, simd.Int16x32.Xor, andISlice[int16]) // missing + testInt32x16Binary(t, simd.Int32x16.Xor, xorSlice[int32]) + testInt64x8Binary(t, simd.Int64x8.Xor, xorSlice[int64]) + // testUint8x64Binary(t, simd.Uint8x64.Xor, andISlice[uint8]) // missing + // testUint16x32Binary(t, simd.Uint16x32.Xor, andISlice[uint16]) // missing + testUint32x16Binary(t, simd.Uint32x16.Xor, xorSlice[uint32]) + testUint64x8Binary(t, simd.Uint64x8.Xor, xorSlice[uint64]) + } +} + +func TestOr(t *testing.T) { + testInt16x16Binary(t, simd.Int16x16.Or, orSlice[int16]) + testInt16x8Binary(t, simd.Int16x8.Or, orSlice[int16]) + testInt32x4Binary(t, simd.Int32x4.Or, orSlice[int32]) + testInt32x8Binary(t, simd.Int32x8.Or, orSlice[int32]) + testInt64x2Binary(t, simd.Int64x2.Or, orSlice[int64]) + testInt64x4Binary(t, simd.Int64x4.Or, orSlice[int64]) + testInt8x16Binary(t, simd.Int8x16.Or, orSlice[int8]) + testInt8x32Binary(t, simd.Int8x32.Or, orSlice[int8]) + + testUint16x16Binary(t, simd.Uint16x16.Or, orSlice[uint16]) + testUint16x8Binary(t, simd.Uint16x8.Or, orSlice[uint16]) + testUint32x4Binary(t, simd.Uint32x4.Or, orSlice[uint32]) + testUint32x8Binary(t, simd.Uint32x8.Or, orSlice[uint32]) + testUint64x2Binary(t, simd.Uint64x2.Or, orSlice[uint64]) + testUint64x4Binary(t, simd.Uint64x4.Or, orSlice[uint64]) + testUint8x16Binary(t, simd.Uint8x16.Or, orSlice[uint8]) + testUint8x32Binary(t, simd.Uint8x32.Or, orSlice[uint8]) + + if simd.X86.AVX512() { + // testInt8x64Binary(t, simd.Int8x64.Or, andISlice[int8]) // missing + // testInt16x32Binary(t, simd.Int16x32.Or, andISlice[int16]) // missing + testInt32x16Binary(t, 
simd.Int32x16.Or, orSlice[int32]) + testInt64x8Binary(t, simd.Int64x8.Or, orSlice[int64]) + // testUint8x64Binary(t, simd.Uint8x64.Or, andISlice[uint8]) // missing + // testUint16x32Binary(t, simd.Uint16x32.Or, andISlice[uint16]) // missing + testUint32x16Binary(t, simd.Uint32x16.Or, orSlice[uint32]) + testUint64x8Binary(t, simd.Uint64x8.Or, orSlice[uint64]) + } +} + +func TestMul(t *testing.T) { + testFloat32x4Binary(t, simd.Float32x4.Mul, mulSlice[float32]) + testFloat32x8Binary(t, simd.Float32x8.Mul, mulSlice[float32]) + testFloat64x2Binary(t, simd.Float64x2.Mul, mulSlice[float64]) + testFloat64x4Binary(t, simd.Float64x4.Mul, mulSlice[float64]) + + testInt16x16Binary(t, simd.Int16x16.Mul, mulSlice[int16]) + testInt16x8Binary(t, simd.Int16x8.Mul, mulSlice[int16]) + testInt32x4Binary(t, simd.Int32x4.Mul, mulSlice[int32]) + testInt32x8Binary(t, simd.Int32x8.Mul, mulSlice[int32]) + + // testInt8x16Binary(t, simd.Int8x16.Mul, mulSlice[int8]) // nope + // testInt8x32Binary(t, simd.Int8x32.Mul, mulSlice[int8]) + + // TODO we should be able to do these, there's no difference between signed/unsigned Mul + // testUint16x16Binary(t, simd.Uint16x16.Mul, mulSlice[uint16]) + // testUint16x8Binary(t, simd.Uint16x8.Mul, mulSlice[uint16]) + // testUint32x4Binary(t, simd.Uint32x4.Mul, mulSlice[uint32]) + // testUint32x8Binary(t, simd.Uint32x8.Mul, mulSlice[uint32]) + // testUint64x2Binary(t, simd.Uint64x2.Mul, mulSlice[uint64]) + // testUint64x4Binary(t, simd.Uint64x4.Mul, mulSlice[uint64]) + + // testUint8x16Binary(t, simd.Uint8x16.Mul, mulSlice[uint8]) // nope + // testUint8x32Binary(t, simd.Uint8x32.Mul, mulSlice[uint8]) + + if simd.X86.AVX512() { + testInt64x2Binary(t, simd.Int64x2.Mul, mulSlice[int64]) // avx512 only + testInt64x4Binary(t, simd.Int64x4.Mul, mulSlice[int64]) + + testFloat32x16Binary(t, simd.Float32x16.Mul, mulSlice[float32]) + testFloat64x8Binary(t, simd.Float64x8.Mul, mulSlice[float64]) + + // testInt8x64Binary(t, simd.Int8x64.Mul, mulSlice[int8]) // nope + 
testInt16x32Binary(t, simd.Int16x32.Mul, mulSlice[int16]) + testInt32x16Binary(t, simd.Int32x16.Mul, mulSlice[int32]) + testInt64x8Binary(t, simd.Int64x8.Mul, mulSlice[int64]) + // testUint8x64Binary(t, simd.Uint8x64.Mul, mulSlice[uint8]) // nope + + // TODO signed should do the job + // testUint16x32Binary(t, simd.Uint16x32.Mul, mulSlice[uint16]) + // testUint32x16Binary(t, simd.Uint32x16.Mul, mulSlice[uint32]) + // testUint64x8Binary(t, simd.Uint64x8.Mul, mulSlice[uint64]) + } +} + +func TestDiv(t *testing.T) { + testFloat32x4Binary(t, simd.Float32x4.Div, divSlice[float32]) + testFloat32x8Binary(t, simd.Float32x8.Div, divSlice[float32]) + testFloat64x2Binary(t, simd.Float64x2.Div, divSlice[float64]) + testFloat64x4Binary(t, simd.Float64x4.Div, divSlice[float64]) + + if simd.X86.AVX512() { + testFloat32x16Binary(t, simd.Float32x16.Div, divSlice[float32]) + testFloat64x8Binary(t, simd.Float64x8.Div, divSlice[float64]) + } +} diff --git a/src/simd/internal/simd_test/compare_helpers_test.go b/src/simd/internal/simd_test/compare_helpers_test.go new file mode 100644 index 0000000000..aef703c66a --- /dev/null +++ b/src/simd/internal/simd_test/compare_helpers_test.go @@ -0,0 +1,464 @@ +// Code generated by 'go run genfiles.go'; DO NOT EDIT. + +//go:build goexperiment.simd + +// This file contains functions testing simd methods that compare two operands. +// Each function in this file is specialized for a +// particular simd type <BaseType><Width>x<Count>. 
+ +package simd_test + +import ( + "simd" + "testing" +) + +// testInt8x16Compare tests the simd comparison method f against the expected behavior generated by want +func testInt8x16Compare(t *testing.T, f func(_, _ simd.Int8x16) simd.Mask8x16, want func(_, _ []int8) []int64) { + n := 16 + t.Helper() + forSlicePair(t, int8s, n, func(x, y []int8) bool { + t.Helper() + a := simd.LoadInt8x16Slice(x) + b := simd.LoadInt8x16Slice(y) + g := make([]int8, n) + f(a, b).AsInt8x16().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt16x8Compare tests the simd comparison method f against the expected behavior generated by want +func testInt16x8Compare(t *testing.T, f func(_, _ simd.Int16x8) simd.Mask16x8, want func(_, _ []int16) []int64) { + n := 8 + t.Helper() + forSlicePair(t, int16s, n, func(x, y []int16) bool { + t.Helper() + a := simd.LoadInt16x8Slice(x) + b := simd.LoadInt16x8Slice(y) + g := make([]int16, n) + f(a, b).AsInt16x8().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt32x4Compare tests the simd comparison method f against the expected behavior generated by want +func testInt32x4Compare(t *testing.T, f func(_, _ simd.Int32x4) simd.Mask32x4, want func(_, _ []int32) []int64) { + n := 4 + t.Helper() + forSlicePair(t, int32s, n, func(x, y []int32) bool { + t.Helper() + a := simd.LoadInt32x4Slice(x) + b := simd.LoadInt32x4Slice(y) + g := make([]int32, n) + f(a, b).AsInt32x4().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt64x2Compare tests the simd comparison method f against the expected behavior generated by want +func testInt64x2Compare(t *testing.T, f func(_, _ simd.Int64x2) simd.Mask64x2, want func(_, _ []int64) []int64) { + n := 
2 + t.Helper() + forSlicePair(t, int64s, n, func(x, y []int64) bool { + t.Helper() + a := simd.LoadInt64x2Slice(x) + b := simd.LoadInt64x2Slice(y) + g := make([]int64, n) + f(a, b).AsInt64x2().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint8x16Compare tests the simd comparison method f against the expected behavior generated by want +func testUint8x16Compare(t *testing.T, f func(_, _ simd.Uint8x16) simd.Mask8x16, want func(_, _ []uint8) []int64) { + n := 16 + t.Helper() + forSlicePair(t, uint8s, n, func(x, y []uint8) bool { + t.Helper() + a := simd.LoadUint8x16Slice(x) + b := simd.LoadUint8x16Slice(y) + g := make([]int8, n) + f(a, b).AsInt8x16().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint16x8Compare tests the simd comparison method f against the expected behavior generated by want +func testUint16x8Compare(t *testing.T, f func(_, _ simd.Uint16x8) simd.Mask16x8, want func(_, _ []uint16) []int64) { + n := 8 + t.Helper() + forSlicePair(t, uint16s, n, func(x, y []uint16) bool { + t.Helper() + a := simd.LoadUint16x8Slice(x) + b := simd.LoadUint16x8Slice(y) + g := make([]int16, n) + f(a, b).AsInt16x8().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint32x4Compare tests the simd comparison method f against the expected behavior generated by want +func testUint32x4Compare(t *testing.T, f func(_, _ simd.Uint32x4) simd.Mask32x4, want func(_, _ []uint32) []int64) { + n := 4 + t.Helper() + forSlicePair(t, uint32s, n, func(x, y []uint32) bool { + t.Helper() + a := simd.LoadUint32x4Slice(x) + b := simd.LoadUint32x4Slice(y) + g := make([]int32, n) + f(a, b).AsInt32x4().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, 
s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint64x2Compare tests the simd comparison method f against the expected behavior generated by want +func testUint64x2Compare(t *testing.T, f func(_, _ simd.Uint64x2) simd.Mask64x2, want func(_, _ []uint64) []int64) { + n := 2 + t.Helper() + forSlicePair(t, uint64s, n, func(x, y []uint64) bool { + t.Helper() + a := simd.LoadUint64x2Slice(x) + b := simd.LoadUint64x2Slice(y) + g := make([]int64, n) + f(a, b).AsInt64x2().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testFloat32x4Compare tests the simd comparison method f against the expected behavior generated by want +func testFloat32x4Compare(t *testing.T, f func(_, _ simd.Float32x4) simd.Mask32x4, want func(_, _ []float32) []int64) { + n := 4 + t.Helper() + forSlicePair(t, float32s, n, func(x, y []float32) bool { + t.Helper() + a := simd.LoadFloat32x4Slice(x) + b := simd.LoadFloat32x4Slice(y) + g := make([]int32, n) + f(a, b).AsInt32x4().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testFloat64x2Compare tests the simd comparison method f against the expected behavior generated by want +func testFloat64x2Compare(t *testing.T, f func(_, _ simd.Float64x2) simd.Mask64x2, want func(_, _ []float64) []int64) { + n := 2 + t.Helper() + forSlicePair(t, float64s, n, func(x, y []float64) bool { + t.Helper() + a := simd.LoadFloat64x2Slice(x) + b := simd.LoadFloat64x2Slice(y) + g := make([]int64, n) + f(a, b).AsInt64x2().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt8x32Compare tests the simd comparison method f against the expected behavior generated by want +func testInt8x32Compare(t *testing.T, f func(_, _ 
simd.Int8x32) simd.Mask8x32, want func(_, _ []int8) []int64) { + n := 32 + t.Helper() + forSlicePair(t, int8s, n, func(x, y []int8) bool { + t.Helper() + a := simd.LoadInt8x32Slice(x) + b := simd.LoadInt8x32Slice(y) + g := make([]int8, n) + f(a, b).AsInt8x32().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt16x16Compare tests the simd comparison method f against the expected behavior generated by want +func testInt16x16Compare(t *testing.T, f func(_, _ simd.Int16x16) simd.Mask16x16, want func(_, _ []int16) []int64) { + n := 16 + t.Helper() + forSlicePair(t, int16s, n, func(x, y []int16) bool { + t.Helper() + a := simd.LoadInt16x16Slice(x) + b := simd.LoadInt16x16Slice(y) + g := make([]int16, n) + f(a, b).AsInt16x16().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt32x8Compare tests the simd comparison method f against the expected behavior generated by want +func testInt32x8Compare(t *testing.T, f func(_, _ simd.Int32x8) simd.Mask32x8, want func(_, _ []int32) []int64) { + n := 8 + t.Helper() + forSlicePair(t, int32s, n, func(x, y []int32) bool { + t.Helper() + a := simd.LoadInt32x8Slice(x) + b := simd.LoadInt32x8Slice(y) + g := make([]int32, n) + f(a, b).AsInt32x8().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt64x4Compare tests the simd comparison method f against the expected behavior generated by want +func testInt64x4Compare(t *testing.T, f func(_, _ simd.Int64x4) simd.Mask64x4, want func(_, _ []int64) []int64) { + n := 4 + t.Helper() + forSlicePair(t, int64s, n, func(x, y []int64) bool { + t.Helper() + a := simd.LoadInt64x4Slice(x) + b := simd.LoadInt64x4Slice(y) + g := make([]int64, n) + f(a, b).AsInt64x4().StoreSlice(g) 
+ w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint8x32Compare tests the simd comparison method f against the expected behavior generated by want +func testUint8x32Compare(t *testing.T, f func(_, _ simd.Uint8x32) simd.Mask8x32, want func(_, _ []uint8) []int64) { + n := 32 + t.Helper() + forSlicePair(t, uint8s, n, func(x, y []uint8) bool { + t.Helper() + a := simd.LoadUint8x32Slice(x) + b := simd.LoadUint8x32Slice(y) + g := make([]int8, n) + f(a, b).AsInt8x32().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint16x16Compare tests the simd comparison method f against the expected behavior generated by want +func testUint16x16Compare(t *testing.T, f func(_, _ simd.Uint16x16) simd.Mask16x16, want func(_, _ []uint16) []int64) { + n := 16 + t.Helper() + forSlicePair(t, uint16s, n, func(x, y []uint16) bool { + t.Helper() + a := simd.LoadUint16x16Slice(x) + b := simd.LoadUint16x16Slice(y) + g := make([]int16, n) + f(a, b).AsInt16x16().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint32x8Compare tests the simd comparison method f against the expected behavior generated by want +func testUint32x8Compare(t *testing.T, f func(_, _ simd.Uint32x8) simd.Mask32x8, want func(_, _ []uint32) []int64) { + n := 8 + t.Helper() + forSlicePair(t, uint32s, n, func(x, y []uint32) bool { + t.Helper() + a := simd.LoadUint32x8Slice(x) + b := simd.LoadUint32x8Slice(y) + g := make([]int32, n) + f(a, b).AsInt32x8().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint64x4Compare tests the simd comparison method f against the expected behavior generated by want +func 
testUint64x4Compare(t *testing.T, f func(_, _ simd.Uint64x4) simd.Mask64x4, want func(_, _ []uint64) []int64) { + n := 4 + t.Helper() + forSlicePair(t, uint64s, n, func(x, y []uint64) bool { + t.Helper() + a := simd.LoadUint64x4Slice(x) + b := simd.LoadUint64x4Slice(y) + g := make([]int64, n) + f(a, b).AsInt64x4().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testFloat32x8Compare tests the simd comparison method f against the expected behavior generated by want +func testFloat32x8Compare(t *testing.T, f func(_, _ simd.Float32x8) simd.Mask32x8, want func(_, _ []float32) []int64) { + n := 8 + t.Helper() + forSlicePair(t, float32s, n, func(x, y []float32) bool { + t.Helper() + a := simd.LoadFloat32x8Slice(x) + b := simd.LoadFloat32x8Slice(y) + g := make([]int32, n) + f(a, b).AsInt32x8().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testFloat64x4Compare tests the simd comparison method f against the expected behavior generated by want +func testFloat64x4Compare(t *testing.T, f func(_, _ simd.Float64x4) simd.Mask64x4, want func(_, _ []float64) []int64) { + n := 4 + t.Helper() + forSlicePair(t, float64s, n, func(x, y []float64) bool { + t.Helper() + a := simd.LoadFloat64x4Slice(x) + b := simd.LoadFloat64x4Slice(y) + g := make([]int64, n) + f(a, b).AsInt64x4().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt8x64Compare tests the simd comparison method f against the expected behavior generated by want +func testInt8x64Compare(t *testing.T, f func(_, _ simd.Int8x64) simd.Mask8x64, want func(_, _ []int8) []int64) { + n := 64 + t.Helper() + forSlicePair(t, int8s, n, func(x, y []int8) bool { + t.Helper() + a := simd.LoadInt8x64Slice(x) + b := 
simd.LoadInt8x64Slice(y) + g := make([]int8, n) + f(a, b).AsInt8x64().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt16x32Compare tests the simd comparison method f against the expected behavior generated by want +func testInt16x32Compare(t *testing.T, f func(_, _ simd.Int16x32) simd.Mask16x32, want func(_, _ []int16) []int64) { + n := 32 + t.Helper() + forSlicePair(t, int16s, n, func(x, y []int16) bool { + t.Helper() + a := simd.LoadInt16x32Slice(x) + b := simd.LoadInt16x32Slice(y) + g := make([]int16, n) + f(a, b).AsInt16x32().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt32x16Compare tests the simd comparison method f against the expected behavior generated by want +func testInt32x16Compare(t *testing.T, f func(_, _ simd.Int32x16) simd.Mask32x16, want func(_, _ []int32) []int64) { + n := 16 + t.Helper() + forSlicePair(t, int32s, n, func(x, y []int32) bool { + t.Helper() + a := simd.LoadInt32x16Slice(x) + b := simd.LoadInt32x16Slice(y) + g := make([]int32, n) + f(a, b).AsInt32x16().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testInt64x8Compare tests the simd comparison method f against the expected behavior generated by want +func testInt64x8Compare(t *testing.T, f func(_, _ simd.Int64x8) simd.Mask64x8, want func(_, _ []int64) []int64) { + n := 8 + t.Helper() + forSlicePair(t, int64s, n, func(x, y []int64) bool { + t.Helper() + a := simd.LoadInt64x8Slice(x) + b := simd.LoadInt64x8Slice(y) + g := make([]int64, n) + f(a, b).AsInt64x8().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint8x64Compare tests the simd 
comparison method f against the expected behavior generated by want +func testUint8x64Compare(t *testing.T, f func(_, _ simd.Uint8x64) simd.Mask8x64, want func(_, _ []uint8) []int64) { + n := 64 + t.Helper() + forSlicePair(t, uint8s, n, func(x, y []uint8) bool { + t.Helper() + a := simd.LoadUint8x64Slice(x) + b := simd.LoadUint8x64Slice(y) + g := make([]int8, n) + f(a, b).AsInt8x64().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint16x32Compare tests the simd comparison method f against the expected behavior generated by want +func testUint16x32Compare(t *testing.T, f func(_, _ simd.Uint16x32) simd.Mask16x32, want func(_, _ []uint16) []int64) { + n := 32 + t.Helper() + forSlicePair(t, uint16s, n, func(x, y []uint16) bool { + t.Helper() + a := simd.LoadUint16x32Slice(x) + b := simd.LoadUint16x32Slice(y) + g := make([]int16, n) + f(a, b).AsInt16x32().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint32x16Compare tests the simd comparison method f against the expected behavior generated by want +func testUint32x16Compare(t *testing.T, f func(_, _ simd.Uint32x16) simd.Mask32x16, want func(_, _ []uint32) []int64) { + n := 16 + t.Helper() + forSlicePair(t, uint32s, n, func(x, y []uint32) bool { + t.Helper() + a := simd.LoadUint32x16Slice(x) + b := simd.LoadUint32x16Slice(y) + g := make([]int32, n) + f(a, b).AsInt32x16().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testUint64x8Compare tests the simd comparison method f against the expected behavior generated by want +func testUint64x8Compare(t *testing.T, f func(_, _ simd.Uint64x8) simd.Mask64x8, want func(_, _ []uint64) []int64) { + n := 8 + t.Helper() + forSlicePair(t, uint64s, n, func(x, 
y []uint64) bool { + t.Helper() + a := simd.LoadUint64x8Slice(x) + b := simd.LoadUint64x8Slice(y) + g := make([]int64, n) + f(a, b).AsInt64x8().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testFloat32x16Compare tests the simd comparison method f against the expected behavior generated by want +func testFloat32x16Compare(t *testing.T, f func(_, _ simd.Float32x16) simd.Mask32x16, want func(_, _ []float32) []int64) { + n := 16 + t.Helper() + forSlicePair(t, float32s, n, func(x, y []float32) bool { + t.Helper() + a := simd.LoadFloat32x16Slice(x) + b := simd.LoadFloat32x16Slice(y) + g := make([]int32, n) + f(a, b).AsInt32x16().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} + +// testFloat64x8Compare tests the simd comparison method f against the expected behavior generated by want +func testFloat64x8Compare(t *testing.T, f func(_, _ simd.Float64x8) simd.Mask64x8, want func(_, _ []float64) []int64) { + n := 8 + t.Helper() + forSlicePair(t, float64s, n, func(x, y []float64) bool { + t.Helper() + a := simd.LoadFloat64x8Slice(x) + b := simd.LoadFloat64x8Slice(y) + g := make([]int64, n) + f(a, b).AsInt64x8().StoreSlice(g) + w := want(x, y) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) + }) +} diff --git a/src/simd/internal/simd_test/compare_test.go b/src/simd/internal/simd_test/compare_test.go new file mode 100644 index 0000000000..09b3bfc0d9 --- /dev/null +++ b/src/simd/internal/simd_test/compare_test.go @@ -0,0 +1,265 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +//go:build goexperiment.simd && amd64 + +package simd_test + +import ( + "simd" + "testing" +) + +// AVX 2 lacks most comparisons, but they can be synthesized +// from > and = +var comparisonFixed bool = simd.X86.AVX512() + +func TestLess(t *testing.T) { + testFloat32x4Compare(t, simd.Float32x4.Less, lessSlice[float32]) + testFloat32x8Compare(t, simd.Float32x8.Less, lessSlice[float32]) + testFloat64x2Compare(t, simd.Float64x2.Less, lessSlice[float64]) + testFloat64x4Compare(t, simd.Float64x4.Less, lessSlice[float64]) + + testInt16x16Compare(t, simd.Int16x16.Less, lessSlice[int16]) + testInt16x8Compare(t, simd.Int16x8.Less, lessSlice[int16]) + testInt32x4Compare(t, simd.Int32x4.Less, lessSlice[int32]) + testInt32x8Compare(t, simd.Int32x8.Less, lessSlice[int32]) + testInt64x2Compare(t, simd.Int64x2.Less, lessSlice[int64]) + testInt64x4Compare(t, simd.Int64x4.Less, lessSlice[int64]) + testInt8x16Compare(t, simd.Int8x16.Less, lessSlice[int8]) + testInt8x32Compare(t, simd.Int8x32.Less, lessSlice[int8]) + + testInt16x16Compare(t, simd.Int16x16.Less, lessSlice[int16]) + testInt16x8Compare(t, simd.Int16x8.Less, lessSlice[int16]) + testInt32x4Compare(t, simd.Int32x4.Less, lessSlice[int32]) + testInt32x8Compare(t, simd.Int32x8.Less, lessSlice[int32]) + testInt64x2Compare(t, simd.Int64x2.Less, lessSlice[int64]) + testInt64x4Compare(t, simd.Int64x4.Less, lessSlice[int64]) + testInt8x16Compare(t, simd.Int8x16.Less, lessSlice[int8]) + testInt8x32Compare(t, simd.Int8x32.Less, lessSlice[int8]) + + testUint16x16Compare(t, simd.Uint16x16.Less, lessSlice[uint16]) + testUint16x8Compare(t, simd.Uint16x8.Less, lessSlice[uint16]) + testUint32x4Compare(t, simd.Uint32x4.Less, lessSlice[uint32]) + testUint32x8Compare(t, simd.Uint32x8.Less, lessSlice[uint32]) + testUint64x2Compare(t, simd.Uint64x2.Less, lessSlice[uint64]) + testUint64x4Compare(t, simd.Uint64x4.Less, lessSlice[uint64]) + testUint8x16Compare(t, simd.Uint8x16.Less, lessSlice[uint8]) + testUint8x32Compare(t, 
simd.Uint8x32.Less, lessSlice[uint8]) + + if simd.X86.AVX512() { + testUint16x16Compare(t, simd.Uint16x16.Less, lessSlice[uint16]) + testUint16x8Compare(t, simd.Uint16x8.Less, lessSlice[uint16]) + testUint32x4Compare(t, simd.Uint32x4.Less, lessSlice[uint32]) + testUint32x8Compare(t, simd.Uint32x8.Less, lessSlice[uint32]) + testUint64x2Compare(t, simd.Uint64x2.Less, lessSlice[uint64]) + testUint64x4Compare(t, simd.Uint64x4.Less, lessSlice[uint64]) + testUint8x16Compare(t, simd.Uint8x16.Less, lessSlice[uint8]) + testUint8x32Compare(t, simd.Uint8x32.Less, lessSlice[uint8]) + + testFloat32x16Compare(t, simd.Float32x16.Less, lessSlice[float32]) + testFloat64x8Compare(t, simd.Float64x8.Less, lessSlice[float64]) + testInt8x64Compare(t, simd.Int8x64.Less, lessSlice[int8]) + testInt16x32Compare(t, simd.Int16x32.Less, lessSlice[int16]) + testInt32x16Compare(t, simd.Int32x16.Less, lessSlice[int32]) + testInt64x8Compare(t, simd.Int64x8.Less, lessSlice[int64]) + testUint8x64Compare(t, simd.Uint8x64.Less, lessSlice[uint8]) + testUint16x32Compare(t, simd.Uint16x32.Less, lessSlice[uint16]) + testUint32x16Compare(t, simd.Uint32x16.Less, lessSlice[uint32]) + testUint64x8Compare(t, simd.Uint64x8.Less, lessSlice[uint64]) + } +} + +func TestLessEqual(t *testing.T) { + testFloat32x4Compare(t, simd.Float32x4.LessEqual, lessEqualSlice[float32]) + testFloat32x8Compare(t, simd.Float32x8.LessEqual, lessEqualSlice[float32]) + testFloat64x2Compare(t, simd.Float64x2.LessEqual, lessEqualSlice[float64]) + testFloat64x4Compare(t, simd.Float64x4.LessEqual, lessEqualSlice[float64]) + + testInt16x16Compare(t, simd.Int16x16.LessEqual, lessEqualSlice[int16]) + testInt16x8Compare(t, simd.Int16x8.LessEqual, lessEqualSlice[int16]) + testInt32x4Compare(t, simd.Int32x4.LessEqual, lessEqualSlice[int32]) + testInt32x8Compare(t, simd.Int32x8.LessEqual, lessEqualSlice[int32]) + testInt64x2Compare(t, simd.Int64x2.LessEqual, lessEqualSlice[int64]) + testInt64x4Compare(t, simd.Int64x4.LessEqual, 
lessEqualSlice[int64]) + testInt8x16Compare(t, simd.Int8x16.LessEqual, lessEqualSlice[int8]) + testInt8x32Compare(t, simd.Int8x32.LessEqual, lessEqualSlice[int8]) + + testUint16x16Compare(t, simd.Uint16x16.LessEqual, lessEqualSlice[uint16]) + testUint16x8Compare(t, simd.Uint16x8.LessEqual, lessEqualSlice[uint16]) + testUint32x4Compare(t, simd.Uint32x4.LessEqual, lessEqualSlice[uint32]) + testUint32x8Compare(t, simd.Uint32x8.LessEqual, lessEqualSlice[uint32]) + testUint64x2Compare(t, simd.Uint64x2.LessEqual, lessEqualSlice[uint64]) + testUint64x4Compare(t, simd.Uint64x4.LessEqual, lessEqualSlice[uint64]) + testUint8x16Compare(t, simd.Uint8x16.LessEqual, lessEqualSlice[uint8]) + testUint8x32Compare(t, simd.Uint8x32.LessEqual, lessEqualSlice[uint8]) + + if simd.X86.AVX512() { + testFloat32x16Compare(t, simd.Float32x16.LessEqual, lessEqualSlice[float32]) + testFloat64x8Compare(t, simd.Float64x8.LessEqual, lessEqualSlice[float64]) + testInt8x64Compare(t, simd.Int8x64.LessEqual, lessEqualSlice[int8]) + testInt16x32Compare(t, simd.Int16x32.LessEqual, lessEqualSlice[int16]) + testInt32x16Compare(t, simd.Int32x16.LessEqual, lessEqualSlice[int32]) + testInt64x8Compare(t, simd.Int64x8.LessEqual, lessEqualSlice[int64]) + testUint8x64Compare(t, simd.Uint8x64.LessEqual, lessEqualSlice[uint8]) + testUint16x32Compare(t, simd.Uint16x32.LessEqual, lessEqualSlice[uint16]) + testUint32x16Compare(t, simd.Uint32x16.LessEqual, lessEqualSlice[uint32]) + testUint64x8Compare(t, simd.Uint64x8.LessEqual, lessEqualSlice[uint64]) + } +} + +func TestGreater(t *testing.T) { + testFloat32x4Compare(t, simd.Float32x4.Greater, greaterSlice[float32]) + testFloat32x8Compare(t, simd.Float32x8.Greater, greaterSlice[float32]) + testFloat64x2Compare(t, simd.Float64x2.Greater, greaterSlice[float64]) + testFloat64x4Compare(t, simd.Float64x4.Greater, greaterSlice[float64]) + + testInt16x16Compare(t, simd.Int16x16.Greater, greaterSlice[int16]) + testInt16x8Compare(t, simd.Int16x8.Greater, greaterSlice[int16]) 
+ testInt32x4Compare(t, simd.Int32x4.Greater, greaterSlice[int32]) + testInt32x8Compare(t, simd.Int32x8.Greater, greaterSlice[int32]) + + testInt64x2Compare(t, simd.Int64x2.Greater, greaterSlice[int64]) + testInt64x4Compare(t, simd.Int64x4.Greater, greaterSlice[int64]) + testInt8x16Compare(t, simd.Int8x16.Greater, greaterSlice[int8]) + testInt8x32Compare(t, simd.Int8x32.Greater, greaterSlice[int8]) + + testUint16x16Compare(t, simd.Uint16x16.Greater, greaterSlice[uint16]) + testUint16x8Compare(t, simd.Uint16x8.Greater, greaterSlice[uint16]) + testUint32x4Compare(t, simd.Uint32x4.Greater, greaterSlice[uint32]) + testUint32x8Compare(t, simd.Uint32x8.Greater, greaterSlice[uint32]) + + testUint64x2Compare(t, simd.Uint64x2.Greater, greaterSlice[uint64]) + testUint64x4Compare(t, simd.Uint64x4.Greater, greaterSlice[uint64]) + testUint8x16Compare(t, simd.Uint8x16.Greater, greaterSlice[uint8]) + testUint8x32Compare(t, simd.Uint8x32.Greater, greaterSlice[uint8]) + + if simd.X86.AVX512() { + + testFloat32x16Compare(t, simd.Float32x16.Greater, greaterSlice[float32]) + testFloat64x8Compare(t, simd.Float64x8.Greater, greaterSlice[float64]) + testInt8x64Compare(t, simd.Int8x64.Greater, greaterSlice[int8]) + testInt16x32Compare(t, simd.Int16x32.Greater, greaterSlice[int16]) + testInt32x16Compare(t, simd.Int32x16.Greater, greaterSlice[int32]) + testInt64x8Compare(t, simd.Int64x8.Greater, greaterSlice[int64]) + testUint8x64Compare(t, simd.Uint8x64.Greater, greaterSlice[uint8]) + testUint16x32Compare(t, simd.Uint16x32.Greater, greaterSlice[uint16]) + testUint32x16Compare(t, simd.Uint32x16.Greater, greaterSlice[uint32]) + testUint64x8Compare(t, simd.Uint64x8.Greater, greaterSlice[uint64]) + } +} + +func TestGreaterEqual(t *testing.T) { + testFloat32x4Compare(t, simd.Float32x4.GreaterEqual, greaterEqualSlice[float32]) + testFloat32x8Compare(t, simd.Float32x8.GreaterEqual, greaterEqualSlice[float32]) + testFloat64x2Compare(t, simd.Float64x2.GreaterEqual, greaterEqualSlice[float64]) + 
testFloat64x4Compare(t, simd.Float64x4.GreaterEqual, greaterEqualSlice[float64]) + + testInt16x16Compare(t, simd.Int16x16.GreaterEqual, greaterEqualSlice[int16]) + testInt16x8Compare(t, simd.Int16x8.GreaterEqual, greaterEqualSlice[int16]) + testInt32x4Compare(t, simd.Int32x4.GreaterEqual, greaterEqualSlice[int32]) + testInt32x8Compare(t, simd.Int32x8.GreaterEqual, greaterEqualSlice[int32]) + testInt64x2Compare(t, simd.Int64x2.GreaterEqual, greaterEqualSlice[int64]) + testInt64x4Compare(t, simd.Int64x4.GreaterEqual, greaterEqualSlice[int64]) + testInt8x16Compare(t, simd.Int8x16.GreaterEqual, greaterEqualSlice[int8]) + testInt8x32Compare(t, simd.Int8x32.GreaterEqual, greaterEqualSlice[int8]) + + testUint16x16Compare(t, simd.Uint16x16.GreaterEqual, greaterEqualSlice[uint16]) + testUint16x8Compare(t, simd.Uint16x8.GreaterEqual, greaterEqualSlice[uint16]) + testUint32x4Compare(t, simd.Uint32x4.GreaterEqual, greaterEqualSlice[uint32]) + testUint32x8Compare(t, simd.Uint32x8.GreaterEqual, greaterEqualSlice[uint32]) + testUint64x2Compare(t, simd.Uint64x2.GreaterEqual, greaterEqualSlice[uint64]) + testUint64x4Compare(t, simd.Uint64x4.GreaterEqual, greaterEqualSlice[uint64]) + testUint8x16Compare(t, simd.Uint8x16.GreaterEqual, greaterEqualSlice[uint8]) + testUint8x32Compare(t, simd.Uint8x32.GreaterEqual, greaterEqualSlice[uint8]) + + if simd.X86.AVX512() { + testFloat32x16Compare(t, simd.Float32x16.GreaterEqual, greaterEqualSlice[float32]) + testFloat64x8Compare(t, simd.Float64x8.GreaterEqual, greaterEqualSlice[float64]) + testInt8x64Compare(t, simd.Int8x64.GreaterEqual, greaterEqualSlice[int8]) + testInt16x32Compare(t, simd.Int16x32.GreaterEqual, greaterEqualSlice[int16]) + testInt32x16Compare(t, simd.Int32x16.GreaterEqual, greaterEqualSlice[int32]) + testInt64x8Compare(t, simd.Int64x8.GreaterEqual, greaterEqualSlice[int64]) + testUint8x64Compare(t, simd.Uint8x64.GreaterEqual, greaterEqualSlice[uint8]) + testUint16x32Compare(t, simd.Uint16x32.GreaterEqual, 
greaterEqualSlice[uint16]) + testUint32x16Compare(t, simd.Uint32x16.GreaterEqual, greaterEqualSlice[uint32]) + testUint64x8Compare(t, simd.Uint64x8.GreaterEqual, greaterEqualSlice[uint64]) + } +} + +func TestEqual(t *testing.T) { + testFloat32x4Compare(t, simd.Float32x4.Equal, equalSlice[float32]) + testFloat32x8Compare(t, simd.Float32x8.Equal, equalSlice[float32]) + testFloat64x2Compare(t, simd.Float64x2.Equal, equalSlice[float64]) + testFloat64x4Compare(t, simd.Float64x4.Equal, equalSlice[float64]) + + testInt16x16Compare(t, simd.Int16x16.Equal, equalSlice[int16]) + testInt16x8Compare(t, simd.Int16x8.Equal, equalSlice[int16]) + testInt32x4Compare(t, simd.Int32x4.Equal, equalSlice[int32]) + testInt32x8Compare(t, simd.Int32x8.Equal, equalSlice[int32]) + testInt64x2Compare(t, simd.Int64x2.Equal, equalSlice[int64]) + testInt64x4Compare(t, simd.Int64x4.Equal, equalSlice[int64]) + testInt8x16Compare(t, simd.Int8x16.Equal, equalSlice[int8]) + testInt8x32Compare(t, simd.Int8x32.Equal, equalSlice[int8]) + + testUint16x16Compare(t, simd.Uint16x16.Equal, equalSlice[uint16]) + testUint16x8Compare(t, simd.Uint16x8.Equal, equalSlice[uint16]) + testUint32x4Compare(t, simd.Uint32x4.Equal, equalSlice[uint32]) + testUint32x8Compare(t, simd.Uint32x8.Equal, equalSlice[uint32]) + testUint64x2Compare(t, simd.Uint64x2.Equal, equalSlice[uint64]) + testUint64x4Compare(t, simd.Uint64x4.Equal, equalSlice[uint64]) + testUint8x16Compare(t, simd.Uint8x16.Equal, equalSlice[uint8]) + testUint8x32Compare(t, simd.Uint8x32.Equal, equalSlice[uint8]) + + if simd.X86.AVX512() { + testFloat32x16Compare(t, simd.Float32x16.Equal, equalSlice[float32]) + testFloat64x8Compare(t, simd.Float64x8.Equal, equalSlice[float64]) + testInt8x64Compare(t, simd.Int8x64.Equal, equalSlice[int8]) + testInt16x32Compare(t, simd.Int16x32.Equal, equalSlice[int16]) + testInt32x16Compare(t, simd.Int32x16.Equal, equalSlice[int32]) + testInt64x8Compare(t, simd.Int64x8.Equal, equalSlice[int64]) + testUint8x64Compare(t, 
simd.Uint8x64.Equal, equalSlice[uint8]) + testUint16x32Compare(t, simd.Uint16x32.Equal, equalSlice[uint16]) + testUint32x16Compare(t, simd.Uint32x16.Equal, equalSlice[uint32]) + testUint64x8Compare(t, simd.Uint64x8.Equal, equalSlice[uint64]) + } +} + +func TestNotEqual(t *testing.T) { + testFloat32x4Compare(t, simd.Float32x4.NotEqual, notEqualSlice[float32]) + testFloat32x8Compare(t, simd.Float32x8.NotEqual, notEqualSlice[float32]) + testFloat64x2Compare(t, simd.Float64x2.NotEqual, notEqualSlice[float64]) + testFloat64x4Compare(t, simd.Float64x4.NotEqual, notEqualSlice[float64]) + + testInt16x16Compare(t, simd.Int16x16.NotEqual, notEqualSlice[int16]) + testInt16x8Compare(t, simd.Int16x8.NotEqual, notEqualSlice[int16]) + testInt32x4Compare(t, simd.Int32x4.NotEqual, notEqualSlice[int32]) + testInt32x8Compare(t, simd.Int32x8.NotEqual, notEqualSlice[int32]) + testInt64x2Compare(t, simd.Int64x2.NotEqual, notEqualSlice[int64]) + testInt64x4Compare(t, simd.Int64x4.NotEqual, notEqualSlice[int64]) + testInt8x16Compare(t, simd.Int8x16.NotEqual, notEqualSlice[int8]) + testInt8x32Compare(t, simd.Int8x32.NotEqual, notEqualSlice[int8]) + + testUint16x16Compare(t, simd.Uint16x16.NotEqual, notEqualSlice[uint16]) + testUint16x8Compare(t, simd.Uint16x8.NotEqual, notEqualSlice[uint16]) + testUint32x4Compare(t, simd.Uint32x4.NotEqual, notEqualSlice[uint32]) + testUint32x8Compare(t, simd.Uint32x8.NotEqual, notEqualSlice[uint32]) + testUint64x2Compare(t, simd.Uint64x2.NotEqual, notEqualSlice[uint64]) + testUint64x4Compare(t, simd.Uint64x4.NotEqual, notEqualSlice[uint64]) + testUint8x16Compare(t, simd.Uint8x16.NotEqual, notEqualSlice[uint8]) + testUint8x32Compare(t, simd.Uint8x32.NotEqual, notEqualSlice[uint8]) + + if simd.X86.AVX512() { + testFloat32x16Compare(t, simd.Float32x16.NotEqual, notEqualSlice[float32]) + testFloat64x8Compare(t, simd.Float64x8.NotEqual, notEqualSlice[float64]) + testInt8x64Compare(t, simd.Int8x64.NotEqual, notEqualSlice[int8]) + testInt16x32Compare(t, 
simd.Int16x32.NotEqual, notEqualSlice[int16]) + testInt32x16Compare(t, simd.Int32x16.NotEqual, notEqualSlice[int32]) + testInt64x8Compare(t, simd.Int64x8.NotEqual, notEqualSlice[int64]) + testUint8x64Compare(t, simd.Uint8x64.NotEqual, notEqualSlice[uint8]) + testUint16x32Compare(t, simd.Uint16x32.NotEqual, notEqualSlice[uint16]) + testUint32x16Compare(t, simd.Uint32x16.NotEqual, notEqualSlice[uint32]) + testUint64x8Compare(t, simd.Uint64x8.NotEqual, notEqualSlice[uint64]) + } +} diff --git a/src/simd/internal/simd_test/comparemasked_helpers_test.go b/src/simd/internal/simd_test/comparemasked_helpers_test.go new file mode 100644 index 0000000000..4c05d10bb3 --- /dev/null +++ b/src/simd/internal/simd_test/comparemasked_helpers_test.go @@ -0,0 +1,734 @@ +// Code generated by 'go run genfiles.go'; DO NOT EDIT. + +//go:build goexperiment.simd + +// This file contains functions testing simd methods that compare two operands under a mask. +// Each function in this file is specialized for a +// particular simd type <BaseType><Width>x<Count>. + +package simd_test + +import ( + "simd" + "testing" +) + +// testInt8x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testInt8x16CompareMasked(t *testing.T, + f func(_, _ simd.Int8x16, m simd.Mask8x16) simd.Mask8x16, + want func(_, _ []int8) []int64) { + n := 16 + t.Helper() + forSlicePairMasked(t, int8s, n, func(x, y []int8, m []bool) bool { + t.Helper() + a := simd.LoadInt8x16Slice(x) + b := simd.LoadInt8x16Slice(y) + k := simd.LoadInt8x16Slice(toVect[int8](m)).ToMask() + g := make([]int8, n) + f(a, b, k).AsInt8x16().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testInt16x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testInt16x8CompareMasked(t *testing.T, + f func(_, _ simd.Int16x8, m simd.Mask16x8) simd.Mask16x8, + want func(_, _ []int16) []int64) { + n := 8 + t.Helper() + forSlicePairMasked(t, int16s, n, func(x, y []int16, m []bool) bool { + t.Helper() + a := simd.LoadInt16x8Slice(x) + b := simd.LoadInt16x8Slice(y) + k := simd.LoadInt16x8Slice(toVect[int16](m)).ToMask() + g := make([]int16, n) + f(a, b, k).AsInt16x8().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testInt32x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testInt32x4CompareMasked(t *testing.T, + f func(_, _ simd.Int32x4, m simd.Mask32x4) simd.Mask32x4, + want func(_, _ []int32) []int64) { + n := 4 + t.Helper() + forSlicePairMasked(t, int32s, n, func(x, y []int32, m []bool) bool { + t.Helper() + a := simd.LoadInt32x4Slice(x) + b := simd.LoadInt32x4Slice(y) + k := simd.LoadInt32x4Slice(toVect[int32](m)).ToMask() + g := make([]int32, n) + f(a, b, k).AsInt32x4().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testInt64x2CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testInt64x2CompareMasked(t *testing.T, + f func(_, _ simd.Int64x2, m simd.Mask64x2) simd.Mask64x2, + want func(_, _ []int64) []int64) { + n := 2 + t.Helper() + forSlicePairMasked(t, int64s, n, func(x, y []int64, m []bool) bool { + t.Helper() + a := simd.LoadInt64x2Slice(x) + b := simd.LoadInt64x2Slice(y) + k := simd.LoadInt64x2Slice(toVect[int64](m)).ToMask() + g := make([]int64, n) + f(a, b, k).AsInt64x2().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testUint8x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testUint8x16CompareMasked(t *testing.T, + f func(_, _ simd.Uint8x16, m simd.Mask8x16) simd.Mask8x16, + want func(_, _ []uint8) []int64) { + n := 16 + t.Helper() + forSlicePairMasked(t, uint8s, n, func(x, y []uint8, m []bool) bool { + t.Helper() + a := simd.LoadUint8x16Slice(x) + b := simd.LoadUint8x16Slice(y) + k := simd.LoadInt8x16Slice(toVect[int8](m)).ToMask() + g := make([]int8, n) + f(a, b, k).AsInt8x16().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testUint16x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testUint16x8CompareMasked(t *testing.T, + f func(_, _ simd.Uint16x8, m simd.Mask16x8) simd.Mask16x8, + want func(_, _ []uint16) []int64) { + n := 8 + t.Helper() + forSlicePairMasked(t, uint16s, n, func(x, y []uint16, m []bool) bool { + t.Helper() + a := simd.LoadUint16x8Slice(x) + b := simd.LoadUint16x8Slice(y) + k := simd.LoadInt16x8Slice(toVect[int16](m)).ToMask() + g := make([]int16, n) + f(a, b, k).AsInt16x8().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testUint32x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testUint32x4CompareMasked(t *testing.T, + f func(_, _ simd.Uint32x4, m simd.Mask32x4) simd.Mask32x4, + want func(_, _ []uint32) []int64) { + n := 4 + t.Helper() + forSlicePairMasked(t, uint32s, n, func(x, y []uint32, m []bool) bool { + t.Helper() + a := simd.LoadUint32x4Slice(x) + b := simd.LoadUint32x4Slice(y) + k := simd.LoadInt32x4Slice(toVect[int32](m)).ToMask() + g := make([]int32, n) + f(a, b, k).AsInt32x4().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testUint64x2CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testUint64x2CompareMasked(t *testing.T, + f func(_, _ simd.Uint64x2, m simd.Mask64x2) simd.Mask64x2, + want func(_, _ []uint64) []int64) { + n := 2 + t.Helper() + forSlicePairMasked(t, uint64s, n, func(x, y []uint64, m []bool) bool { + t.Helper() + a := simd.LoadUint64x2Slice(x) + b := simd.LoadUint64x2Slice(y) + k := simd.LoadInt64x2Slice(toVect[int64](m)).ToMask() + g := make([]int64, n) + f(a, b, k).AsInt64x2().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testFloat32x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testFloat32x4CompareMasked(t *testing.T, + f func(_, _ simd.Float32x4, m simd.Mask32x4) simd.Mask32x4, + want func(_, _ []float32) []int64) { + n := 4 + t.Helper() + forSlicePairMasked(t, float32s, n, func(x, y []float32, m []bool) bool { + t.Helper() + a := simd.LoadFloat32x4Slice(x) + b := simd.LoadFloat32x4Slice(y) + k := simd.LoadInt32x4Slice(toVect[int32](m)).ToMask() + g := make([]int32, n) + f(a, b, k).AsInt32x4().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testFloat64x2CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testFloat64x2CompareMasked(t *testing.T, + f func(_, _ simd.Float64x2, m simd.Mask64x2) simd.Mask64x2, + want func(_, _ []float64) []int64) { + n := 2 + t.Helper() + forSlicePairMasked(t, float64s, n, func(x, y []float64, m []bool) bool { + t.Helper() + a := simd.LoadFloat64x2Slice(x) + b := simd.LoadFloat64x2Slice(y) + k := simd.LoadInt64x2Slice(toVect[int64](m)).ToMask() + g := make([]int64, n) + f(a, b, k).AsInt64x2().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testInt8x32CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testInt8x32CompareMasked(t *testing.T, + f func(_, _ simd.Int8x32, m simd.Mask8x32) simd.Mask8x32, + want func(_, _ []int8) []int64) { + n := 32 + t.Helper() + forSlicePairMasked(t, int8s, n, func(x, y []int8, m []bool) bool { + t.Helper() + a := simd.LoadInt8x32Slice(x) + b := simd.LoadInt8x32Slice(y) + k := simd.LoadInt8x32Slice(toVect[int8](m)).ToMask() + g := make([]int8, n) + f(a, b, k).AsInt8x32().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testInt16x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testInt16x16CompareMasked(t *testing.T, + f func(_, _ simd.Int16x16, m simd.Mask16x16) simd.Mask16x16, + want func(_, _ []int16) []int64) { + n := 16 + t.Helper() + forSlicePairMasked(t, int16s, n, func(x, y []int16, m []bool) bool { + t.Helper() + a := simd.LoadInt16x16Slice(x) + b := simd.LoadInt16x16Slice(y) + k := simd.LoadInt16x16Slice(toVect[int16](m)).ToMask() + g := make([]int16, n) + f(a, b, k).AsInt16x16().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testInt32x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testInt32x8CompareMasked(t *testing.T, + f func(_, _ simd.Int32x8, m simd.Mask32x8) simd.Mask32x8, + want func(_, _ []int32) []int64) { + n := 8 + t.Helper() + forSlicePairMasked(t, int32s, n, func(x, y []int32, m []bool) bool { + t.Helper() + a := simd.LoadInt32x8Slice(x) + b := simd.LoadInt32x8Slice(y) + k := simd.LoadInt32x8Slice(toVect[int32](m)).ToMask() + g := make([]int32, n) + f(a, b, k).AsInt32x8().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testInt64x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testInt64x4CompareMasked(t *testing.T, + f func(_, _ simd.Int64x4, m simd.Mask64x4) simd.Mask64x4, + want func(_, _ []int64) []int64) { + n := 4 + t.Helper() + forSlicePairMasked(t, int64s, n, func(x, y []int64, m []bool) bool { + t.Helper() + a := simd.LoadInt64x4Slice(x) + b := simd.LoadInt64x4Slice(y) + k := simd.LoadInt64x4Slice(toVect[int64](m)).ToMask() + g := make([]int64, n) + f(a, b, k).AsInt64x4().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testUint8x32CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testUint8x32CompareMasked(t *testing.T, + f func(_, _ simd.Uint8x32, m simd.Mask8x32) simd.Mask8x32, + want func(_, _ []uint8) []int64) { + n := 32 + t.Helper() + forSlicePairMasked(t, uint8s, n, func(x, y []uint8, m []bool) bool { + t.Helper() + a := simd.LoadUint8x32Slice(x) + b := simd.LoadUint8x32Slice(y) + k := simd.LoadInt8x32Slice(toVect[int8](m)).ToMask() + g := make([]int8, n) + f(a, b, k).AsInt8x32().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testUint16x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testUint16x16CompareMasked(t *testing.T, + f func(_, _ simd.Uint16x16, m simd.Mask16x16) simd.Mask16x16, + want func(_, _ []uint16) []int64) { + n := 16 + t.Helper() + forSlicePairMasked(t, uint16s, n, func(x, y []uint16, m []bool) bool { + t.Helper() + a := simd.LoadUint16x16Slice(x) + b := simd.LoadUint16x16Slice(y) + k := simd.LoadInt16x16Slice(toVect[int16](m)).ToMask() + g := make([]int16, n) + f(a, b, k).AsInt16x16().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testUint32x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testUint32x8CompareMasked(t *testing.T, + f func(_, _ simd.Uint32x8, m simd.Mask32x8) simd.Mask32x8, + want func(_, _ []uint32) []int64) { + n := 8 + t.Helper() + forSlicePairMasked(t, uint32s, n, func(x, y []uint32, m []bool) bool { + t.Helper() + a := simd.LoadUint32x8Slice(x) + b := simd.LoadUint32x8Slice(y) + k := simd.LoadInt32x8Slice(toVect[int32](m)).ToMask() + g := make([]int32, n) + f(a, b, k).AsInt32x8().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testUint64x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testUint64x4CompareMasked(t *testing.T, + f func(_, _ simd.Uint64x4, m simd.Mask64x4) simd.Mask64x4, + want func(_, _ []uint64) []int64) { + n := 4 + t.Helper() + forSlicePairMasked(t, uint64s, n, func(x, y []uint64, m []bool) bool { + t.Helper() + a := simd.LoadUint64x4Slice(x) + b := simd.LoadUint64x4Slice(y) + k := simd.LoadInt64x4Slice(toVect[int64](m)).ToMask() + g := make([]int64, n) + f(a, b, k).AsInt64x4().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testFloat32x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testFloat32x8CompareMasked(t *testing.T, + f func(_, _ simd.Float32x8, m simd.Mask32x8) simd.Mask32x8, + want func(_, _ []float32) []int64) { + n := 8 + t.Helper() + forSlicePairMasked(t, float32s, n, func(x, y []float32, m []bool) bool { + t.Helper() + a := simd.LoadFloat32x8Slice(x) + b := simd.LoadFloat32x8Slice(y) + k := simd.LoadInt32x8Slice(toVect[int32](m)).ToMask() + g := make([]int32, n) + f(a, b, k).AsInt32x8().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testFloat64x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testFloat64x4CompareMasked(t *testing.T, + f func(_, _ simd.Float64x4, m simd.Mask64x4) simd.Mask64x4, + want func(_, _ []float64) []int64) { + n := 4 + t.Helper() + forSlicePairMasked(t, float64s, n, func(x, y []float64, m []bool) bool { + t.Helper() + a := simd.LoadFloat64x4Slice(x) + b := simd.LoadFloat64x4Slice(y) + k := simd.LoadInt64x4Slice(toVect[int64](m)).ToMask() + g := make([]int64, n) + f(a, b, k).AsInt64x4().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testInt8x64CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testInt8x64CompareMasked(t *testing.T, + f func(_, _ simd.Int8x64, m simd.Mask8x64) simd.Mask8x64, + want func(_, _ []int8) []int64) { + n := 64 + t.Helper() + forSlicePairMasked(t, int8s, n, func(x, y []int8, m []bool) bool { + t.Helper() + a := simd.LoadInt8x64Slice(x) + b := simd.LoadInt8x64Slice(y) + k := simd.LoadInt8x64Slice(toVect[int8](m)).ToMask() + g := make([]int8, n) + f(a, b, k).AsInt8x64().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testInt16x32CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testInt16x32CompareMasked(t *testing.T, + f func(_, _ simd.Int16x32, m simd.Mask16x32) simd.Mask16x32, + want func(_, _ []int16) []int64) { + n := 32 + t.Helper() + forSlicePairMasked(t, int16s, n, func(x, y []int16, m []bool) bool { + t.Helper() + a := simd.LoadInt16x32Slice(x) + b := simd.LoadInt16x32Slice(y) + k := simd.LoadInt16x32Slice(toVect[int16](m)).ToMask() + g := make([]int16, n) + f(a, b, k).AsInt16x32().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testInt32x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testInt32x16CompareMasked(t *testing.T, + f func(_, _ simd.Int32x16, m simd.Mask32x16) simd.Mask32x16, + want func(_, _ []int32) []int64) { + n := 16 + t.Helper() + forSlicePairMasked(t, int32s, n, func(x, y []int32, m []bool) bool { + t.Helper() + a := simd.LoadInt32x16Slice(x) + b := simd.LoadInt32x16Slice(y) + k := simd.LoadInt32x16Slice(toVect[int32](m)).ToMask() + g := make([]int32, n) + f(a, b, k).AsInt32x16().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testInt64x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testInt64x8CompareMasked(t *testing.T, + f func(_, _ simd.Int64x8, m simd.Mask64x8) simd.Mask64x8, + want func(_, _ []int64) []int64) { + n := 8 + t.Helper() + forSlicePairMasked(t, int64s, n, func(x, y []int64, m []bool) bool { + t.Helper() + a := simd.LoadInt64x8Slice(x) + b := simd.LoadInt64x8Slice(y) + k := simd.LoadInt64x8Slice(toVect[int64](m)).ToMask() + g := make([]int64, n) + f(a, b, k).AsInt64x8().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testUint8x64CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testUint8x64CompareMasked(t *testing.T, + f func(_, _ simd.Uint8x64, m simd.Mask8x64) simd.Mask8x64, + want func(_, _ []uint8) []int64) { + n := 64 + t.Helper() + forSlicePairMasked(t, uint8s, n, func(x, y []uint8, m []bool) bool { + t.Helper() + a := simd.LoadUint8x64Slice(x) + b := simd.LoadUint8x64Slice(y) + k := simd.LoadInt8x64Slice(toVect[int8](m)).ToMask() + g := make([]int8, n) + f(a, b, k).AsInt8x64().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testUint16x32CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testUint16x32CompareMasked(t *testing.T, + f func(_, _ simd.Uint16x32, m simd.Mask16x32) simd.Mask16x32, + want func(_, _ []uint16) []int64) { + n := 32 + t.Helper() + forSlicePairMasked(t, uint16s, n, func(x, y []uint16, m []bool) bool { + t.Helper() + a := simd.LoadUint16x32Slice(x) + b := simd.LoadUint16x32Slice(y) + k := simd.LoadInt16x32Slice(toVect[int16](m)).ToMask() + g := make([]int16, n) + f(a, b, k).AsInt16x32().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testUint32x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testUint32x16CompareMasked(t *testing.T, + f func(_, _ simd.Uint32x16, m simd.Mask32x16) simd.Mask32x16, + want func(_, _ []uint32) []int64) { + n := 16 + t.Helper() + forSlicePairMasked(t, uint32s, n, func(x, y []uint32, m []bool) bool { + t.Helper() + a := simd.LoadUint32x16Slice(x) + b := simd.LoadUint32x16Slice(y) + k := simd.LoadInt32x16Slice(toVect[int32](m)).ToMask() + g := make([]int32, n) + f(a, b, k).AsInt32x16().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testUint64x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testUint64x8CompareMasked(t *testing.T, + f func(_, _ simd.Uint64x8, m simd.Mask64x8) simd.Mask64x8, + want func(_, _ []uint64) []int64) { + n := 8 + t.Helper() + forSlicePairMasked(t, uint64s, n, func(x, y []uint64, m []bool) bool { + t.Helper() + a := simd.LoadUint64x8Slice(x) + b := simd.LoadUint64x8Slice(y) + k := simd.LoadInt64x8Slice(toVect[int64](m)).ToMask() + g := make([]int64, n) + f(a, b, k).AsInt64x8().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testFloat32x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. 
+func testFloat32x16CompareMasked(t *testing.T, + f func(_, _ simd.Float32x16, m simd.Mask32x16) simd.Mask32x16, + want func(_, _ []float32) []int64) { + n := 16 + t.Helper() + forSlicePairMasked(t, float32s, n, func(x, y []float32, m []bool) bool { + t.Helper() + a := simd.LoadFloat32x16Slice(x) + b := simd.LoadFloat32x16Slice(y) + k := simd.LoadInt32x16Slice(toVect[int32](m)).ToMask() + g := make([]int32, n) + f(a, b, k).AsInt32x16().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} + +// testFloat64x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want +// The mask is applied to the output of want; anything not in the mask, is zeroed. +func testFloat64x8CompareMasked(t *testing.T, + f func(_, _ simd.Float64x8, m simd.Mask64x8) simd.Mask64x8, + want func(_, _ []float64) []int64) { + n := 8 + t.Helper() + forSlicePairMasked(t, float64s, n, func(x, y []float64, m []bool) bool { + t.Helper() + a := simd.LoadFloat64x8Slice(x) + b := simd.LoadFloat64x8Slice(y) + k := simd.LoadInt64x8Slice(toVect[int64](m)).ToMask() + g := make([]int64, n) + f(a, b, k).AsInt64x8().StoreSlice(g) + w := want(x, y) + for i := range m { + if !m[i] { + w[i] = 0 + } + } + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) }) + }) +} diff --git a/src/simd/internal/simd_test/generate.go b/src/simd/internal/simd_test/generate.go new file mode 100644 index 0000000000..e744a5299f --- /dev/null +++ b/src/simd/internal/simd_test/generate.go @@ -0,0 +1,11 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd + +package simd + +// Invoke code generators. 
+ +//go:generate go run -C ../.. genfiles.go diff --git a/src/simd/internal/simd_test/helpers_test.go b/src/simd/internal/simd_test/helpers_test.go new file mode 100644 index 0000000000..0a246e0d7d --- /dev/null +++ b/src/simd/internal/simd_test/helpers_test.go @@ -0,0 +1,239 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && amd64 + +package simd_test + +import ( + "math" + "simd/internal/test_helpers" + "testing" +) + +type signed interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 +} + +type integer interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr +} + +type float interface { + ~float32 | ~float64 +} + +type number interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr | ~float32 | ~float64 +} + +func checkSlices[T number](t *testing.T, got, want []T) bool { + t.Helper() + return test_helpers.CheckSlicesLogInput[T](t, got, want, 0.0, nil) +} + +func checkSlicesLogInput[T number](t *testing.T, got, want []T, flakiness float64, logInput func()) bool { + t.Helper() + return test_helpers.CheckSlicesLogInput[T](t, got, want, flakiness, logInput) +} + +// sliceOf returns a slice n T's, with each +// element of the slice initialized to its +// index + 1. +func sliceOf[T number](n int) []T { + s := make([]T, n) + for i := 0; i < n; i++ { + s[i] = T(i + 1) + } + return s +} + +func toVect[T signed](b []bool) []T { + s := make([]T, len(b)) + for i := range b { + if b[i] { + s[i] = -1 + } + } + return s +} + +// s64 converts a slice of some integer type into a slice of int64 +func s64[T number](s []T) []int64 { + var is any = s + if r, ok := is.([]int64); ok { + return r + } + r := make([]int64, len(s)) + for i := range s { + r[i] = int64(s[i]) + } + return r +} + +// Do implements slice part testing. 
It repeatedly calls +// body on smaller and smaller slices and an output slice +// for the result, then compares the result to its own +// calculation of what the result should be. +func Do[T number](t *testing.T, n int, body func(a, c []T)) { + a := sliceOf[T](n) + b := sliceOf[T](n) + + for i := n; i >= 0; i-- { + c := make([]T, n, n) + body(a[:i], c) + checkSlices(t, c, b) + if i > 0 { + b[i-1] = T(0) + } + } +} + +// map3 returns a function that returns the slice of the results of applying +// input parameter elem to the respective elements of its 3 slice inputs. +func map3[T, U any](elem func(x, y, z T) U) func(x, y, z []T) []U { + return func(x, y, z []T) []U { + s := make([]U, len(x)) + for i := range s { + s[i] = elem(x[i], y[i], z[i]) + } + return s + } +} + +// map2 returns a function that returns the slice of the results of applying +// input parameter elem to the respective elements of its 2 slice inputs. +func map2[T, U any](elem func(x, y T) U) func(x, y []T) []U { + return func(x, y []T) []U { + s := make([]U, len(x)) + for i := range s { + s[i] = elem(x[i], y[i]) + } + return s + } +} + +// map1 returns a function that returns the slice of the results of applying +// input parameter elem to the respective elements of its single slice input. +func map1[T, U any](elem func(x T) U) func(x []T) []U { + return func(x []T) []U { + s := make([]U, len(x)) + for i := range s { + s[i] = elem(x[i]) + } + return s + } +} + +// map1 returns a function that returns the slice of the results of applying +// comparison function elem to the respective elements of its two slice inputs. +func mapCompare[T number](elem func(x, y T) bool) func(x, y []T) []int64 { + return func(x, y []T) []int64 { + s := make([]int64, len(x)) + for i := range s { + if elem(x[i], y[i]) { + s[i] = -1 + } + } + return s + } +} + +// nOf returns a slice of length n whose elements are taken +// from input slice s. 
+func nOf[T any](n int, s []T) []T { + if len(s) >= n { + return s + } + r := make([]T, n) + for i := range r { + r[i] = s[i%len(s)] + } + return r +} + +const ( + PN22 = 1.0 / 1024 / 1024 / 4 + PN24 = 1.0 / 1024 / 1024 / 16 + PN53 = PN24 * PN24 / 32 + F0 = float32(1.0 + 513*PN22/2) + F1 = float32(1.0 + 511*PN22*8) + Aeasy = float32(2046 * PN53) + Ahard = float32(2047 * PN53) // 2047 provokes a 2-rounding in 64-bit FMA rounded to 32-bit +) + +var zero = 0.0 +var nzero = -zero +var inf = 1 / zero +var ninf = -1 / zero +var nan = math.NaN() + +// N controls how large the test vectors are +const N = 144 + +var float32s = nOf(N, []float32{float32(inf), float32(ninf), 1, float32(nan), float32(zero), 2, float32(nan), float32(zero), 3, float32(-zero), float32(1.0 / zero), float32(-1.0 / zero), 1.0 / 2, 1.0 / 4, 1.0 / 8, 1.0 / 1000, 1.0 / 1000000, 1, -1, 0, 2, -2, 3, -3, math.MaxFloat32, 1 / math.MaxFloat32, 10, -10, 100, 20, -20, 300, -300, -4000, -80, -160, -3200, -64, -4, -8, -16, -32, -64}) +var float64s = nOf(N, []float64{inf, ninf, nan, zero, -zero, 1 / zero, -1 / zero, 0.0001, 0.0000001, 1, -1, 0, 2, -2, 3, -3, math.MaxFloat64, 1.0 / math.MaxFloat64, 10, -10, 100, 20, -20, 300, -300, -4000, -80, -16, -32, -64}) + +var int32s = nOf(N, []int32{1, -1, 0, 2, 4, 8, 1024, 0xffffff, -0xffffff, 0x55555, 0x77777, 0xccccc, -0x55555, -0x77777, -0xccccc, -4, -8, -16, -32, -64}) +var uint32s = nOf(N, []uint32{1, 0, 2, 4, 8, 1024, 0xffffff, ^uint32(0xffffff), 0x55555, 0x77777, 0xccccc, ^uint32(0x55555), ^uint32(0x77777), ^uint32(0xccccc)}) + +var int64s = nOf(N, []int64{1, -1, 0, 2, 4, 8, 1024, 0xffffff, -0xffffff, 0x55555, 0x77777, 0xccccc, -0x55555, -0x77777, -0xccccc, -4, -8, -16, -32, -64}) +var uint64s = nOf(N, []uint64{1, 0, 2, 4, 8, 1024, 0xffffff, ^uint64(0xffffff), 0x55555, 0x77777, 0xccccc, ^uint64(0x55555), ^uint64(0x77777), ^uint64(0xccccc)}) + +var int16s = nOf(N, []int16{1, -1, 0, 2, 4, 8, 1024, 3, 5, 7, 11, 13, 3000, 5555, 7777, 11111, 32767, 32766, -32767, -32768, 
-11111, -4, -8, -16, -32, -64}) +var uint16s = nOf(N, []uint16{1, 0, 2, 4, 8, 1024, 3, 5, 7, 11, 13, 3000, 5555, 7777, 11111, 32767, 32766, 32768, 65535, 45678, 56789}) + +var int8s = nOf(N, []int8{0, 1, 2, 3, 5, 7, 11, 22, 33, 55, 77, 121, 127, -1, -2, -3, -5, -7, -11, -77, -121, -127, -128, 4, 8, 16, 32, 64, -4, -8, -16, -32, -64}) +var uint8s = nOf(N, []uint8{0, 1, 2, 3, 5, 7, 11, 22, 33, 55, 77, 121, 127, 128, 255, 233, 211, 177, 144, 4, 8, 16, 32, 64}) + +var bools = nOf(N, []bool{ + true, false, true, true, false, false, true, true, true, false, false, false, true, true, true, true, false, false, false, false}) + +func forSlice[T number](t *testing.T, s []T, n int, f func(a []T) bool) { + t.Helper() + for i := 0; i < len(s)-n; i++ { + if !f(s[i : i+n]) { + return + } + } +} + +func forSlicePair[T number](t *testing.T, s []T, n int, f func(a, b []T) bool) { + t.Helper() + for i := 0; i < len(s)-n; i++ { + for j := 0; j < len(s)-n; j++ { + if !f(s[i:i+n], s[j:j+n]) { + return + } + } + } +} + +func forSliceTriple[T number](t *testing.T, s []T, n int, f func(a, b, c []T) bool) { + t.Helper() + for i := 0; i < len(s)-n; i += 3 { + for j := 0; j < len(s)-n; j += 3 { + for k := 0; k < len(s)-n; k += 3 { + if !f(s[i:i+n], s[j:j+n], s[k:k+n]) { + return + } + } + } + } +} + +func forSlicePairMasked[T number](t *testing.T, s []T, n int, f func(a, b []T, m []bool) bool) { + t.Helper() + m := bools + // Step slice pair masked forward much more quickly, otherwise it is slooooow + for i := 0; i < len(s)-n; i += 3 { + for j := 0; j < len(s)-n; j += 3 { + for k := 0; k < len(m)-n; k += 3 { + if !f(s[i:i+n], s[j:j+n], m[k:k+n]) { + return + } + } + } + } +} diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go new file mode 100644 index 0000000000..f51e3dc15f --- /dev/null +++ b/src/simd/internal/simd_test/simd_test.go @@ -0,0 +1,1248 @@ +// Copyright 2025 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && amd64 + +package simd_test + +import ( + "reflect" + "simd" + "slices" + "testing" +) + +var sink any + +func TestType(t *testing.T) { + // Testing: + // - Defined as another struct's field is ok + // - Pointer is ok + // - Type defition is ok + // - Type alias is ok + // - Type conversion is ok + // - Conversion to interface is ok + type alias = simd.Int32x4 + type maskT simd.Mask32x4 + type myStruct struct { + x alias + y *simd.Int32x4 + z maskT + } + vals := [4]int32{1, 2, 3, 4} + v := myStruct{x: simd.LoadInt32x4(&vals)} + // masking elements 1 and 2. + want := []int32{2, 4, 0, 0} + y := simd.LoadInt32x4(&vals) + v.y = &y + sink = y + + if !simd.X86.AVX512GFNI() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + v.z = maskT(simd.Mask32x4FromBits(0b0011)) + *v.y = v.y.Add(v.x).Masked(simd.Mask32x4(v.z)) + + got := [4]int32{} + v.y.Store(&got) + for i := range 4 { + if want[i] != got[i] { + t.Errorf("Result at %d incorrect: want %d, got %d", i, want[i], got[i]) + } + } +} + +func TestUncomparable(t *testing.T) { + // Test that simd vectors are not comparable + var x, y any = simd.LoadUint32x4(&[4]uint32{1, 2, 3, 4}), simd.LoadUint32x4(&[4]uint32{5, 6, 7, 8}) + shouldPanic := func(fn func()) { + defer func() { + if recover() == nil { + panic("did not panic") + } + }() + fn() + } + shouldPanic(func() { _ = x == y }) +} + +func TestFuncValue(t *testing.T) { + // Test that simd intrinsic can be used as a function value. 
+ xv := [4]int32{1, 2, 3, 4} + yv := [4]int32{5, 6, 7, 8} + want := []int32{6, 8, 10, 12} + x := simd.LoadInt32x4(&xv) + y := simd.LoadInt32x4(&yv) + fn := simd.Int32x4.Add + sink = fn + x = fn(x, y) + got := [4]int32{} + x.Store(&got) + for i := range 4 { + if want[i] != got[i] { + t.Errorf("Result at %d incorrect: want %d, got %d", i, want[i], got[i]) + } + } +} + +func TestReflectMethod(t *testing.T) { + // Test that simd intrinsic can be accessed via reflection. + // NOTE: we don't yet support reflect method.Call. + xv := [4]int32{1, 2, 3, 4} + yv := [4]int32{5, 6, 7, 8} + want := []int32{6, 8, 10, 12} + x := simd.LoadInt32x4(&xv) + y := simd.LoadInt32x4(&yv) + m, ok := reflect.TypeOf(x).MethodByName("Add") + if !ok { + t.Fatal("Add method not found") + } + fn := m.Func.Interface().(func(x, y simd.Int32x4) simd.Int32x4) + x = fn(x, y) + got := [4]int32{} + x.Store(&got) + for i := range 4 { + if want[i] != got[i] { + t.Errorf("Result at %d incorrect: want %d, got %d", i, want[i], got[i]) + } + } +} + +func TestVectorConversion(t *testing.T) { + if !simd.X86.AVX512GFNI() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + xv := [4]int32{1, 2, 3, 4} + x := simd.LoadInt32x4(&xv) + xPromoted := x.AsInt64x2() + xPromotedDemoted := xPromoted.AsInt32x4() + got := [4]int32{} + xPromotedDemoted.Store(&got) + for i := range 4 { + if xv[i] != got[i] { + t.Errorf("Result at %d incorrect: want %d, got %d", i, xv[i], got[i]) + } + } +} + +func TestMaskConversion(t *testing.T) { + if !simd.X86.AVX512GFNI() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + x := simd.LoadInt32x4Slice([]int32{5, 0, 7, 0}) + mask := simd.Int32x4{}.Sub(x).ToMask() + y := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4}).Add(x).Masked(mask) + want := [4]int32{6, 0, 10, 0} + got := make([]int32, 4) + y.StoreSlice(got) + for i := range 4 { + if want[i] != got[i] { + t.Errorf("Result at %d incorrect: want %d, got %d", i, want[i], got[i]) + 
} + } +} + +func TestPermute(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + x := []int64{1, 2, 3, 4, 5, 6, 7, 8} + indices := []uint64{7, 6, 5, 4, 3, 2, 1, 0} + want := []int64{8, 7, 6, 5, 4, 3, 2, 1} + got := make([]int64, 8) + simd.LoadInt64x8Slice(x).Permute(simd.LoadUint64x8Slice(indices)).StoreSlice(got) + for i := range 8 { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteOrZero(t *testing.T) { + x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11} + want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12} + got := make([]uint8, len(x)) + simd.LoadUint8x16Slice(x).PermuteOrZero(simd.LoadInt8x16Slice(indices)).StoreSlice(got) + for i := range 8 { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestConcatPermute(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + x := []int64{1, 2, 3, 4, 5, 6, 7, 8} + y := []int64{-1, -2, -3, -4, -5, -6, -7, -8} + indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0} + want := []int64{-8, 7, -6, 5, -4, 3, -2, 1} + got := make([]int64, 8) + simd.LoadInt64x8Slice(x).ConcatPermute(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got) + for i := range 8 { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestCompress(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + v1234 := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4}) + v2400 := v1234.Compress(simd.Mask32x4FromBits(0b1010)) + got := make([]int32, 4) + v2400.StoreSlice(got) + want := []int32{2, 
4, 0, 0} + if !slices.Equal(got, want) { + t.Errorf("want and got differ, want=%v, got=%v", want, got) + } +} + +func TestExpand(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + v3400 := simd.LoadInt32x4Slice([]int32{3, 4, 0, 0}) + v2400 := v3400.Expand(simd.Mask32x4FromBits(0b1010)) + got := make([]int32, 4) + v2400.StoreSlice(got) + want := []int32{0, 3, 0, 4} + if !slices.Equal(got, want) { + t.Errorf("want and got differ, want=%v, got=%v", want, got) + } +} + +var testShiftAllVal uint64 = 3 + +func TestShiftAll(t *testing.T) { + got := make([]int32, 4) + simd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).ShiftAllLeft(2).StoreSlice(got) + for _, v := range got { + if v != 0b1100 { + t.Errorf("expect 0b1100, got %b", v) + } + } + simd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).ShiftAllLeft(testShiftAllVal).StoreSlice(got) + for _, v := range got { + if v != 0b11000 { + t.Errorf("expect 0b11000, got %b", v) + } + } +} + +func TestSlicesInt8(t *testing.T) { + a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} + v := simd.LoadInt8x32Slice(a) + b := make([]int8, 32, 32) + v.StoreSlice(b) + checkSlices(t, a, b) +} + +func TestSlicesInt8SetElem(t *testing.T) { + a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} + v := simd.LoadInt8x16Slice(a) + + v = v.SetElem(3, 13) + a[3] = 13 + + b := make([]int8, 16, 16) + v.StoreSlice(b) + checkSlices(t, a, b) +} + +func TestSlicesInt8GetElem(t *testing.T) { + a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} + v := simd.LoadInt8x16Slice(a) + e := v.GetElem(2) + if e != a[2] { + t.Errorf("GetElem(2) = %d != a[2] = %d", e, a[2]) + } + +} + +func TestSlicesInt8TooShortLoad(t *testing.T) { + defer func() 
{ + if r := recover(); r != nil { + t.Logf("Saw EXPECTED panic %v", r) + } else { + t.Errorf("Did not see expected panic") + } + }() + a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31} // TOO SHORT, should panic + v := simd.LoadInt8x32Slice(a) + b := make([]int8, 32, 32) + v.StoreSlice(b) + checkSlices(t, a, b) +} + +func TestSlicesInt8TooShortStore(t *testing.T) { + defer func() { + if r := recover(); r != nil { + t.Logf("Saw EXPECTED panic %v", r) + } else { + t.Errorf("Did not see expected panic") + } + }() + a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} + v := simd.LoadInt8x32Slice(a) + b := make([]int8, 31) // TOO SHORT, should panic + v.StoreSlice(b) + checkSlices(t, a, b) +} + +func TestSlicesFloat64(t *testing.T) { + a := []float64{1, 2, 3, 4, 5, 6, 7, 8} // too long, should be fine + v := simd.LoadFloat64x4Slice(a) + b := make([]float64, 4, 4) + v.StoreSlice(b) + for i := range b { + if a[i] != b[i] { + t.Errorf("a and b differ at index %d, a=%f, b=%f", i, a[i], b[i]) + } + } +} + +// TODO: try to reduce this test to be smaller. 
+func TestMergeLocals(t *testing.T) { + testMergeLocalswrapper(t, simd.Int64x4.Add) +} + +//go:noinline +func forceSpill() {} + +func testMergeLocalswrapper(t *testing.T, op func(simd.Int64x4, simd.Int64x4) simd.Int64x4) { + t.Helper() + s0 := []int64{0, 1, 2, 3} + s1 := []int64{-1, 0, -1, 0} + want := []int64{-1, 1, 1, 3} + v := simd.LoadInt64x4Slice(s0) + m := simd.LoadInt64x4Slice(s1) + forceSpill() + got := make([]int64, 4) + gotv := op(v, m) + gotv.StoreSlice(got) + for i := range len(want) { + if !(got[i] == want[i]) { + t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) + } + } +} + +func TestBitMaskFromBits(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + results := [2]int64{} + want := [2]int64{0, 6} + m := simd.Mask64x2FromBits(0b10) + simd.LoadInt64x2Slice([]int64{1, 2}).Add(simd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results) + for i := range 2 { + if results[i] != want[i] { + t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i]) + } + } +} + +var maskForTestBitMaskFromBitsLoad = uint8(0b10) + +func TestBitMaskFromBitsLoad(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + results := [2]int64{} + want := [2]int64{0, 6} + m := simd.Mask64x2FromBits(maskForTestBitMaskFromBitsLoad) + simd.LoadInt64x2Slice([]int64{1, 2}).Add(simd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results) + for i := range 2 { + if results[i] != want[i] { + t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i]) + } + } +} + +func TestBitMaskToBits(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + if v := simd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits(); v != 0b101 { + t.Errorf("Want 0b101, got %b", v) + } +} + +var maskForTestBitMaskFromBitsStore uint8 + 
+func TestBitMaskToBitsStore(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + maskForTestBitMaskFromBitsStore = simd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits() + if maskForTestBitMaskFromBitsStore != 0b101 { + t.Errorf("Want 0b101, got %b", maskForTestBitMaskFromBitsStore) + } +} + +func TestMergeFloat(t *testing.T) { + k := make([]int64, 4, 4) + s := make([]float64, 4, 4) + + a := simd.LoadFloat64x4Slice([]float64{1, 2, 3, 4}) + b := simd.LoadFloat64x4Slice([]float64{4, 2, 3, 1}) + g := a.Greater(b) + g.AsInt64x4().StoreSlice(k) + c := a.Merge(b, g) + + c.StoreSlice(s) + + checkSlices[int64](t, k, []int64{0, 0, 0, -1}) + checkSlices[float64](t, s, []float64{4, 2, 3, 4}) +} + +func TestMergeFloat512(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + + k := make([]int64, 8, 8) + s := make([]float64, 8, 8) + + a := simd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8}) + b := simd.LoadFloat64x8Slice([]float64{8, 7, 6, 5, 4, 2, 3, 1}) + g := a.Greater(b) + g.AsInt64x8().StoreSlice(k) + c := a.Merge(b, g) + d := a.Masked(g) + + checkSlices[int64](t, k, []int64{0, 0, 0, 0, -1, -1, -1, -1}) + + c.StoreSlice(s) + checkSlices[float64](t, s, []float64{8, 7, 6, 5, 5, 6, 7, 8}) + + d.StoreSlice(s) + checkSlices[float64](t, s, []float64{0, 0, 0, 0, 5, 6, 7, 8}) +} + +var ro uint8 = 2 + +func TestRotateAllVariable(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + got := make([]int32, 4) + simd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).RotateAllLeft(ro).StoreSlice(got) + for _, v := range got { + if v != 0b1100 { + t.Errorf("Want 0b1100, got %b", v) + } + } +} + +func TestBroadcastUint32x4(t *testing.T) { + s := make([]uint32, 4, 4) + simd.BroadcastUint32x4(123456789).StoreSlice(s) + checkSlices(t, s, 
[]uint32{123456789, 123456789, 123456789, 123456789})
}

func TestBroadcastFloat32x8(t *testing.T) {
	s := make([]float32, 8, 8)
	simd.BroadcastFloat32x8(123456789).StoreSlice(s)
	checkSlices(t, s, []float32{123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789})
}

func TestBroadcastFloat64x2(t *testing.T) {
	s := make([]float64, 2, 2)
	simd.BroadcastFloat64x2(123456789).StoreSlice(s)
	checkSlices(t, s, []float64{123456789, 123456789})
}

func TestBroadcastUint64x2(t *testing.T) {
	s := make([]uint64, 2, 2)
	simd.BroadcastUint64x2(123456789).StoreSlice(s)
	checkSlices(t, s, []uint64{123456789, 123456789})
}

func TestBroadcastUint16x8(t *testing.T) {
	s := make([]uint16, 8, 8)
	simd.BroadcastUint16x8(12345).StoreSlice(s)
	// FIX: the expected slice previously had only 4 elements, so the upper
	// 4 lanes of the 8-lane broadcast went unchecked.
	checkSlices(t, s, []uint16{12345, 12345, 12345, 12345, 12345, 12345, 12345, 12345})
}

func TestBroadcastInt8x32(t *testing.T) {
	s := make([]int8, 32, 32)
	simd.BroadcastInt8x32(-123).StoreSlice(s)
	checkSlices(t, s, []int8{-123, -123, -123, -123, -123, -123, -123, -123,
		-123, -123, -123, -123, -123, -123, -123, -123,
		-123, -123, -123, -123, -123, -123, -123, -123,
		-123, -123, -123, -123, -123, -123, -123, -123,
	})
}

// TestMaskOpt512 checks that a comparison-produced mask applied via Masked
// zeroes exactly the deselected lanes of an Add.
func TestMaskOpt512(t *testing.T) {
	if !simd.X86.AVX512() {
		t.Skip("Test requires X86.AVX512, not available on this hardware")
		return
	}

	k := make([]int64, 8, 8)
	s := make([]float64, 8, 8)

	a := simd.LoadFloat64x8Slice([]float64{2, 0, 2, 0, 2, 0, 2, 0})
	b := simd.LoadFloat64x8Slice([]float64{1, 1, 1, 1, 1, 1, 1, 1})
	c := simd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8})
	d := simd.LoadFloat64x8Slice([]float64{2, 4, 6, 8, 10, 12, 14, 16})
	g := a.Greater(b)
	e := c.Add(d).Masked(g)
	e.StoreSlice(s)
	g.AsInt64x8().StoreSlice(k)
	checkSlices[int64](t, k, []int64{-1, 0, -1, 0, -1, 0, -1, 0})
	checkSlices[float64](t, s, []float64{3, 0, 9, 0, 15, 0, 21, 0})
}

// flattenedTranspose transposes x and y, regarded as a pair of 2x2
// matrices, but then
flattens the rows in order, i.e +// x: ABCD ==> a: A1B2 +// y: 1234 b: C3D4 +func flattenedTranspose(x, y simd.Int32x4) (a, b simd.Int32x4) { + return x.InterleaveLo(y), x.InterleaveHi(y) +} + +func TestFlattenedTranspose(t *testing.T) { + r := make([]int32, 4, 4) + s := make([]int32, 4, 4) + + x := simd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD}) + y := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4}) + a, b := flattenedTranspose(x, y) + + a.StoreSlice(r) + b.StoreSlice(s) + + checkSlices[int32](t, r, []int32{0xA, 1, 0xB, 2}) + checkSlices[int32](t, s, []int32{0xC, 3, 0xD, 4}) + +} + +func TestClearAVXUpperBits(t *testing.T) { + // Test that ClearAVXUpperBits is safe even if there are SIMD values + // alive (although usually one should not do this). + if !simd.X86.AVX2() { + t.Skip("Test requires X86.AVX2, not available on this hardware") + return + } + + r := make([]int64, 4) + s := make([]int64, 4) + + x := simd.LoadInt64x4Slice([]int64{10, 20, 30, 40}) + y := simd.LoadInt64x4Slice([]int64{1, 2, 3, 4}) + + x.Add(y).StoreSlice(r) + simd.ClearAVXUpperBits() + x.Sub(y).StoreSlice(s) + + checkSlices[int64](t, r, []int64{11, 22, 33, 44}) + checkSlices[int64](t, s, []int64{9, 18, 27, 36}) +} + +func TestLeadingZeros(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + + src := []uint64{0b1111, 0} + want := []uint64{60, 64} + got := make([]uint64, 2) + simd.LoadUint64x2Slice(src).LeadingZeros().StoreSlice(got) + for i := range 2 { + if want[i] != got[i] { + t.Errorf("Result incorrect at %d: want %d, got %d", i, want[i], got[i]) + } + } +} + +func TestIsZero(t *testing.T) { + v1 := simd.LoadUint64x2Slice([]uint64{0, 1}) + v2 := simd.LoadUint64x2Slice([]uint64{0, 0}) + if v1.IsZero() { + t.Errorf("Result incorrect, want false, got true") + } + if !v2.IsZero() { + t.Errorf("Result incorrect, want true, got false") + } + if !v1.And(v2).IsZero() { + t.Errorf("Result incorrect, want true, got false") + } + 
if v1.AndNot(v2).IsZero() { + t.Errorf("Result incorrect, want false, got true") + } + if !v2.And(v1).IsZero() { + t.Errorf("Result incorrect, want true, got false") + } + if !v2.AndNot(v1).IsZero() { + t.Errorf("Result incorrect, want true, got false") + } +} + +func TestSelect4FromPairConst(t *testing.T) { + x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3}) + y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7}) + + llll := x.SelectFromPair(0, 1, 2, 3, y) + hhhh := x.SelectFromPair(4, 5, 6, 7, y) + llhh := x.SelectFromPair(0, 1, 6, 7, y) + hhll := x.SelectFromPair(6, 7, 0, 1, y) + + lllh := x.SelectFromPair(0, 1, 2, 7, y) + llhl := x.SelectFromPair(0, 1, 7, 2, y) + lhll := x.SelectFromPair(0, 7, 1, 2, y) + hlll := x.SelectFromPair(7, 0, 1, 2, y) + + hhhl := x.SelectFromPair(4, 5, 6, 0, y) + hhlh := x.SelectFromPair(4, 5, 0, 6, y) + hlhh := x.SelectFromPair(4, 0, 5, 6, y) + lhhh := x.SelectFromPair(0, 4, 5, 6, y) + + lhlh := x.SelectFromPair(0, 4, 1, 5, y) + hlhl := x.SelectFromPair(4, 0, 5, 1, y) + lhhl := x.SelectFromPair(0, 4, 5, 1, y) + hllh := x.SelectFromPair(4, 0, 1, 5, y) + + r := make([]int32, 4, 4) + + foo := func(v simd.Int32x4, a, b, c, d int32) { + v.StoreSlice(r) + checkSlices[int32](t, r, []int32{a, b, c, d}) + } + + foo(llll, 0, 1, 2, 3) + foo(hhhh, 4, 5, 6, 7) + foo(llhh, 0, 1, 6, 7) + foo(hhll, 6, 7, 0, 1) + + foo(lllh, 0, 1, 2, 7) + foo(llhl, 0, 1, 7, 2) + foo(lhll, 0, 7, 1, 2) + foo(hlll, 7, 0, 1, 2) + + foo(hhhl, 4, 5, 6, 0) + foo(hhlh, 4, 5, 0, 6) + foo(hlhh, 4, 0, 5, 6) + foo(lhhh, 0, 4, 5, 6) + + foo(lhlh, 0, 4, 1, 5) + foo(hlhl, 4, 0, 5, 1) + foo(lhhl, 0, 4, 5, 1) + foo(hllh, 4, 0, 1, 5) +} + +//go:noinline +func selectFromPairInt32x4(x simd.Int32x4, a, b, c, d uint8, y simd.Int32x4) simd.Int32x4 { + return x.SelectFromPair(a, b, c, d, y) +} + +func TestSelect4FromPairVar(t *testing.T) { + x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3}) + y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7}) + + llll := selectFromPairInt32x4(x, 0, 1, 2, 3, y) + hhhh := 
selectFromPairInt32x4(x, 4, 5, 6, 7, y) + llhh := selectFromPairInt32x4(x, 0, 1, 6, 7, y) + hhll := selectFromPairInt32x4(x, 6, 7, 0, 1, y) + + lllh := selectFromPairInt32x4(x, 0, 1, 2, 7, y) + llhl := selectFromPairInt32x4(x, 0, 1, 7, 2, y) + lhll := selectFromPairInt32x4(x, 0, 7, 1, 2, y) + hlll := selectFromPairInt32x4(x, 7, 0, 1, 2, y) + + hhhl := selectFromPairInt32x4(x, 4, 5, 6, 0, y) + hhlh := selectFromPairInt32x4(x, 4, 5, 0, 6, y) + hlhh := selectFromPairInt32x4(x, 4, 0, 5, 6, y) + lhhh := selectFromPairInt32x4(x, 0, 4, 5, 6, y) + + lhlh := selectFromPairInt32x4(x, 0, 4, 1, 5, y) + hlhl := selectFromPairInt32x4(x, 4, 0, 5, 1, y) + lhhl := selectFromPairInt32x4(x, 0, 4, 5, 1, y) + hllh := selectFromPairInt32x4(x, 4, 0, 1, 5, y) + + r := make([]int32, 4, 4) + + foo := func(v simd.Int32x4, a, b, c, d int32) { + v.StoreSlice(r) + checkSlices[int32](t, r, []int32{a, b, c, d}) + } + + foo(llll, 0, 1, 2, 3) + foo(hhhh, 4, 5, 6, 7) + foo(llhh, 0, 1, 6, 7) + foo(hhll, 6, 7, 0, 1) + + foo(lllh, 0, 1, 2, 7) + foo(llhl, 0, 1, 7, 2) + foo(lhll, 0, 7, 1, 2) + foo(hlll, 7, 0, 1, 2) + + foo(hhhl, 4, 5, 6, 0) + foo(hhlh, 4, 5, 0, 6) + foo(hlhh, 4, 0, 5, 6) + foo(lhhh, 0, 4, 5, 6) + + foo(lhlh, 0, 4, 1, 5) + foo(hlhl, 4, 0, 5, 1) + foo(lhhl, 0, 4, 5, 1) + foo(hllh, 4, 0, 1, 5) +} + +func TestSelect4FromPairConstGrouped(t *testing.T) { + x := simd.LoadFloat32x8Slice([]float32{0, 1, 2, 3, 10, 11, 12, 13}) + y := simd.LoadFloat32x8Slice([]float32{4, 5, 6, 7, 14, 15, 16, 17}) + + llll := x.SelectFromPairGrouped(0, 1, 2, 3, y) + hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y) + llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y) + hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y) + + lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y) + llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y) + lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y) + hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y) + + hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y) + hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y) + hlhh := 
x.SelectFromPairGrouped(4, 0, 5, 6, y) + lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y) + + lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y) + hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y) + lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y) + hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y) + + r := make([]float32, 8, 8) + + foo := func(v simd.Float32x8, a, b, c, d float32) { + v.StoreSlice(r) + checkSlices[float32](t, r, []float32{a, b, c, d, 10 + a, 10 + b, 10 + c, 10 + d}) + } + + foo(llll, 0, 1, 2, 3) + foo(hhhh, 4, 5, 6, 7) + foo(llhh, 0, 1, 6, 7) + foo(hhll, 6, 7, 0, 1) + + foo(lllh, 0, 1, 2, 7) + foo(llhl, 0, 1, 7, 2) + foo(lhll, 0, 7, 1, 2) + foo(hlll, 7, 0, 1, 2) + + foo(hhhl, 4, 5, 6, 0) + foo(hhlh, 4, 5, 0, 6) + foo(hlhh, 4, 0, 5, 6) + foo(lhhh, 0, 4, 5, 6) + + foo(lhlh, 0, 4, 1, 5) + foo(hlhl, 4, 0, 5, 1) + foo(lhhl, 0, 4, 5, 1) + foo(hllh, 4, 0, 1, 5) +} + +func TestSelectFromPairConstGroupedUint32x16(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + x := simd.LoadUint32x16Slice([]uint32{0, 1, 2, 3, 10, 11, 12, 13, 20, 21, 22, 23, 30, 31, 32, 33}) + y := simd.LoadUint32x16Slice([]uint32{4, 5, 6, 7, 14, 15, 16, 17, 24, 25, 26, 27, 34, 35, 36, 37}) + + llll := x.SelectFromPairGrouped(0, 1, 2, 3, y) + hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y) + llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y) + hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y) + + lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y) + llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y) + lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y) + hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y) + + hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y) + hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y) + hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y) + lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y) + + lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y) + hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y) + lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y) + hllh 
:= x.SelectFromPairGrouped(4, 0, 1, 5, y) + + r := make([]uint32, 16, 16) + + foo := func(v simd.Uint32x16, a, b, c, d uint32) { + v.StoreSlice(r) + checkSlices[uint32](t, r, []uint32{a, b, c, d, + 10 + a, 10 + b, 10 + c, 10 + d, + 20 + a, 20 + b, 20 + c, 20 + d, + 30 + a, 30 + b, 30 + c, 30 + d, + }) + } + + foo(llll, 0, 1, 2, 3) + foo(hhhh, 4, 5, 6, 7) + foo(llhh, 0, 1, 6, 7) + foo(hhll, 6, 7, 0, 1) + + foo(lllh, 0, 1, 2, 7) + foo(llhl, 0, 1, 7, 2) + foo(lhll, 0, 7, 1, 2) + foo(hlll, 7, 0, 1, 2) + + foo(hhhl, 4, 5, 6, 0) + foo(hhlh, 4, 5, 0, 6) + foo(hlhh, 4, 0, 5, 6) + foo(lhhh, 0, 4, 5, 6) + + foo(lhlh, 0, 4, 1, 5) + foo(hlhl, 4, 0, 5, 1) + foo(lhhl, 0, 4, 5, 1) + foo(hllh, 4, 0, 1, 5) +} + +func TestSelect128FromPair(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3}) + y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7}) + + aa := x.Select128FromPair(0, 0, y) + ab := x.Select128FromPair(0, 1, y) + bc := x.Select128FromPair(1, 2, y) + cd := x.Select128FromPair(2, 3, y) + da := x.Select128FromPair(3, 0, y) + dc := x.Select128FromPair(3, 2, y) + + r := make([]uint64, 4, 4) + + foo := func(v simd.Uint64x4, a, b uint64) { + a, b = 2*a, 2*b + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1}) + } + + foo(aa, 0, 0) + foo(ab, 0, 1) + foo(bc, 1, 2) + foo(cd, 2, 3) + foo(da, 3, 0) + foo(dc, 3, 2) +} + +func TestSelect128FromPairError(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3}) + y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7}) + + defer func() { + if r := recover(); r != nil { + t.Logf("Saw expected panic %v", r) + } + }() + _ = x.Select128FromPair(0, 4, y) + + t.Errorf("Should have panicked") +} + +//go:noinline +func select128FromPair(x simd.Uint64x4, lo, hi uint8, y simd.Uint64x4) simd.Uint64x4 { + return x.Select128FromPair(lo, hi, y) +} + +func TestSelect128FromPairVar(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3}) + y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7}) + + aa := 
select128FromPair(x, 0, 0, y) + ab := select128FromPair(x, 0, 1, y) + bc := select128FromPair(x, 1, 2, y) + cd := select128FromPair(x, 2, 3, y) + da := select128FromPair(x, 3, 0, y) + dc := select128FromPair(x, 3, 2, y) + + r := make([]uint64, 4, 4) + + foo := func(v simd.Uint64x4, a, b uint64) { + a, b = 2*a, 2*b + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1}) + } + + foo(aa, 0, 0) + foo(ab, 0, 1) + foo(bc, 1, 2) + foo(cd, 2, 3) + foo(da, 3, 0) + foo(dc, 3, 2) +} + +func TestSelect2FromPairConst(t *testing.T) { + x := simd.LoadUint64x2Slice([]uint64{0, 1}) + y := simd.LoadUint64x2Slice([]uint64{2, 3}) + + ll := x.SelectFromPair(0, 1, y) + hh := x.SelectFromPair(3, 2, y) + lh := x.SelectFromPair(0, 3, y) + hl := x.SelectFromPair(2, 1, y) + + r := make([]uint64, 2, 2) + + foo := func(v simd.Uint64x2, a, b uint64) { + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, b}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func TestSelect2FromPairConstGroupedUint(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 10, 11}) + y := simd.LoadUint64x4Slice([]uint64{2, 3, 12, 13}) + + ll := x.SelectFromPairGrouped(0, 1, y) + hh := x.SelectFromPairGrouped(3, 2, y) + lh := x.SelectFromPairGrouped(0, 3, y) + hl := x.SelectFromPairGrouped(2, 1, y) + + r := make([]uint64, 4, 4) + + foo := func(v simd.Uint64x4, a, b uint64) { + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, b, a + 10, b + 10}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func TestSelect2FromPairConstGroupedFloat(t *testing.T) { + x := simd.LoadFloat64x4Slice([]float64{0, 1, 10, 11}) + y := simd.LoadFloat64x4Slice([]float64{2, 3, 12, 13}) + + ll := x.SelectFromPairGrouped(0, 1, y) + hh := x.SelectFromPairGrouped(3, 2, y) + lh := x.SelectFromPairGrouped(0, 3, y) + hl := x.SelectFromPairGrouped(2, 1, y) + + r := make([]float64, 4, 4) + + foo := func(v simd.Float64x4, a, b float64) { + v.StoreSlice(r) + 
checkSlices[float64](t, r, []float64{a, b, a + 10, b + 10}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func TestSelect2FromPairConstGroupedInt(t *testing.T) { + x := simd.LoadInt64x4Slice([]int64{0, 1, 10, 11}) + y := simd.LoadInt64x4Slice([]int64{2, 3, 12, 13}) + + ll := x.SelectFromPairGrouped(0, 1, y) + hh := x.SelectFromPairGrouped(3, 2, y) + lh := x.SelectFromPairGrouped(0, 3, y) + hl := x.SelectFromPairGrouped(2, 1, y) + + r := make([]int64, 4, 4) + + foo := func(v simd.Int64x4, a, b int64) { + v.StoreSlice(r) + checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func TestSelect2FromPairConstGroupedInt512(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + + x := simd.LoadInt64x8Slice([]int64{0, 1, 10, 11, 20, 21, 30, 31}) + y := simd.LoadInt64x8Slice([]int64{2, 3, 12, 13, 22, 23, 32, 33}) + + ll := x.SelectFromPairGrouped(0, 1, y) + hh := x.SelectFromPairGrouped(3, 2, y) + lh := x.SelectFromPairGrouped(0, 3, y) + hl := x.SelectFromPairGrouped(2, 1, y) + + r := make([]int64, 8, 8) + + foo := func(v simd.Int64x8, a, b int64) { + v.StoreSlice(r) + checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10, a + 20, b + 20, a + 30, b + 30}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func TestString(t *testing.T) { + x := simd.LoadUint32x4Slice([]uint32{0, 1, 2, 3}) + y := simd.LoadInt64x4Slice([]int64{-4, -5, -6, -7}) + z := simd.LoadFloat32x4Slice([]float32{0.5, 1.5, -2.5, 3.5e9}) + w := simd.LoadFloat64x4Slice([]float64{0.5, 1.5, -2.5, 3.5e9}) + + sx := "{0,1,2,3}" + sy := "{-4,-5,-6,-7}" + sz := "{0.5,1.5,-2.5,3.5e+09}" + sw := sz + + if x.String() != sx { + t.Errorf("x=%s wanted %s", x, sx) + } + if y.String() != sy { + t.Errorf("y=%s wanted %s", y, sy) + } + if z.String() != sz { + t.Errorf("z=%s wanted %s", z, sz) + } + if w.String() != sw { + 
t.Errorf("w=%s wanted %s", w, sw) + } + t.Logf("w=%s", w) + t.Logf("x=%s", x) + t.Logf("y=%s", y) + t.Logf("z=%s", z) +} + +// a returns an slice of 16 int32 +func a() []int32 { + return make([]int32, 16, 16) +} + +// applyTo3 returns a 16-element slice of the results of +// applying f to the respective elements of vectors x, y, and z. +func applyTo3(x, y, z simd.Int32x16, f func(x, y, z int32) int32) []int32 { + ax, ay, az := a(), a(), a() + x.StoreSlice(ax) + y.StoreSlice(ay) + z.StoreSlice(az) + + r := a() + for i := range r { + r[i] = f(ax[i], ay[i], az[i]) + } + return r +} + +// applyTo3 returns a 16-element slice of the results of +// applying f to the respective elements of vectors x, y, z, and w. +func applyTo4(x, y, z, w simd.Int32x16, f func(x, y, z, w int32) int32) []int32 { + ax, ay, az, aw := a(), a(), a(), a() + x.StoreSlice(ax) + y.StoreSlice(ay) + z.StoreSlice(az) + w.StoreSlice(aw) + + r := make([]int32, len(ax), len(ax)) + for i := range r { + r[i] = f(ax[i], ay[i], az[i], aw[i]) + } + return r +} + +func TestSelectTernOptInt32x16(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Test requires X86.AVX512, not available on this hardware") + return + } + ax := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1} + ay := []int32{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1} + az := []int32{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1} + aw := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1} + am := []int32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} + + x := simd.LoadInt32x16Slice(ax) + y := simd.LoadInt32x16Slice(ay) + z := simd.LoadInt32x16Slice(az) + w := simd.LoadInt32x16Slice(aw) + m := simd.LoadInt32x16Slice(am) + + foo := func(v simd.Int32x16, s []int32) { + r := make([]int32, 16, 16) + v.StoreSlice(r) + checkSlices[int32](t, r, s) + } + + t0 := w.Xor(y).Xor(z) + ft0 := func(w, y, z int32) int32 { + return w ^ y ^ z + } + foo(t0, applyTo3(w, y, z, ft0)) + + t1 := m.And(w.Xor(y).Xor(z.Not())) + ft1 := func(m, w, y, z int32) 
int32 {
		return m & (w ^ y ^ ^z)
	}
	foo(t1, applyTo4(m, w, y, z, ft1))

	t2 := x.Xor(y).Xor(z).And(x.Xor(y).Xor(z.Not()))
	ft2 := func(x, y, z int32) int32 {
		return (x ^ y ^ z) & (x ^ y ^ ^z)
	}
	foo(t2, applyTo3(x, y, z, ft2))
}

// TestMaskedMerge checks Merge with a mask computed from a Less comparison.
func TestMaskedMerge(t *testing.T) {
	x := simd.LoadInt64x4Slice([]int64{1, 2, 3, 4})
	y := simd.LoadInt64x4Slice([]int64{5, 6, 1, 1})
	z := simd.LoadInt64x4Slice([]int64{-1, -2, -3, -4})
	res := make([]int64, 4)
	expected := []int64{6, 8, -3, -4}
	mask := x.Less(y)
	// NOTE(review): both branches below are identical; if the AVX512 path
	// was meant to exercise a different API or lowering, restore it here.
	if simd.X86.AVX512() {
		x.Add(y).Merge(z, mask).StoreSlice(res)
	} else {
		x.Add(y).Merge(z, mask).StoreSlice(res)
	}
	for i := range 4 {
		if res[i] != expected[i] {
			t.Errorf("got %d wanted %d", res[i], expected[i])
		}
	}
}

// TestDotProductQuadruple checks the quad-widening dot product, both the
// plain form and the form accumulated into an existing vector.
func TestDotProductQuadruple(t *testing.T) {
	if !simd.X86.AVXVNNI() {
		t.Skip("Test requires X86.AVXVNNI, not available on this hardware")
		return
	}
	xd := make([]int8, 16)
	yd := make([]uint8, 16)
	zd := make([]int32, 4)
	wanted1 := make([]int32, 4)
	wanted2 := make([]int32, 4)
	res1 := make([]int32, 4)
	res2 := make([]int32, 4)
	for i := range 4 {
		xd[i] = 5
		yd[i] = 6
		zd[i] = 3
		wanted1[i] = 30
		// FIX: the accumulated form adds z (3) on top of the plain dot
		// product, so its expected value is 33, not 30.
		wanted2[i] = 33
	}
	x := simd.LoadInt8x16Slice(xd)
	y := simd.LoadUint8x16Slice(yd)
	z := simd.LoadInt32x4Slice(zd)
	x.DotProductQuadruple(y).StoreSlice(res1)
	// FIX: this result was previously stored into res1, clobbering the
	// first result and leaving res2 (checked below) all zero.
	x.DotProductQuadruple(y).Add(z).StoreSlice(res2)
	for i := range 4 {
		if res1[i] != wanted1[i] {
			t.Errorf("got %d wanted %d", res1[i], wanted1[i])
		}
		if res2[i] != wanted2[i] {
			t.Errorf("got %d wanted %d", res2[i], wanted2[i])
		}
	}
}

// TestPermuteScalars checks an immediate-index lane rotation of Int32x4.
func TestPermuteScalars(t *testing.T) {
	x := []int32{11, 12, 13, 14}
	want := []int32{12, 13, 14, 11}
	got := make([]int32, 4)
	simd.LoadInt32x4Slice(x).PermuteScalars(1, 2, 3, 0).StoreSlice(got)
	for i := range 4 {
		if want[i] != got[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
		}
	}
}

// TestPermuteScalarsGrouped checks the same rotation applied per 128-bit group.
func TestPermuteScalarsGrouped(t *testing.T) {
	x :=
[]int32{11, 12, 13, 14, 21, 22, 23, 24} + want := []int32{12, 13, 14, 11, 22, 23, 24, 21} + got := make([]int32, 8) + simd.LoadInt32x8Slice(x).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(got) + for i := range 8 { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsHi(t *testing.T) { + x := []int16{-1, -2, -3, -4, 11, 12, 13, 14} + want := []int16{-1, -2, -3, -4, 12, 13, 14, 11} + got := make([]int16, len(x)) + simd.LoadInt16x8Slice(x).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(got) + for i := range got { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsLo(t *testing.T) { + x := []int16{11, 12, 13, 14, 4, 5, 6, 7} + want := []int16{12, 13, 14, 11, 4, 5, 6, 7} + got := make([]int16, len(x)) + simd.LoadInt16x8Slice(x).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(got) + for i := range got { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsHiGrouped(t *testing.T) { + x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114} + want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111} + got := make([]int16, len(x)) + simd.LoadInt16x16Slice(x).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(got) + for i := range got { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsLoGrouped(t *testing.T) { + x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17} + want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17} + got := make([]int16, len(x)) + simd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got) + for i := range got { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, 
want=%d, got=%d", i, want[i], got[i]) + } + } +} diff --git a/src/simd/internal/simd_test/simulation_helpers_test.go b/src/simd/internal/simd_test/simulation_helpers_test.go new file mode 100644 index 0000000000..2f040ffb3e --- /dev/null +++ b/src/simd/internal/simd_test/simulation_helpers_test.go @@ -0,0 +1,274 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && amd64 + +package simd_test + +import ( + "math" +) + +func less[T number](x, y T) bool { + return x < y +} +func lessEqual[T number](x, y T) bool { + return x <= y +} +func greater[T number](x, y T) bool { + return x > y +} +func greaterEqual[T number](x, y T) bool { + return x >= y +} +func equal[T number](x, y T) bool { + return x == y +} +func notEqual[T number](x, y T) bool { + return x != y +} + +func abs[T number](x T) T { + // TODO this will need a non-standard FP-equality test. + if x == 0 { // true if x is -0. 
// Scalar unary models for the SIMD rounding/arithmetic methods. Float variants
// round-trip through float64 via the math package.

func ceil[T float](x T) T {
	return T(math.Ceil(float64(x)))
}
func floor[T float](x T) T {
	return T(math.Floor(float64(x)))
}
func not[T integer](x T) T {
	return ^x
}
func round[T float](x T) T {
	return T(math.RoundToEven(float64(x)))
}
func sqrt[T float](x T) T {
	return T(math.Sqrt(float64(x)))
}
func trunc[T float](x T) T {
	return T(math.Trunc(float64(x)))
}

// Scalar binary models for the SIMD elementwise binary methods.

func add[T number](x, y T) T {
	return x + y
}

func sub[T number](x, y T) T {
	return x - y
}

func max_[T number](x, y T) T { // "max" lands in infinite recursion
	return max(x, y)
}

func min_[T number](x, y T) T { // "min" lands in infinite recursion
	return min(x, y)
}

// Also mulLow for integers
func mul[T number](x, y T) T {
	return x * y
}

func div[T number](x, y T) T {
	return x / y
}

func and[T integer](x, y T) T {
	return x & y
}

func andNotI[T integer](x, y T) T {
	return x & ^y // order corrected to match expectations
}

func orI[T integer](x, y T) T {
	return x | y
}

func xorI[T integer](x, y T) T {
	return x ^ y
}

// ima is the scalar model of integer multiply-add: x*y + z.
func ima[T integer](x, y, z T) T {
	return x*y + z
}

// fma is the scalar model of fused multiply-add, computed in float64.
func fma[T float](x, y, z T) T {
	return T(math.FMA(float64(x), float64(y), float64(z)))
}

// Conversion models for the SIMD element-type conversions.

func toUint8[T number](x T) uint8 {
	return uint8(x)
}

func toUint16[T number](x T) uint16 {
	return uint16(x)
}

func toUint64[T number](x T) uint64 {
	return uint64(x)
}

// toUint32 saturates out-of-range or NaN float inputs to MaxUint32, matching
// the hardware conversion's behavior rather than Go's conversion.
func toUint32[T number](x T) uint32 {
	switch y := (any(x)).(type) {
	case float32:
		if y < 0 || y > float32(math.MaxUint32) || y != y {
			return math.MaxUint32
		}
	case float64:
		if y < 0 || y > float64(math.MaxUint32) || y != y {
			return math.MaxUint32
		}
	}
	return uint32(x)
}

func toInt8[T number](x T) int8 {
	return int8(x)
}

func toInt16[T number](x T) int16 {
	return int16(x)
}

func toInt32[T number](x T) int32 {
	return int32(x)
}

func toInt64[T number](x T) int64 {
	return int64(x)
}

func toFloat32[T number](x T) float32 {
	return float32(x)
}

func toFloat64[T number](x T) float64 {
	return float64(x)
}

// ceilResidueForPrecision returns a function computing x minus x rounded up
// at 2^-i precision; i scales the fixed-point factor f = 2^i.
func ceilResidueForPrecision[T float](i int) func(T) T {
	f := 1.0
	for i > 0 {
		f *= 2
		i--
	}
	return func(x T) T {
		y := float64(x)
		if math.IsInf(float64(x*T(f)), 0) {
			return 0
		}
		// TODO sort out the rounding issues when T === float32
		return T(y - math.Ceil(y*f)/f)
	}
}

// Slice versions of all these elementwise operations

func addSlice[T number](x, y []T) []T {
	return map2[T](add)(x, y)
}

func subSlice[T number](x, y []T) []T {
	return map2[T](sub)(x, y)
}

func maxSlice[T number](x, y []T) []T {
	return map2[T](max_)(x, y)
}

func minSlice[T number](x, y []T) []T {
	return map2[T](min_)(x, y)
}

// mulLow for integers
func mulSlice[T number](x, y []T) []T {
	return map2[T](mul)(x, y)
}

func divSlice[T number](x, y []T) []T {
	return map2[T](div)(x, y)
}

func andSlice[T integer](x, y []T) []T {
	return map2[T](and)(x, y)
}

func andNotSlice[T integer](x, y []T) []T {
	return map2[T](andNotI)(x, y)
}

func orSlice[T integer](x, y []T) []T {
	return map2[T](orI)(x, y)
}

func xorSlice[T integer](x, y []T) []T {
	return map2[T](xorI)(x, y)
}

// Comparison slice wrappers produce lane masks as []int64.

func lessSlice[T number](x, y []T) []int64 {
	return mapCompare[T](less)(x, y)
}

func lessEqualSlice[T number](x, y []T) []int64 {
	return mapCompare[T](lessEqual)(x, y)
}

func greaterSlice[T number](x, y []T) []int64 {
	return mapCompare[T](greater)(x, y)
}

func greaterEqualSlice[T number](x, y []T) []int64 {
	return mapCompare[T](greaterEqual)(x, y)
}

func equalSlice[T number](x, y []T) []int64 {
	return mapCompare[T](equal)(x, y)
}

func notEqualSlice[T number](x, y []T) []int64 {
	return mapCompare[T](notEqual)(x, y)
}

func ceilSlice[T float](x []T) []T {
	return map1[T](ceil)(x)
}

func floorSlice[T float](x []T) []T {
	return map1[T](floor)(x)
}

func notSlice[T integer](x []T) []T {
	return map1[T](not)(x)
}

func roundSlice[T float](x []T) []T {
	return map1[T](round)(x)
}

func sqrtSlice[T float](x []T) []T {
	return map1[T](sqrt)(x)
}

func truncSlice[T float](x []T) []T {
	return map1[T](trunc)(x)
}

func imaSlice[T integer](x, y, z []T) []T {
	return map3[T](ima)(x, y, z)
}

func fmaSlice[T float](x, y, z []T) []T {
	return map3[T](fma)(x, y, z)
}

// ===========================================================================
// file: src/simd/internal/simd_test/slicepart_test.go
// (new file in the original diff; reconstructed below)
// ===========================================================================

// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build goexperiment.simd && amd64

package simd_test

import (
	"simd"
	"testing"
)

// TestSlicePartInt8x16 exercises LoadInt8x16SlicePart via the shared Do
// driver (defined elsewhere in this package).
func TestSlicePartInt8x16(t *testing.T) {
	Do(t, 16, func(a, c []int8) {
		u := simd.LoadInt8x16SlicePart(a)
		u.StoreSlice(c)
	})
}

// TestSlicePartInt8x32 loads progressively shorter prefixes of a and checks
// that the untouched tail lanes read back as zero.
func TestSlicePartInt8x32(t *testing.T) {
	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
	b := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
	for i := 32; i >= 0; i-- {
		u := simd.LoadInt8x32SlicePart(a[:i])
		c := make([]int8, 32, 32)
		u.StoreSlice(c)
		checkSlices(t, c, b)
		if i > 0 {
			b[i-1] = 0
		}
	}
}

// TestSlicePartUint8x16 is the uint8 variant of the partial-load test.
func TestSlicePartUint8x16(t *testing.T) {
	a := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
	b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
	for i := 16; i >= 0; i-- {
		u := simd.LoadUint8x16SlicePart(a[:i])
		c := make([]uint8, 32, 32)
		u.StoreSlice(c)
		checkSlices(t, c, b)
		if i > 0 {
			b[i-1] = 0
		}
	}
}

// TestSlicePartUint8x32 is the 32-lane uint8 variant of the partial-load test
// (this function completes in the next chunk of the original diff; its body
// is fully visible and reproduced here).
func TestSlicePartUint8x32(t *testing.T) {
	a := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
	b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
	for i := 32; i >= 0; i-- {
		u := simd.LoadUint8x32SlicePart(a[:i])
		c := make([]uint8, 32, 32)
		u.StoreSlice(c)
		checkSlices(t, c, b)
		if i > 0 {
			b[i-1] = 0
		}
	}
}
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} + b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} + for i := 32; i >= 0; i-- { + u := simd.LoadUint8x32SlicePart(a[:i]) + c := make([]uint8, 32, 32) + u.StoreSlice(c) + checkSlices(t, c, b) + if i > 0 { + b[i-1] = 0 + } + } +} + +func TestSlicePartInt16x8(t *testing.T) { + a := []int16{1, 2, 3, 4, 5, 6, 7, 8} + b := []int16{1, 2, 3, 4, 5, 6, 7, 8} + for i := 8; i >= 0; i-- { + u := simd.LoadInt16x8SlicePart(a[:i]) + c := make([]int16, 16, 16) + u.StoreSlice(c) + checkSlices(t, c, b) + if i > 0 { + b[i-1] = 0 + } + } +} + +func TestSlicePartInt16x16(t *testing.T) { + a := []int16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + b := []int16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + for i := 16; i >= 0; i-- { + u := simd.LoadInt16x16SlicePart(a[:i]) + c := make([]int16, 16, 16) + u.StoreSlice(c) + checkSlices(t, c, b) + if i > 0 { + b[i-1] = 0 + } + } +} + +func TestSlicesPartStoreInt8x16(t *testing.T) { + a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + b := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + for i := 16; i >= 0; i-- { + v := simd.LoadInt8x16Slice(a) + c := make([]int8, 32, 32) + v.StoreSlicePart(c[:i]) + checkSlices(t, c, b) + if i > 0 { + b[i-1] = 0 + } + } +} + +func TestSlicesPartStoreInt16x8(t *testing.T) { + a := []int16{1, 2, 3, 4, 5, 6, 7, 8} + b := []int16{1, 2, 3, 4, 5, 6, 7, 8} + for i := 8; i >= 0; i-- { + v := simd.LoadInt16x8Slice(a) + c := make([]int16, 32, 32) + v.StoreSlicePart(c[:i]) + checkSlices(t, c, b) + if i > 0 { + b[i-1] = 0 + } + } +} + +func TestSlicesPartStoreInt16x16(t *testing.T) { + a := []int16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + b := []int16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + for i := 16; i >= 0; i-- { + v := 
simd.LoadInt16x16Slice(a) + c := make([]int16, 32, 32) + v.StoreSlicePart(c[:i]) + checkSlices(t, c, b) + if i > 0 { + b[i-1] = 0 + } + } +} + +func TestSlicesPartStoreUint8x16(t *testing.T) { + a := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + for i := 16; i >= 0; i-- { + v := simd.LoadUint8x16Slice(a) + c := make([]uint8, 32, 32) + v.StoreSlicePart(c[:i]) + checkSlices(t, c, b) + if i > 0 { + b[i-1] = 0 + } + } +} + +func TestSlicesPartStoreUint16x16(t *testing.T) { + a := []uint16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + b := []uint16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + for i := 16; i >= 0; i-- { + v := simd.LoadUint16x16Slice(a) + c := make([]uint16, 32, 32) + v.StoreSlicePart(c[:i]) + checkSlices(t, c, b) + if i > 0 { + b[i-1] = 0 + } + } +} + +func TestSlicesPartStoreUint8x32(t *testing.T) { + a := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} + b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} + for i := 32; i >= 0; i-- { + v := simd.LoadUint8x32Slice(a) + c := make([]uint8, 32, 32) + v.StoreSlicePart(c[:i]) + checkSlices(t, c, b) + if i > 0 { + b[i-1] = 0 + } + } +} + +func TestSlicePartInt32(t *testing.T) { + // 32x4 + L := 4 + c := []int32{1, 2, 3, 4, 5, -1, -1, -1, -1} + a := c[:L+1] + for i := range a { + // Test the load first + // e is a partial slice. 
// TestSlicePartUint64 round-trips every suffix of a 5-element slice through a
// Uint64x4: partial load, full store, then partial store into a sentinel
// buffer, checking that nothing past the partial length is disturbed.
func TestSlicePartUint64(t *testing.T) {
	// 64x4
	L := 4
	c := []uint64{1, 2, 3, 4, 5, 86, 86, 86, 86}
	a := c[:L+1]
	for i := range a {
		// Test the load first
		// e is a partial slice.
		e := a[i:]
		v := simd.LoadUint64x4SlicePart(e)
		// d contains what a ought to contain
		d := make([]uint64, L)
		for j := 0; j < len(e) && j < len(d); j++ {
			d[j] = e[j]
		}

		b := make([]uint64, L)
		v.StoreSlice(b)
		// test the load
		checkSlices(t, d, b)

		// Test the store
		f := make([]uint64, L+1)
		for i := range f {
			f[i] = 99 // sentinel: must survive the partial store
		}

		v.StoreSlicePart(f[:len(e)])
		if len(e) < len(b) {
			checkSlices(t, f, b[:len(e)])
		} else {
			checkSlices(t, f, b)
		}
		for i := len(e); i < len(f); i++ {
			if f[i] != 99 {
				t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %d", i, f[i])
			}
		}
	}
}

// TestSlicePartFloat64 is the Float64x2 variant of the partial load/store
// round-trip test.
func TestSlicePartFloat64(t *testing.T) {
	// 64x2
	L := 2
	c := []float64{1, 2, 3, 86, 86, 86, 86}
	a := c[:L+1]
	for i := range a {
		// Test the load first
		// e is a partial slice.
		e := a[i:]
		v := simd.LoadFloat64x2SlicePart(e)
		// d contains what a ought to contain
		d := make([]float64, L)
		for j := 0; j < len(e) && j < len(d); j++ {
			d[j] = e[j]
		}

		b := make([]float64, L)
		v.StoreSlice(b)
		// test the load
		checkSlices(t, d, b)

		// Test the store
		f := make([]float64, L+1)
		for i := range f {
			f[i] = 99
		}

		v.StoreSlicePart(f[:len(e)])
		if len(e) < len(b) {
			checkSlices(t, f, b[:len(e)])
		} else {
			checkSlices(t, f, b)
		}
		for i := len(e); i < len(f); i++ {
			if f[i] != 99 {
				t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", i, f[i])
			}
		}
	}
}

// TestSlicePartFloat32 is the Float32x8 variant of the partial load/store
// round-trip test.
func TestSlicePartFloat32(t *testing.T) {
	// 32x8
	L := 8
	c := []float32{1, 2, 3, 4, 5, 6, 7, 8, 86, 86, 86, 86}
	a := c[:L+1]
	for i := range a {
		// Test the load first
		// e is a partial slice.
		e := a[i:]
		v := simd.LoadFloat32x8SlicePart(e)
		// d contains what a ought to contain
		d := make([]float32, L)
		for j := 0; j < len(e) && j < len(d); j++ {
			d[j] = e[j]
		}

		b := make([]float32, L)
		v.StoreSlice(b)
		// test the load
		checkSlices(t, d, b)

		// Test the store
		f := make([]float32, L+1)
		for i := range f {
			f[i] = 99
		}

		v.StoreSlicePart(f[:len(e)])
		if len(e) < len(b) {
			checkSlices(t, f, b[:len(e)])
		} else {
			checkSlices(t, f, b)
		}
		for i := len(e); i < len(f); i++ {
			if f[i] != 99 {
				t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", i, f[i])
			}
		}
	}
}

// 512-bit load

// TestSlicePartInt64 is the Int64x8 (512-bit) variant; it requires AVX-512
// and is skipped on hardware without it.
func TestSlicePartInt64(t *testing.T) {
	if !simd.X86.AVX512() {
		t.Skip("Test requires X86.AVX512, not available on this hardware")
		return
	}

	L := 8
	c := []int64{1, 2, 3, 4, 5, 6, 7, 8, 86, 86, 86, 86}
	a := c[:L+1]
	for i := range a {
		// Test the load first
		// e is a partial slice.
		e := a[i:]
		v := simd.LoadInt64x8SlicePart(e)
		// d contains what a ought to contain
		d := make([]int64, L)
		for j := 0; j < len(e) && j < len(d); j++ {
			d[j] = e[j]
		}

		b := make([]int64, L)
		v.StoreSlice(b)
		// test the load
		checkSlicesLogInput(t, b, d, 0.0, func() { t.Helper(); t.Logf("Len(e)=%d", len(e)) })

		// Test the store
		f := make([]int64, L+1)
		for i := range f {
			f[i] = 99
		}

		v.StoreSlicePart(f[:len(e)])
		if len(e) < len(b) {
			checkSlices(t, f, b[:len(e)])
		} else {
			checkSlices(t, f, b)
		}
		for i := len(e); i < len(f); i++ {
			if f[i] != 99 {
				t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", i, f[i])
			}
		}
	}
}

// ===========================================================================
// file: src/simd/internal/simd_test/ternary_helpers_test.go
// (new, generated file in the original diff; reconstructed in the next block)
// ===========================================================================
// Code generated by 'go run genfiles.go'; DO NOT EDIT.

//go:build goexperiment.simd

// This file contains functions testing ternary simd methods.
// Each function in this file is specialized for a
// particular simd type <BaseType><Width>x<Count>.
//
// NOTE(review): generated code — do not hand-edit; regenerate via genfiles.go.

package simd_test

import (
	"simd"
	"testing"
)

// testInt8x16Ternary tests the simd ternary method f against the expected behavior generated by want
func testInt8x16Ternary(t *testing.T, f func(_, _, _ simd.Int8x16) simd.Int8x16, want func(_, _, _ []int8) []int8) {
	n := 16
	t.Helper()
	forSliceTriple(t, int8s, n, func(x, y, z []int8) bool {
		t.Helper()
		a := simd.LoadInt8x16Slice(x)
		b := simd.LoadInt8x16Slice(y)
		c := simd.LoadInt8x16Slice(z)
		g := make([]int8, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testInt16x8Ternary tests the simd ternary method f against the expected behavior generated by want
func testInt16x8Ternary(t *testing.T, f func(_, _, _ simd.Int16x8) simd.Int16x8, want func(_, _, _ []int16) []int16) {
	n := 8
	t.Helper()
	forSliceTriple(t, int16s, n, func(x, y, z []int16) bool {
		t.Helper()
		a := simd.LoadInt16x8Slice(x)
		b := simd.LoadInt16x8Slice(y)
		c := simd.LoadInt16x8Slice(z)
		g := make([]int16, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testInt32x4Ternary tests the simd ternary method f against the expected behavior generated by want
func testInt32x4Ternary(t *testing.T, f func(_, _, _ simd.Int32x4) simd.Int32x4, want func(_, _, _ []int32) []int32) {
	n := 4
	t.Helper()
	forSliceTriple(t, int32s, n, func(x, y, z []int32) bool {
		t.Helper()
		a := simd.LoadInt32x4Slice(x)
		b := simd.LoadInt32x4Slice(y)
		c := simd.LoadInt32x4Slice(z)
		g := make([]int32, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testInt64x2Ternary tests the simd ternary method f against the expected behavior generated by want
func testInt64x2Ternary(t *testing.T, f func(_, _, _ simd.Int64x2) simd.Int64x2, want func(_, _, _ []int64) []int64) {
	n := 2
	t.Helper()
	forSliceTriple(t, int64s, n, func(x, y, z []int64) bool {
		t.Helper()
		a := simd.LoadInt64x2Slice(x)
		b := simd.LoadInt64x2Slice(y)
		c := simd.LoadInt64x2Slice(z)
		g := make([]int64, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testUint8x16Ternary tests the simd ternary method f against the expected behavior generated by want
func testUint8x16Ternary(t *testing.T, f func(_, _, _ simd.Uint8x16) simd.Uint8x16, want func(_, _, _ []uint8) []uint8) {
	n := 16
	t.Helper()
	forSliceTriple(t, uint8s, n, func(x, y, z []uint8) bool {
		t.Helper()
		a := simd.LoadUint8x16Slice(x)
		b := simd.LoadUint8x16Slice(y)
		c := simd.LoadUint8x16Slice(z)
		g := make([]uint8, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testUint16x8Ternary tests the simd ternary method f against the expected behavior generated by want
func testUint16x8Ternary(t *testing.T, f func(_, _, _ simd.Uint16x8) simd.Uint16x8, want func(_, _, _ []uint16) []uint16) {
	n := 8
	t.Helper()
	forSliceTriple(t, uint16s, n, func(x, y, z []uint16) bool {
		t.Helper()
		a := simd.LoadUint16x8Slice(x)
		b := simd.LoadUint16x8Slice(y)
		c := simd.LoadUint16x8Slice(z)
		g := make([]uint16, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testUint32x4Ternary tests the simd ternary method f against the expected behavior generated by want
func testUint32x4Ternary(t *testing.T, f func(_, _, _ simd.Uint32x4) simd.Uint32x4, want func(_, _, _ []uint32) []uint32) {
	n := 4
	t.Helper()
	forSliceTriple(t, uint32s, n, func(x, y, z []uint32) bool {
		t.Helper()
		a := simd.LoadUint32x4Slice(x)
		b := simd.LoadUint32x4Slice(y)
		c := simd.LoadUint32x4Slice(z)
		g := make([]uint32, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testUint64x2Ternary tests the simd ternary method f against the expected behavior generated by want
func testUint64x2Ternary(t *testing.T, f func(_, _, _ simd.Uint64x2) simd.Uint64x2, want func(_, _, _ []uint64) []uint64) {
	n := 2
	t.Helper()
	forSliceTriple(t, uint64s, n, func(x, y, z []uint64) bool {
		t.Helper()
		a := simd.LoadUint64x2Slice(x)
		b := simd.LoadUint64x2Slice(y)
		c := simd.LoadUint64x2Slice(z)
		g := make([]uint64, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testFloat32x4Ternary tests the simd ternary method f against the expected behavior generated by want
func testFloat32x4Ternary(t *testing.T, f func(_, _, _ simd.Float32x4) simd.Float32x4, want func(_, _, _ []float32) []float32) {
	n := 4
	t.Helper()
	forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
		t.Helper()
		a := simd.LoadFloat32x4Slice(x)
		b := simd.LoadFloat32x4Slice(y)
		c := simd.LoadFloat32x4Slice(z)
		g := make([]float32, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testFloat64x2Ternary tests the simd ternary method f against the expected behavior generated by want
func testFloat64x2Ternary(t *testing.T, f func(_, _, _ simd.Float64x2) simd.Float64x2, want func(_, _, _ []float64) []float64) {
	n := 2
	t.Helper()
	forSliceTriple(t, float64s, n, func(x, y, z []float64) bool {
		t.Helper()
		a := simd.LoadFloat64x2Slice(x)
		b := simd.LoadFloat64x2Slice(y)
		c := simd.LoadFloat64x2Slice(z)
		g := make([]float64, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testInt8x32Ternary tests the simd ternary method f against the expected behavior generated by want
func testInt8x32Ternary(t *testing.T, f func(_, _, _ simd.Int8x32) simd.Int8x32, want func(_, _, _ []int8) []int8) {
	n := 32
	t.Helper()
	forSliceTriple(t, int8s, n, func(x, y, z []int8) bool {
		t.Helper()
		a := simd.LoadInt8x32Slice(x)
		b := simd.LoadInt8x32Slice(y)
		c := simd.LoadInt8x32Slice(z)
		g := make([]int8, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testInt16x16Ternary tests the simd ternary method f against the expected behavior generated by want
func testInt16x16Ternary(t *testing.T, f func(_, _, _ simd.Int16x16) simd.Int16x16, want func(_, _, _ []int16) []int16) {
	n := 16
	t.Helper()
	forSliceTriple(t, int16s, n, func(x, y, z []int16) bool {
		t.Helper()
		a := simd.LoadInt16x16Slice(x)
		b := simd.LoadInt16x16Slice(y)
		c := simd.LoadInt16x16Slice(z)
		g := make([]int16, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testInt32x8Ternary tests the simd ternary method f against the expected behavior generated by want
func testInt32x8Ternary(t *testing.T, f func(_, _, _ simd.Int32x8) simd.Int32x8, want func(_, _, _ []int32) []int32) {
	n := 8
	t.Helper()
	forSliceTriple(t, int32s, n, func(x, y, z []int32) bool {
		t.Helper()
		a := simd.LoadInt32x8Slice(x)
		b := simd.LoadInt32x8Slice(y)
		c := simd.LoadInt32x8Slice(z)
		g := make([]int32, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testInt64x4Ternary tests the simd ternary method f against the expected behavior generated by want
func testInt64x4Ternary(t *testing.T, f func(_, _, _ simd.Int64x4) simd.Int64x4, want func(_, _, _ []int64) []int64) {
	n := 4
	t.Helper()
	forSliceTriple(t, int64s, n, func(x, y, z []int64) bool {
		t.Helper()
		a := simd.LoadInt64x4Slice(x)
		b := simd.LoadInt64x4Slice(y)
		c := simd.LoadInt64x4Slice(z)
		g := make([]int64, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testUint8x32Ternary tests the simd ternary method f against the expected behavior generated by want
func testUint8x32Ternary(t *testing.T, f func(_, _, _ simd.Uint8x32) simd.Uint8x32, want func(_, _, _ []uint8) []uint8) {
	n := 32
	t.Helper()
	forSliceTriple(t, uint8s, n, func(x, y, z []uint8) bool {
		t.Helper()
		a := simd.LoadUint8x32Slice(x)
		b := simd.LoadUint8x32Slice(y)
		c := simd.LoadUint8x32Slice(z)
		g := make([]uint8, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testUint16x16Ternary tests the simd ternary method f against the expected behavior generated by want
func testUint16x16Ternary(t *testing.T, f func(_, _, _ simd.Uint16x16) simd.Uint16x16, want func(_, _, _ []uint16) []uint16) {
	n := 16
	t.Helper()
	forSliceTriple(t, uint16s, n, func(x, y, z []uint16) bool {
		t.Helper()
		a := simd.LoadUint16x16Slice(x)
		b := simd.LoadUint16x16Slice(y)
		c := simd.LoadUint16x16Slice(z)
		g := make([]uint16, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testUint32x8Ternary tests the simd ternary method f against the expected behavior generated by want
func testUint32x8Ternary(t *testing.T, f func(_, _, _ simd.Uint32x8) simd.Uint32x8, want func(_, _, _ []uint32) []uint32) {
	n := 8
	t.Helper()
	forSliceTriple(t, uint32s, n, func(x, y, z []uint32) bool {
		t.Helper()
		a := simd.LoadUint32x8Slice(x)
		b := simd.LoadUint32x8Slice(y)
		c := simd.LoadUint32x8Slice(z)
		g := make([]uint32, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testUint64x4Ternary tests the simd ternary method f against the expected behavior generated by want
func testUint64x4Ternary(t *testing.T, f func(_, _, _ simd.Uint64x4) simd.Uint64x4, want func(_, _, _ []uint64) []uint64) {
	n := 4
	t.Helper()
	forSliceTriple(t, uint64s, n, func(x, y, z []uint64) bool {
		t.Helper()
		a := simd.LoadUint64x4Slice(x)
		b := simd.LoadUint64x4Slice(y)
		c := simd.LoadUint64x4Slice(z)
		g := make([]uint64, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testFloat32x8Ternary tests the simd ternary method f against the expected behavior generated by want
func testFloat32x8Ternary(t *testing.T, f func(_, _, _ simd.Float32x8) simd.Float32x8, want func(_, _, _ []float32) []float32) {
	n := 8
	t.Helper()
	forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
		t.Helper()
		a := simd.LoadFloat32x8Slice(x)
		b := simd.LoadFloat32x8Slice(y)
		c := simd.LoadFloat32x8Slice(z)
		g := make([]float32, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testFloat64x4Ternary tests the simd ternary method f against the expected behavior generated by want
func testFloat64x4Ternary(t *testing.T, f func(_, _, _ simd.Float64x4) simd.Float64x4, want func(_, _, _ []float64) []float64) {
	n := 4
	t.Helper()
	forSliceTriple(t, float64s, n, func(x, y, z []float64) bool {
		t.Helper()
		a := simd.LoadFloat64x4Slice(x)
		b := simd.LoadFloat64x4Slice(y)
		c := simd.LoadFloat64x4Slice(z)
		g := make([]float64, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testInt8x64Ternary tests the simd ternary method f against the expected behavior generated by want
func testInt8x64Ternary(t *testing.T, f func(_, _, _ simd.Int8x64) simd.Int8x64, want func(_, _, _ []int8) []int8) {
	n := 64
	t.Helper()
	forSliceTriple(t, int8s, n, func(x, y, z []int8) bool {
		t.Helper()
		a := simd.LoadInt8x64Slice(x)
		b := simd.LoadInt8x64Slice(y)
		c := simd.LoadInt8x64Slice(z)
		g := make([]int8, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testInt16x32Ternary tests the simd ternary method f against the expected behavior generated by want
func testInt16x32Ternary(t *testing.T, f func(_, _, _ simd.Int16x32) simd.Int16x32, want func(_, _, _ []int16) []int16) {
	n := 32
	t.Helper()
	forSliceTriple(t, int16s, n, func(x, y, z []int16) bool {
		t.Helper()
		a := simd.LoadInt16x32Slice(x)
		b := simd.LoadInt16x32Slice(y)
		c := simd.LoadInt16x32Slice(z)
		g := make([]int16, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testInt32x16Ternary tests the simd ternary method f against the expected behavior generated by want
func testInt32x16Ternary(t *testing.T, f func(_, _, _ simd.Int32x16) simd.Int32x16, want func(_, _, _ []int32) []int32) {
	n := 16
	t.Helper()
	forSliceTriple(t, int32s, n, func(x, y, z []int32) bool {
		t.Helper()
		a := simd.LoadInt32x16Slice(x)
		b := simd.LoadInt32x16Slice(y)
		c := simd.LoadInt32x16Slice(z)
		g := make([]int32, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testInt64x8Ternary tests the simd ternary method f against the expected behavior generated by want
func testInt64x8Ternary(t *testing.T, f func(_, _, _ simd.Int64x8) simd.Int64x8, want func(_, _, _ []int64) []int64) {
	n := 8
	t.Helper()
	forSliceTriple(t, int64s, n, func(x, y, z []int64) bool {
		t.Helper()
		a := simd.LoadInt64x8Slice(x)
		b := simd.LoadInt64x8Slice(y)
		c := simd.LoadInt64x8Slice(z)
		g := make([]int64, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testUint8x64Ternary tests the simd ternary method f against the expected behavior generated by want
func testUint8x64Ternary(t *testing.T, f func(_, _, _ simd.Uint8x64) simd.Uint8x64, want func(_, _, _ []uint8) []uint8) {
	n := 64
	t.Helper()
	forSliceTriple(t, uint8s, n, func(x, y, z []uint8) bool {
		t.Helper()
		a := simd.LoadUint8x64Slice(x)
		b := simd.LoadUint8x64Slice(y)
		c := simd.LoadUint8x64Slice(z)
		g := make([]uint8, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testUint16x32Ternary tests the simd ternary method f against the expected behavior generated by want
func testUint16x32Ternary(t *testing.T, f func(_, _, _ simd.Uint16x32) simd.Uint16x32, want func(_, _, _ []uint16) []uint16) {
	n := 32
	t.Helper()
	forSliceTriple(t, uint16s, n, func(x, y, z []uint16) bool {
		t.Helper()
		a := simd.LoadUint16x32Slice(x)
		b := simd.LoadUint16x32Slice(y)
		c := simd.LoadUint16x32Slice(z)
		g := make([]uint16, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testUint32x16Ternary tests the simd ternary method f against the expected behavior generated by want
func testUint32x16Ternary(t *testing.T, f func(_, _, _ simd.Uint32x16) simd.Uint32x16, want func(_, _, _ []uint32) []uint32) {
	n := 16
	t.Helper()
	forSliceTriple(t, uint32s, n, func(x, y, z []uint32) bool {
		t.Helper()
		a := simd.LoadUint32x16Slice(x)
		b := simd.LoadUint32x16Slice(y)
		c := simd.LoadUint32x16Slice(z)
		g := make([]uint32, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testUint64x8Ternary tests the simd ternary method f against the expected behavior generated by want
func testUint64x8Ternary(t *testing.T, f func(_, _, _ simd.Uint64x8) simd.Uint64x8, want func(_, _, _ []uint64) []uint64) {
	n := 8
	t.Helper()
	forSliceTriple(t, uint64s, n, func(x, y, z []uint64) bool {
		t.Helper()
		a := simd.LoadUint64x8Slice(x)
		b := simd.LoadUint64x8Slice(y)
		c := simd.LoadUint64x8Slice(z)
		g := make([]uint64, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testFloat32x16Ternary tests the simd ternary method f against the expected behavior generated by want
func testFloat32x16Ternary(t *testing.T, f func(_, _, _ simd.Float32x16) simd.Float32x16, want func(_, _, _ []float32) []float32) {
	n := 16
	t.Helper()
	forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
		t.Helper()
		a := simd.LoadFloat32x16Slice(x)
		b := simd.LoadFloat32x16Slice(y)
		c := simd.LoadFloat32x16Slice(z)
		g := make([]float32, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testFloat64x8Ternary tests the simd ternary method f against the expected behavior generated by want
func testFloat64x8Ternary(t *testing.T, f func(_, _, _ simd.Float64x8) simd.Float64x8, want func(_, _, _ []float64) []float64) {
	n := 8
	t.Helper()
	forSliceTriple(t, float64s, n, func(x, y, z []float64) bool {
		t.Helper()
		a := simd.LoadFloat64x8Slice(x)
		b := simd.LoadFloat64x8Slice(y)
		c := simd.LoadFloat64x8Slice(z)
		g := make([]float64, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testFloat32x4TernaryFlaky tests the simd ternary method f against the expected behavior generated by want,
// but using a flakiness parameter because we haven't exactly figured out how simd floating point works
func testFloat32x4TernaryFlaky(t *testing.T, f func(x, y, z simd.Float32x4) simd.Float32x4, want func(x, y, z []float32) []float32, flakiness float64) {
	n := 4
	t.Helper()
	forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
		t.Helper()
		a := simd.LoadFloat32x4Slice(x)
		b := simd.LoadFloat32x4Slice(y)
		c := simd.LoadFloat32x4Slice(z)
		g := make([]float32, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}

// testFloat32x8TernaryFlaky tests the simd ternary method f against the expected behavior generated by want,
// but using a flakiness parameter because we haven't exactly figured out how simd floating point works
//
// NOTE(review): this function is truncated at the end of the source chunk;
// its body is completed from the generator's template (mirrors
// testFloat32x4TernaryFlaky with n = 8) — TODO confirm against the full file.
func testFloat32x8TernaryFlaky(t *testing.T, f func(x, y, z simd.Float32x8) simd.Float32x8, want func(x, y, z []float32) []float32, flakiness float64) {
	n := 8
	t.Helper()
	forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
		t.Helper()
		a := simd.LoadFloat32x8Slice(x)
		b := simd.LoadFloat32x8Slice(y)
		c := simd.LoadFloat32x8Slice(z)
		g := make([]float32, n)
		f(a, b, c).StoreSlice(g)
		w := want(x, y, z)
		return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
	})
}
*testing.T, f func(x, y, z simd.Float32x8) simd.Float32x8, want func(x, y, z []float32) []float32, flakiness float64) { + n := 8 + t.Helper() + forSliceTriple(t, float32s, n, func(x, y, z []float32) bool { + t.Helper() + a := simd.LoadFloat32x8Slice(x) + b := simd.LoadFloat32x8Slice(y) + c := simd.LoadFloat32x8Slice(z) + g := make([]float32, n) + f(a, b, c).StoreSlice(g) + w := want(x, y, z) + return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) }) + }) +} + +// testFloat32x16TernaryFlaky tests the simd ternary method f against the expected behavior generated by want, +// but using a flakiness parameter because we haven't exactly figured out how simd floating point works +func testFloat32x16TernaryFlaky(t *testing.T, f func(x, y, z simd.Float32x16) simd.Float32x16, want func(x, y, z []float32) []float32, flakiness float64) { + n := 16 + t.Helper() + forSliceTriple(t, float32s, n, func(x, y, z []float32) bool { + t.Helper() + a := simd.LoadFloat32x16Slice(x) + b := simd.LoadFloat32x16Slice(y) + c := simd.LoadFloat32x16Slice(z) + g := make([]float32, n) + f(a, b, c).StoreSlice(g) + w := want(x, y, z) + return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) }) + }) +} diff --git a/src/simd/internal/simd_test/ternary_test.go b/src/simd/internal/simd_test/ternary_test.go new file mode 100644 index 0000000000..6b563cef75 --- /dev/null +++ b/src/simd/internal/simd_test/ternary_test.go @@ -0,0 +1,23 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+//go:build goexperiment.simd && amd64
+
+package simd_test
+
+import (
+	"simd"
+	"testing"
+)
+
+// TestFMA checks the MulAdd (fused multiply-add) methods of the
+// float32 and float64 vector shapes against the scalar fmaSlice
+// reference (defined elsewhere in this package). It runs only when
+// AVX512 is available. The float32 cases go through the *Flaky
+// helpers with a 0.001 tolerance because the exact SIMD floating
+// point rounding behavior has not been pinned down yet (see the
+// comments on the *TernaryFlaky helpers).
+func TestFMA(t *testing.T) {
+	if simd.X86.AVX512() {
+		testFloat32x4TernaryFlaky(t, simd.Float32x4.MulAdd, fmaSlice[float32], 0.001)
+		testFloat32x8TernaryFlaky(t, simd.Float32x8.MulAdd, fmaSlice[float32], 0.001)
+		testFloat32x16TernaryFlaky(t, simd.Float32x16.MulAdd, fmaSlice[float32], 0.001)
+		testFloat64x2Ternary(t, simd.Float64x2.MulAdd, fmaSlice[float64])
+		testFloat64x4Ternary(t, simd.Float64x4.MulAdd, fmaSlice[float64])
+		testFloat64x8Ternary(t, simd.Float64x8.MulAdd, fmaSlice[float64])
+	}
+}
diff --git a/src/simd/internal/simd_test/transpose_test.go b/src/simd/internal/simd_test/transpose_test.go
new file mode 100644
index 0000000000..cdf818e997
--- /dev/null
+++ b/src/simd/internal/simd_test/transpose_test.go
@@ -0,0 +1,868 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package simd_test
+
+import (
+	"fmt"
+	"simd"
+	"testing"
+)
+
+// Transpose4 returns the transpose of the 4x4 int32 matrix whose rows
+// are a0..a3: element j of result row i is element i of input row j.
+// It interleaves adjacent row pairs and then recombines the halves
+// with SelectFromPair, per the diagrams below.
+func Transpose4(a0, a1, a2, a3 simd.Int32x4) (b0, b1, b2, b3 simd.Int32x4) {
+	t0, t1 := a0.InterleaveLo(a1), a0.InterleaveHi(a1)
+	t2, t3 := a2.InterleaveLo(a3), a2.InterleaveHi(a3)
+
+	// a0: ABCD ==> t0: A1B2
+	// a1: 1234     t1: C3D4
+	// a2: EFGH     t2: E5F6
+	// a3: 5678     t3: G7H8
+
+	// need
+	// A1E5
+	// B2F6
+	// C3G7
+	// D4H8
+
+	b0 = t0.SelectFromPair(0, 1, 4, 5, t2) // lower elements from each
+	b1 = t0.SelectFromPair(2, 3, 6, 7, t2) // upper elements from each
+	b2 = t1.SelectFromPair(0, 1, 4, 5, t3) // lowers
+	b3 = t1.SelectFromPair(2, 3, 6, 7, t3) // uppers
+	return
+}
+
+// Transpose8 returns the transpose of the 8x8 int32 matrix whose rows
+// are a0..a7. The Grouped operations act within each 128-bit half, so
+// the first stage transposes the two 4x4 sub-blocks of each half, and
+// the Select128FromPair stage then swaps the cross-diagonal 128-bit
+// halves between a0-a3 and a4-a7.
+func Transpose8(a0, a1, a2, a3, a4, a5, a6, a7 simd.Int32x8) (b0, b1, b2, b3, b4, b5, b6, b7 simd.Int32x8) {
+	t0, t1 := a0.InterleaveLoGrouped(a1), a0.InterleaveHiGrouped(a1)
+	t2, t3 := a2.InterleaveLoGrouped(a3), a2.InterleaveHiGrouped(a3)
+	t4, t5 := a4.InterleaveLoGrouped(a5), a4.InterleaveHiGrouped(a5)
+	t6, t7 := a6.InterleaveLoGrouped(a7), a6.InterleaveHiGrouped(a7)
+
+	// a0: ABCD ==> t0: A1B2
+	// a1: 1234     t1: C3D4
+	// a2: EFGH     t2: E5F6
+	// a3: 5678     t3: G7H8
+
+	// need
+	// A1E5
+	// B2F6
+	// C3G7
+	// D4H8
+
+	a0 = t0.SelectFromPairGrouped(0, 1, 4, 5, t2) // lower elements from each
+	a1 = t0.SelectFromPairGrouped(2, 3, 6, 7, t2) // upper elements from each
+	a2 = t1.SelectFromPairGrouped(0, 1, 4, 5, t3) // lowers
+	a3 = t1.SelectFromPairGrouped(2, 3, 6, 7, t3) // uppers
+
+	a4 = t4.SelectFromPairGrouped(0, 1, 4, 5, t6) // lower elements from each
+	a5 = t4.SelectFromPairGrouped(2, 3, 6, 7, t6) // upper elements from each
+	a6 = t5.SelectFromPairGrouped(0, 1, 4, 5, t7) // lowers
+	a7 = t5.SelectFromPairGrouped(2, 3, 6, 7, t7) // uppers
+
+	// next need to swap the upper 128 bits of a0-a3 with the lower 128 bits of a4-a7
+
+	b0 = a0.Select128FromPair(0, 2, a4)
+	b4 = a0.Select128FromPair(1, 3, a4)
+
+	b1 = a1.Select128FromPair(0, 2, a5)
+	b5 = a1.Select128FromPair(1, 3, a5)
+
+	b2 = a2.Select128FromPair(0, 2, a6)
+	b6 = a2.Select128FromPair(1, 3, a6)
+
+	b3 = a3.Select128FromPair(0, 2, a7)
+	b7 = a3.Select128FromPair(1, 3, a7)
+
+	return
+}
+
+// TestTranspose4 checks Transpose4 on one fixed 4x4 matrix whose
+// elements are all distinct.
+func TestTranspose4(t *testing.T) {
+	r := make([]int32, 16, 16)
+
+	w := simd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD})
+	x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
+	y := simd.LoadInt32x4Slice([]int32{0xE, 0xF, 0x10, 0x11})
+	z := simd.LoadInt32x4Slice([]int32{5, 6, 7, 8})
+	a, b, c, d := Transpose4(w, x, y, z)
+
+	a.StoreSlice(r[0:])
+	b.StoreSlice(r[4:])
+	c.StoreSlice(r[8:])
+	d.StoreSlice(r[12:])
+
+	checkSlices[int32](t, r, []int32{
+		0xA, 1, 0xE, 5,
+		0xB, 2, 0xF, 6,
+		0xC, 3, 0x10, 7,
+		0xD, 4, 0x11, 8,
+	})
+
+}
+
+// TestTranspose8 checks Transpose8 on the 8x8 matrix holding 1..64 in
+// row-major order; result row z must then be z, z+8, z+16, ..., z+56.
+func TestTranspose8(t *testing.T) {
+	m := make([]int32, 8)
+
+	a := []int32{}
+	for i := int32(1); i <= 64; i++ {
+		a = append(a, i)
+	}
+
+	p := simd.LoadInt32x8Slice(a[0:])
+	q := simd.LoadInt32x8Slice(a[8:])
+	r := simd.LoadInt32x8Slice(a[16:])
+	s := simd.LoadInt32x8Slice(a[24:])
+
+	w := simd.LoadInt32x8Slice(a[32:])
+	x := simd.LoadInt32x8Slice(a[40:])
+	y := simd.LoadInt32x8Slice(a[48:])
+	z := simd.LoadInt32x8Slice(a[56:])
+
+	p, q, r, s, w, x, y, z = Transpose8(p, q, r, s, w, x, y, z)
+
+	// foo stores a row and checks it holds z, z+8, ..., z+56.
+	foo := func(a simd.Int32x8, z int32) {
+		a.StoreSlice(m)
+		var o []int32
+		for i := int32(0); i < 8; i++ {
+			o = append(o, z+i*8)
+		}
+		checkSlices[int32](t, m, o)
+	}
+
+	foo(p, 1)
+	foo(q, 2)
+	foo(r, 3)
+	foo(s, 4)
+	foo(w, 5)
+	foo(x, 6)
+	foo(y, 7)
+	foo(z, 8)
+
+}
+
+// BIG is the edge size of the benchmark matrix; 20000x20000 int32
+// (~1.6 GB) is far larger than any cache level.
+const BIG = 20000
+
+var bigMatrix [][]int32
+
+// 9x9 is smallest matrix with diagonal and off-diagonal tiles, plus a fringe.
+var nineMatrix [][]int32
+
+var thirtyMatrix [][]int32
+
+// fill initializes the square matrix m so element (i,j) holds
+// int32(-i<<16 + j): every cell is distinct and encodes its own
+// coordinates, which lets isTransposed verify a transpose exactly.
+func fill(m [][]int32) {
+	for i := range m {
+		m[i] = make([]int32, len(m))
+		for j := range m[i] {
+			m[i][j] = int32(-i<<16 + j)
+		}
+	}
+}
+
+// isTransposed reports whether m is the transpose of a fill-ed
+// matrix, i.e. element (i,j) now holds the code that fill placed at
+// (j,i).
+func isTransposed(m [][]int32) bool {
+	for i, mi := range m {
+		for j, a := range mi {
+			if a != int32(-j<<16+i) {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// dupe returns a deep copy of the square matrix m, so each test or
+// benchmark iteration can start from unmodified input.
+func dupe(m [][]int32) [][]int32 {
+	n := len(m)
+	p := make([][]int32, n, n)
+	for i := range p {
+		t := make([]int32, n)
+		for j, a := range m[i] {
+			t[j] = a
+		}
+		p[i] = t
+	}
+	return p
+}
+
+// init allocates and fills the shared test and benchmark matrices.
+func init() {
+	bigMatrix = make([][]int32, BIG, BIG)
+	fill(bigMatrix)
+	nineMatrix = make([][]int32, 9, 9)
+	fill(nineMatrix)
+	thirtyMatrix = make([][]int32, 30, 30)
+	fill(thirtyMatrix)
+}
+
+func BenchmarkPlainTranspose(b *testing.B) {
+	d := dupe(bigMatrix)
+	for b.Loop() {
+		transposePlain(d)
+	}
+}
+
+func BenchmarkTiled4Transpose(b *testing.B) {
+	d := dupe(bigMatrix)
+	for b.Loop() {
+		transposeTiled4(d)
+	}
+}
+
+func BenchmarkTiled8Transpose(b *testing.B) {
+	d := dupe(bigMatrix)
+	for b.Loop() {
+		transposeTiled8(d)
+	}
+}
+
+func Benchmark2BlockedTranspose(b *testing.B) {
+	d := dupe(bigMatrix)
+	for b.Loop() {
+		transpose2Blocked(d)
+	}
+}
+func Benchmark3BlockedTranspose(b *testing.B) {
+	d := dupe(bigMatrix)
+	for b.Loop() {
+		transpose3Blocked(d)
+	}
+}
+func Benchmark4BlockedTranspose(b *testing.B) {
+	d := dupe(bigMatrix)
+	for b.Loop() {
+		transpose4Blocked(d)
+	}
+}
+func Benchmark5aBlockedTranspose(b *testing.B) {
+	d := dupe(bigMatrix)
+	for b.Loop() {
+		transpose5aBlocked(d)
+	}
+}
+
+func Benchmark5bBlockedTranspose(b *testing.B) {
+	d := dupe(bigMatrix)
+	for b.Loop() {
+		transpose5bBlocked(d)
+	}
+}
+
+// transposePlain is the baseline element-at-a-time in-place
+// transpose that the blocked and tiled variants are benchmarked
+// against.
+func transposePlain(m [][]int32) {
+	for i := range m {
+		for j := 0; j < i; j++ {
+			t := m[i][j]
+			m[i][j] = m[j][i]
+			m[j][i] = t
+		}
+	}
+}
+
+func TestTransposePlain(t *testing.T) {
+	d := dupe(nineMatrix)
+	t.Logf("Input matrix is %s", formatMatrix(d))
+	transposePlain(d)
+	if !isTransposed(d) {
+		t.Errorf("d is not transposed, d = %s", formatMatrix(d))
+	} else {
+		t.Logf("Transposed plain matrix = %s", formatMatrix(d))
+	}
+}
+
+func TestTranspose2Blocked(t *testing.T) {
+	d := dupe(nineMatrix)
+	t.Logf("Input matrix is %s", formatMatrix(d))
+	transpose2Blocked(d)
+	if !isTransposed(d) {
+		t.Errorf("d is not transposed, d = %s", formatMatrix(d))
+	}
+}
+
+func TestTranspose3Blocked(t *testing.T) {
+	d := dupe(nineMatrix)
+	t.Logf("Input matrix is %s", formatMatrix(d))
+	transpose3Blocked(d)
+	if !isTransposed(d) {
+		t.Errorf("d is not transposed, d = %s", formatMatrix(d))
+	}
+}
+
+func TestTranspose4Blocked(t *testing.T) {
+	d := dupe(nineMatrix)
+	t.Logf("Input matrix is %s", formatMatrix(d))
+	transpose4Blocked(d)
+	if !isTransposed(d) {
+		t.Errorf("d is not transposed, d = %s", formatMatrix(d))
+	}
+}
+
+// NOTE(review): nineMatrix is 9x9, so with B == 5 the blocked outer
+// loop runs only for i == 0 and the across-diagonal inner loops of
+// transpose5aBlocked/transpose5bBlocked are never exercised by these
+// two tests — only the diagonal and fringe paths are. Consider also
+// running them against thirtyMatrix.
+func TestTranspose5aBlocked(t *testing.T) {
+	d := dupe(nineMatrix)
+	t.Logf("Input matrix is %s", formatMatrix(d))
+	transpose5aBlocked(d)
+	if !isTransposed(d) {
+		t.Errorf("d is not transposed, d = %s", formatMatrix(d))
+	}
+}
+
+func TestTranspose5bBlocked(t *testing.T) {
+	d := dupe(nineMatrix)
+	t.Logf("Input matrix is %s", formatMatrix(d))
+	transpose5bBlocked(d)
+	if !isTransposed(d) {
+		t.Errorf("d is not transposed, d = %s", formatMatrix(d))
+	}
+}
+
+func TestTransposeTiled4(t *testing.T) {
+	d := dupe(nineMatrix)
+	transposeTiled4(d)
+	if !isTransposed(d) {
+		t.Errorf("d is not transposed, d = %v", d)
+	}
+}
+
+// TestTransposeTiled8 uses the 30x30 matrix so that the 8x8-tiled
+// code runs diagonal tiles, off-diagonal tiles, and the fringe.
+func TestTransposeTiled8(t *testing.T) {
+	d := dupe(thirtyMatrix)
+	transposeTiled8(d)
+	if !isTransposed(d) {
+		t.Errorf("d is not transposed, d = %v", d)
+	}
+}
+
+// formatMatrix renders m for t.Logf, splitting each fill-encoded
+// cell into its high and low 16-bit halves.
+func formatMatrix(m [][]int32) string {
+	s := ""
+	for _, mi := range m {
+		s += "\n["
+		for _, t := range mi {
+			h := t >> 16
+			l := t & 0xffff
+			s += fmt.Sprintf(" (%d %d)", h, l)
+		}
+		s += " ]"
+	}
+	return s
+}
+
+// transpose2Blocked transposes m in place working in 2x2 blocks,
+// with a scalar fringe for the rows/columns past the last full
+// block. The statement ordering in this family of functions is
+// deliberate; they exist to be benchmarked against each other.
+func transpose2Blocked(m [][]int32) {
+	const B = 2
+	N := len(m)
+	i := 0
+	for ; i <= len(m)-B; i += B {
+		r0, r1 := m[i], m[i+1]
+		if len(r0) < N || len(r1) < N {
+			panic("Early bounds check failure")
+		}
+		// transpose around diagonal
+		d01, d10 := r0[i+1], r1[i]
+		r0[i+1], r1[i] = d10, d01
+
+		// transpose across diagonal
+		j := 0
+		for ; j < i; j += B {
+			a0, a1 := m[j], m[j+1]
+
+			b00, b01 := a0[i], a0[i+1]
+			b10, b11 := a1[i], a1[i+1]
+
+			a0[i], a0[i+1] = r0[j], r1[j]
+			a1[i], a1[i+1] = r0[j+1], r1[j+1]
+
+			r0[j], r0[j+1] = b00, b10
+			r1[j], r1[j+1] = b01, b11
+		}
+	}
+
+	// Do the fringe
+	for ; i < len(m); i++ {
+		j := 0
+		r := m[i]
+		for ; j < i; j++ {
+			t := r[j]
+			r[j] = m[j][i]
+			m[j][i] = t
+		}
+	}
+}
+
+// transpose3Blocked is transpose2Blocked with 3x3 blocks.
+// NOTE(review): the early bounds check here tests only r0 and r1,
+// unlike the 4- and 5-row variants which check every row.
+func transpose3Blocked(m [][]int32) {
+	const B = 3
+	N := len(m)
+	i := 0
+	for ; i <= len(m)-B; i += B {
+		r0, r1, r2 := m[i], m[i+1], m[i+2]
+		if len(r0) < N || len(r1) < N {
+			panic("Early bounds check failure")
+		}
+		// transpose around diagonal
+		d01, d10 := r0[i+1], r1[i]
+		d02, d20 := r0[i+2], r2[i]
+		d12, d21 := r1[i+2], r2[i+1]
+
+		r0[i+1], r1[i] = d10, d01
+		r0[i+2], r2[i] = d20, d02
+		r1[i+2], r2[i+1] = d21, d12
+
+		// transpose across diagonal
+		j := 0
+		for ; j < i; j += B {
+			a0, a1, a2 := m[j], m[j+1], m[j+2]
+
+			b00, b01, b02 := a0[i], a0[i+1], a0[i+2]
+			b10, b11, b12 := a1[i], a1[i+1], a1[i+2]
+			b20, b21, b22 := a2[i], a2[i+1], a2[i+2]
+
+			a0[i], a0[i+1], a0[i+2] = r0[j], r1[j], r2[j]
+			a1[i], a1[i+1], a1[i+2] = r0[j+1], r1[j+1], r2[j+1]
+			a2[i], a2[i+1], a2[i+2] = r0[j+2], r1[j+2], r2[j+2]
+
+			r0[j], r0[j+1], r0[j+2] = b00, b10, b20
+			r1[j], r1[j+1], r1[j+2] = b01, b11, b21
+			r2[j], r2[j+1], r2[j+2] = b02, b12, b22
+		}
+	}
+
+	// Do the fringe
+	for ; i < len(m); i++ {
+		j := 0
+		r := m[i]
+		for ; j < i; j++ {
+			t := r[j]
+			r[j] = m[j][i]
+			m[j][i] = t
+		}
+	}
+}
+
+// transpose4Blocked is transpose2Blocked with 4x4 blocks.
+func transpose4Blocked(m [][]int32) {
+	const B = 4
+	N := len(m)
+	i := 0
+	for ; i <= len(m)-B; i += B {
+		r0, r1, r2, r3 := m[i], m[i+1], m[i+2], m[i+3]
+		if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N {
+			panic("Early bounds check failure")
+		}
+		// transpose around diagonal
+		d01, d10 := r0[i+1], r1[i]
+		d02, d20 := r0[i+2], r2[i]
+		d03, d30 := r0[i+3], r3[i]
+		d12, d21 := r1[i+2], r2[i+1]
+		d13, d31 := r1[i+3], r3[i+1]
+		d23, d32 := r2[i+3], r3[i+2]
+
+		r0[i+1], r1[i] = d10, d01
+		r0[i+2], r2[i] = d20, d02
+		r0[i+3], r3[i] = d30, d03
+		r1[i+2], r2[i+1] = d21, d12
+		r1[i+3], r3[i+1] = d31, d13
+		r2[i+3], r3[i+2] = d32, d23
+
+		// transpose across diagonal
+		j := 0
+		for ; j < i; j += B {
+			a0, a1, a2, a3 := m[j], m[j+1], m[j+2], m[j+3]
+
+			b00, b01, b02, b03 := a0[i], a0[i+1], a0[i+2], a0[i+3]
+			b10, b11, b12, b13 := a1[i], a1[i+1], a1[i+2], a1[i+3]
+			b20, b21, b22, b23 := a2[i], a2[i+1], a2[i+2], a2[i+3]
+			b30, b31, b32, b33 := a3[i], a3[i+1], a3[i+2], a3[i+3]
+
+			a0[i], a0[i+1], a0[i+2], a0[i+3] = r0[j], r1[j], r2[j], r3[j]
+			a1[i], a1[i+1], a1[i+2], a1[i+3] = r0[j+1], r1[j+1], r2[j+1], r3[j+1]
+			a2[i], a2[i+1], a2[i+2], a2[i+3] = r0[j+2], r1[j+2], r2[j+2], r3[j+2]
+			a3[i], a3[i+1], a3[i+2], a3[i+3] = r0[j+3], r1[j+3], r2[j+3], r3[j+3]
+
+			r0[j], r0[j+1], r0[j+2], r0[j+3] = b00, b10, b20, b30
+			r1[j], r1[j+1], r1[j+2], r1[j+3] = b01, b11, b21, b31
+			r2[j], r2[j+1], r2[j+2], r2[j+3] = b02, b12, b22, b32
+			r3[j], r3[j+1], r3[j+2], r3[j+3] = b03, b13, b23, b33
+		}
+	}
+
+	// Do the fringe
+	for ; i < len(m); i++ {
+		j := 0
+		r := m[i]
+		for ; j < i; j++ {
+			t := r[j]
+			r[j] = m[j][i]
+			m[j][i] = t
+		}
+	}
+}
+
+// transpose5aBlocked is transpose2Blocked with 5x5 blocks; the inner
+// loop keeps all 25 off-diagonal temporaries live at once.
+func transpose5aBlocked(m [][]int32) {
+	const B = 5
+	N := len(m)
+	i := 0
+	for ; i <= len(m)-B; i += B {
+		r0, r1, r2, r3, r4 := m[i], m[i+1], m[i+2], m[i+3], m[i+4]
+		if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N {
+			panic("Early bounds check failure")
+		}
+		// transpose around diagonal
+		d01, d10 := r0[i+1], r1[i]
+		d02, d20 := r0[i+2], r2[i]
+		d03, d30 := r0[i+3], r3[i]
+		d04, d40 := r0[i+4], r4[i]
+
+		d12, d21 := r1[i+2], r2[i+1]
+		d13, d31 := r1[i+3], r3[i+1]
+		d14, d41 := r1[i+4], r4[i+1]
+
+		d23, d32 := r2[i+3], r3[i+2]
+		d24, d42 := r2[i+4], r4[i+2]
+
+		d34, d43 := r3[i+4], r4[i+3]
+
+		r0[i+1], r1[i] = d10, d01
+		r0[i+2], r2[i] = d20, d02
+		r0[i+3], r3[i] = d30, d03
+		r0[i+4], r4[i] = d40, d04
+
+		r1[i+2], r2[i+1] = d21, d12
+		r1[i+3], r3[i+1] = d31, d13
+		r1[i+4], r4[i+1] = d41, d14
+
+		r2[i+3], r3[i+2] = d32, d23
+		r2[i+4], r4[i+2] = d42, d24
+
+		r3[i+4], r4[i+3] = d43, d34
+
+		// transpose across diagonal
+		j := 0
+		for ; j < i; j += B {
+			a0, a1, a2, a3, a4 := m[j], m[j+1], m[j+2], m[j+3], m[j+4]
+
+			b00, b01, b02, b03, b04 := a0[i], a0[i+1], a0[i+2], a0[i+3], a0[i+4]
+			b10, b11, b12, b13, b14 := a1[i], a1[i+1], a1[i+2], a1[i+3], a1[i+4]
+			b20, b21, b22, b23, b24 := a2[i], a2[i+1], a2[i+2], a2[i+3], a2[i+4]
+			b30, b31, b32, b33, b34 := a3[i], a3[i+1], a3[i+2], a3[i+3], a3[i+4]
+			b40, b41, b42, b43, b44 := a4[i], a4[i+1], a4[i+2], a4[i+3], a4[i+4]
+
+			a0[i], a0[i+1], a0[i+2], a0[i+3], a0[i+4] = r0[j], r1[j], r2[j], r3[j], r4[j]
+			a1[i], a1[i+1], a1[i+2], a1[i+3], a1[i+4] = r0[j+1], r1[j+1], r2[j+1], r3[j+1], r4[j+1]
+			a2[i], a2[i+1], a2[i+2], a2[i+3], a2[i+4] = r0[j+2], r1[j+2], r2[j+2], r3[j+2], r4[j+2]
+			a3[i], a3[i+1], a3[i+2], a3[i+3], a3[i+4] = r0[j+3], r1[j+3], r2[j+3], r3[j+3], r4[j+3]
+			a4[i], a4[i+1], a4[i+2], a4[i+3], a4[i+4] = r0[j+4], r1[j+4], r2[j+4], r3[j+4], r4[j+4]
+
+			r0[j], r0[j+1], r0[j+2], r0[j+3], r0[j+4] = b00, b10, b20, b30, b40
+			r1[j], r1[j+1], r1[j+2], r1[j+3], r1[j+4] = b01, b11, b21, b31, b41
+			r2[j], r2[j+1], r2[j+2], r2[j+3], r2[j+4] = b02, b12, b22, b32, b42
+			r3[j], r3[j+1], r3[j+2], r3[j+3], r3[j+4] = b03, b13, b23, b33, b43
+			r4[j], r4[j+1], r4[j+2], r4[j+3], r4[j+4] = b04, b14, b24, b34, b44
+		}
+	}
+
+	// Do the fringe
+	for ; i < len(m); i++ {
+		j := 0
+		r := m[i]
+		for ; j < i; j++ {
+			t := r[j]
+			r[j] = m[j][i]
+			m[j][i] = t
+		}
+	}
+}
+
+// transpose5bBlocked is just like transpose5aBlocked
+// but rewritten to reduce register pressure in the
+// inner loop.
+func transpose5bBlocked(m [][]int32) { + const B = 5 + N := len(m) + i := 0 + for ; i <= len(m)-B; i += B { + r0, r1, r2, r3, r4 := m[i], m[i+1], m[i+2], m[i+3], m[i+4] + if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N { + panic("Early bounds check failure") + } + // transpose around diagonal + d01, d10 := r0[i+1], r1[i] + d02, d20 := r0[i+2], r2[i] + d03, d30 := r0[i+3], r3[i] + d04, d40 := r0[i+4], r4[i] + r0[i+1], r1[i] = d10, d01 + r0[i+2], r2[i] = d20, d02 + r0[i+3], r3[i] = d30, d03 + r0[i+4], r4[i] = d40, d04 + + d12, d21 := r1[i+2], r2[i+1] + d13, d31 := r1[i+3], r3[i+1] + d14, d41 := r1[i+4], r4[i+1] + r1[i+2], r2[i+1] = d21, d12 + r1[i+3], r3[i+1] = d31, d13 + r1[i+4], r4[i+1] = d41, d14 + + d23, d32 := r2[i+3], r3[i+2] + d24, d42 := r2[i+4], r4[i+2] + r2[i+3], r3[i+2] = d32, d23 + r2[i+4], r4[i+2] = d42, d24 + + d34, d43 := r3[i+4], r4[i+3] + r3[i+4], r4[i+3] = d43, d34 + + // transpose across diagonal + j := 0 + for ; j < i; j += B { + a4, a0, a1, a2, a3 := m[j+4], m[j], m[j+1], m[j+2], m[j+3] + + // Process column i+4 + temp0 := a0[i+4] + temp1 := a1[i+4] + temp2 := a2[i+4] + temp3 := a3[i+4] + temp4 := a4[i+4] + + a4[i+4] = r4[j+4] + a0[i+4] = r4[j] + a1[i+4] = r4[j+1] + a2[i+4] = r4[j+2] + a3[i+4] = r4[j+3] + + r0[j+4] = temp0 + r1[j+4] = temp1 + r2[j+4] = temp2 + r3[j+4] = temp3 + r4[j+4] = temp4 + + // Process column i + temp0 = a0[i] + temp1 = a1[i] + temp2 = a2[i] + temp3 = a3[i] + temp4 = a4[i] + + a4[i] = r0[j+4] + a0[i] = r0[j] + a1[i] = r0[j+1] + a2[i] = r0[j+2] + a3[i] = r0[j+3] + + r0[j] = temp0 + r1[j] = temp1 + r2[j] = temp2 + r3[j] = temp3 + r4[j] = temp4 + + // Process column i+1 + temp0 = a0[i+1] + temp1 = a1[i+1] + temp2 = a2[i+1] + temp3 = a3[i+1] + temp4 = a4[i+1] + + a4[i+1] = r1[j+4] + a0[i+1] = r1[j] + a1[i+1] = r1[j+1] + a2[i+1] = r1[j+2] + a3[i+1] = r1[j+3] + + r0[j+1] = temp0 + r1[j+1] = temp1 + r2[j+1] = temp2 + r3[j+1] = temp3 + r4[j+1] = temp4 + + // Process column i+2 + temp0 = a0[i+2] + temp1 = 
a1[i+2] + temp2 = a2[i+2] + temp3 = a3[i+2] + temp4 = a4[i+2] + + a4[i+2] = r2[j+4] + a0[i+2] = r2[j] + a1[i+2] = r2[j+1] + a2[i+2] = r2[j+2] + a3[i+2] = r2[j+3] + + r0[j+2] = temp0 + r1[j+2] = temp1 + r2[j+2] = temp2 + r3[j+2] = temp3 + r4[j+2] = temp4 + + // Process column i+3 + temp0 = a0[i+3] + temp1 = a1[i+3] + temp2 = a2[i+3] + temp3 = a3[i+3] + temp4 = a4[i+3] + + a4[i+3] = r3[j+4] + a0[i+3] = r3[j] + a1[i+3] = r3[j+1] + a2[i+3] = r3[j+2] + a3[i+3] = r3[j+3] + + r0[j+3] = temp0 + r1[j+3] = temp1 + r2[j+3] = temp2 + r3[j+3] = temp3 + r4[j+3] = temp4 + } + } + + // Do the fringe + for ; i < len(m); i++ { + j := 0 + r := m[i] + for ; j < i; j++ { + t := r[j] + r[j] = m[j][i] + m[j][i] = t + } + } +} + +func transposeTiled4(m [][]int32) { + const B = 4 + N := len(m) + i := 0 + for ; i < len(m)-(B-1); i += B { + r0, r1, r2, r3 := m[i], m[i+1], m[i+2], m[i+3] + if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N { + panic("Early bounds check failure") + } + // transpose diagonal + d0, d1, d2, d3 := + simd.LoadInt32x4Slice(r0[i:]), + simd.LoadInt32x4Slice(r1[i:]), + simd.LoadInt32x4Slice(r2[i:]), + simd.LoadInt32x4Slice(r3[i:]) + + d0, d1, d2, d3 = Transpose4(d0, d1, d2, d3) + + d0.StoreSlice(r0[i:]) + d1.StoreSlice(r1[i:]) + d2.StoreSlice(r2[i:]) + d3.StoreSlice(r3[i:]) + + // transpose across diagonal + j := 0 + for ; j < i; j += B { + a0, a1, a2, a3 := m[j], m[j+1], m[j+2], m[j+3] + u0, u1, u2, u3 := + simd.LoadInt32x4Slice(a0[i:]), + simd.LoadInt32x4Slice(a1[i:]), + simd.LoadInt32x4Slice(a2[i:]), + simd.LoadInt32x4Slice(a3[i:]) + + u0, u1, u2, u3 = Transpose4(u0, u1, u2, u3) + + l0 := simd.LoadInt32x4Slice(r0[j:]) + u0.StoreSlice(r0[j:]) + l1 := simd.LoadInt32x4Slice(r1[j:]) + u1.StoreSlice(r1[j:]) + l2 := simd.LoadInt32x4Slice(r2[j:]) + u2.StoreSlice(r2[j:]) + l3 := simd.LoadInt32x4Slice(r3[j:]) + u3.StoreSlice(r3[j:]) + + u0, u1, u2, u3 = Transpose4(l0, l1, l2, l3) + + u0.StoreSlice(a0[i:]) + u1.StoreSlice(a1[i:]) + u2.StoreSlice(a2[i:]) + 
u3.StoreSlice(a3[i:]) + } + } + // Do the fringe + for ; i < len(m); i++ { + j := 0 + r := m[i] + for ; j < i; j++ { + t := r[j] + r[j] = m[j][i] + m[j][i] = t + } + } +} + +func transposeTiled8(m [][]int32) { + const B = 8 + N := len(m) + i := 0 + for ; i < len(m)-(B-1); i += B { + r0, r1, r2, r3, r4, r5, r6, r7 := m[i], m[i+1], m[i+2], m[i+3], m[i+4], m[i+5], m[i+6], m[i+7] + if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N || len(r5) < N || len(r6) < N || len(r7) < N { + panic("Early bounds check failure") + } + // transpose diagonal + d0, d1, d2, d3, d4, d5, d6, d7 := + simd.LoadInt32x8Slice(r0[i:]), + simd.LoadInt32x8Slice(r1[i:]), + simd.LoadInt32x8Slice(r2[i:]), + simd.LoadInt32x8Slice(r3[i:]), + simd.LoadInt32x8Slice(r4[i:]), + simd.LoadInt32x8Slice(r5[i:]), + simd.LoadInt32x8Slice(r6[i:]), + simd.LoadInt32x8Slice(r7[i:]) + + d0, d1, d2, d3, d4, d5, d6, d7 = Transpose8(d0, d1, d2, d3, d4, d5, d6, d7) + + d0.StoreSlice(r0[i:]) + d1.StoreSlice(r1[i:]) + d2.StoreSlice(r2[i:]) + d3.StoreSlice(r3[i:]) + d4.StoreSlice(r4[i:]) + d5.StoreSlice(r5[i:]) + d6.StoreSlice(r6[i:]) + d7.StoreSlice(r7[i:]) + + // transpose across diagonal + j := 0 + for ; j < i; j += B { + a7, a0, a1, a2, a3, a4, a5, a6 := m[j+7], m[j], m[j+1], m[j+2], m[j+3], m[j+4], m[j+5], m[j+6] + u0, u1, u2, u3, u4, u5, u6, u7 := + simd.LoadInt32x8Slice(a0[i:]), + simd.LoadInt32x8Slice(a1[i:]), + simd.LoadInt32x8Slice(a2[i:]), + simd.LoadInt32x8Slice(a3[i:]), + simd.LoadInt32x8Slice(a4[i:]), + simd.LoadInt32x8Slice(a5[i:]), + simd.LoadInt32x8Slice(a6[i:]), + simd.LoadInt32x8Slice(a7[i:]) + + u0, u1, u2, u3, u4, u5, u6, u7 = Transpose8(u0, u1, u2, u3, u4, u5, u6, u7) + + l0 := simd.LoadInt32x8Slice(r0[j:]) + u0.StoreSlice(r0[j:]) + l1 := simd.LoadInt32x8Slice(r1[j:]) + u1.StoreSlice(r1[j:]) + l2 := simd.LoadInt32x8Slice(r2[j:]) + u2.StoreSlice(r2[j:]) + l3 := simd.LoadInt32x8Slice(r3[j:]) + u3.StoreSlice(r3[j:]) + l4 := simd.LoadInt32x8Slice(r4[j:]) + u4.StoreSlice(r4[j:]) + 
l5 := simd.LoadInt32x8Slice(r5[j:]) + u5.StoreSlice(r5[j:]) + l6 := simd.LoadInt32x8Slice(r6[j:]) + u6.StoreSlice(r6[j:]) + l7 := simd.LoadInt32x8Slice(r7[j:]) + u7.StoreSlice(r7[j:]) + + u0, u1, u2, u3, u4, u5, u6, u7 = Transpose8(l0, l1, l2, l3, l4, l5, l6, l7) + + u0.StoreSlice(a0[i:]) + u1.StoreSlice(a1[i:]) + u2.StoreSlice(a2[i:]) + u3.StoreSlice(a3[i:]) + u4.StoreSlice(a4[i:]) + u5.StoreSlice(a5[i:]) + u6.StoreSlice(a6[i:]) + u7.StoreSlice(a7[i:]) + } + } + // Do the fringe + for ; i < len(m); i++ { + j := 0 + r := m[i] + for ; j < i; j++ { + t := r[j] + r[j] = m[j][i] + m[j][i] = t + } + } +} diff --git a/src/simd/internal/simd_test/unary_helpers_test.go b/src/simd/internal/simd_test/unary_helpers_test.go new file mode 100644 index 0000000000..d99fd3c505 --- /dev/null +++ b/src/simd/internal/simd_test/unary_helpers_test.go @@ -0,0 +1,1439 @@ +// Code generated by 'go run genfiles.go'; DO NOT EDIT. + +//go:build goexperiment.simd + +// This file contains functions testing unary simd methods. +// Each function in this file is specialized for a +// particular simd type <BaseType><Width>x<Count>. 
+ +package simd_test + +import ( + "simd" + "testing" +) + +// testInt8x16Unary tests the simd unary method f against the expected behavior generated by want +func testInt8x16Unary(t *testing.T, f func(_ simd.Int8x16) simd.Int8x16, want func(_ []int8) []int8) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := simd.LoadInt8x16Slice(x) + g := make([]int8, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8Unary tests the simd unary method f against the expected behavior generated by want +func testInt16x8Unary(t *testing.T, f func(_ simd.Int16x8) simd.Int16x8, want func(_ []int16) []int16) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := simd.LoadInt16x8Slice(x) + g := make([]int16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4Unary tests the simd unary method f against the expected behavior generated by want +func testInt32x4Unary(t *testing.T, f func(_ simd.Int32x4) simd.Int32x4, want func(_ []int32) []int32) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := simd.LoadInt32x4Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2Unary tests the simd unary method f against the expected behavior generated by want +func testInt64x2Unary(t *testing.T, f func(_ simd.Int64x2) simd.Int64x2, want func(_ []int64) []int64) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := simd.LoadInt64x2Slice(x) + g := make([]int64, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16Unary tests the simd unary 
method f against the expected behavior generated by want +func testUint8x16Unary(t *testing.T, f func(_ simd.Uint8x16) simd.Uint8x16, want func(_ []uint8) []uint8) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := simd.LoadUint8x16Slice(x) + g := make([]uint8, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8Unary tests the simd unary method f against the expected behavior generated by want +func testUint16x8Unary(t *testing.T, f func(_ simd.Uint16x8) simd.Uint16x8, want func(_ []uint16) []uint16) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := simd.LoadUint16x8Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4Unary tests the simd unary method f against the expected behavior generated by want +func testUint32x4Unary(t *testing.T, f func(_ simd.Uint32x4) simd.Uint32x4, want func(_ []uint32) []uint32) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := simd.LoadUint32x4Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2Unary tests the simd unary method f against the expected behavior generated by want +func testUint64x2Unary(t *testing.T, f func(_ simd.Uint64x2) simd.Uint64x2, want func(_ []uint64) []uint64) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := simd.LoadUint64x2Slice(x) + g := make([]uint64, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4Unary tests the simd unary method f against the expected behavior generated by want +func 
testFloat32x4Unary(t *testing.T, f func(_ simd.Float32x4) simd.Float32x4, want func(_ []float32) []float32) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x4Slice(x) + g := make([]float32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2Unary tests the simd unary method f against the expected behavior generated by want +func testFloat64x2Unary(t *testing.T, f func(_ simd.Float64x2) simd.Float64x2, want func(_ []float64) []float64) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := simd.LoadFloat64x2Slice(x) + g := make([]float64, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32Unary tests the simd unary method f against the expected behavior generated by want +func testInt8x32Unary(t *testing.T, f func(_ simd.Int8x32) simd.Int8x32, want func(_ []int8) []int8) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := simd.LoadInt8x32Slice(x) + g := make([]int8, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16Unary tests the simd unary method f against the expected behavior generated by want +func testInt16x16Unary(t *testing.T, f func(_ simd.Int16x16) simd.Int16x16, want func(_ []int16) []int16) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := simd.LoadInt16x16Slice(x) + g := make([]int16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8Unary tests the simd unary method f against the expected behavior generated by want +func testInt32x8Unary(t *testing.T, f func(_ simd.Int32x8) 
simd.Int32x8, want func(_ []int32) []int32) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := simd.LoadInt32x8Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4Unary tests the simd unary method f against the expected behavior generated by want +func testInt64x4Unary(t *testing.T, f func(_ simd.Int64x4) simd.Int64x4, want func(_ []int64) []int64) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := simd.LoadInt64x4Slice(x) + g := make([]int64, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32Unary tests the simd unary method f against the expected behavior generated by want +func testUint8x32Unary(t *testing.T, f func(_ simd.Uint8x32) simd.Uint8x32, want func(_ []uint8) []uint8) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := simd.LoadUint8x32Slice(x) + g := make([]uint8, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16Unary tests the simd unary method f against the expected behavior generated by want +func testUint16x16Unary(t *testing.T, f func(_ simd.Uint16x16) simd.Uint16x16, want func(_ []uint16) []uint16) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := simd.LoadUint16x16Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8Unary tests the simd unary method f against the expected behavior generated by want +func testUint32x8Unary(t *testing.T, f func(_ simd.Uint32x8) simd.Uint32x8, want func(_ []uint32) []uint32) { + n := 8 + t.Helper() + 
forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := simd.LoadUint32x8Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4Unary tests the simd unary method f against the expected behavior generated by want +func testUint64x4Unary(t *testing.T, f func(_ simd.Uint64x4) simd.Uint64x4, want func(_ []uint64) []uint64) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := simd.LoadUint64x4Slice(x) + g := make([]uint64, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8Unary tests the simd unary method f against the expected behavior generated by want +func testFloat32x8Unary(t *testing.T, f func(_ simd.Float32x8) simd.Float32x8, want func(_ []float32) []float32) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x8Slice(x) + g := make([]float32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x4Unary tests the simd unary method f against the expected behavior generated by want +func testFloat64x4Unary(t *testing.T, f func(_ simd.Float64x4) simd.Float64x4, want func(_ []float64) []float64) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := simd.LoadFloat64x4Slice(x) + g := make([]float64, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64Unary tests the simd unary method f against the expected behavior generated by want +func testInt8x64Unary(t *testing.T, f func(_ simd.Int8x64) simd.Int8x64, want func(_ []int8) []int8) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + 
t.Helper() + a := simd.LoadInt8x64Slice(x) + g := make([]int8, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32Unary tests the simd unary method f against the expected behavior generated by want +func testInt16x32Unary(t *testing.T, f func(_ simd.Int16x32) simd.Int16x32, want func(_ []int16) []int16) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := simd.LoadInt16x32Slice(x) + g := make([]int16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16Unary tests the simd unary method f against the expected behavior generated by want +func testInt32x16Unary(t *testing.T, f func(_ simd.Int32x16) simd.Int32x16, want func(_ []int32) []int32) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := simd.LoadInt32x16Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8Unary tests the simd unary method f against the expected behavior generated by want +func testInt64x8Unary(t *testing.T, f func(_ simd.Int64x8) simd.Int64x8, want func(_ []int64) []int64) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := simd.LoadInt64x8Slice(x) + g := make([]int64, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64Unary tests the simd unary method f against the expected behavior generated by want +func testUint8x64Unary(t *testing.T, f func(_ simd.Uint8x64) simd.Uint8x64, want func(_ []uint8) []uint8) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := simd.LoadUint8x64Slice(x) + g := make([]uint8, n) + 
f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32Unary tests the simd unary method f against the expected behavior generated by want +func testUint16x32Unary(t *testing.T, f func(_ simd.Uint16x32) simd.Uint16x32, want func(_ []uint16) []uint16) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := simd.LoadUint16x32Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16Unary tests the simd unary method f against the expected behavior generated by want +func testUint32x16Unary(t *testing.T, f func(_ simd.Uint32x16) simd.Uint32x16, want func(_ []uint32) []uint32) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := simd.LoadUint32x16Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8Unary tests the simd unary method f against the expected behavior generated by want +func testUint64x8Unary(t *testing.T, f func(_ simd.Uint64x8) simd.Uint64x8, want func(_ []uint64) []uint64) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := simd.LoadUint64x8Slice(x) + g := make([]uint64, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16Unary tests the simd unary method f against the expected behavior generated by want +func testFloat32x16Unary(t *testing.T, f func(_ simd.Float32x16) simd.Float32x16, want func(_ []float32) []float32) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x16Slice(x) + g := make([]float32, n) + f(a).StoreSlice(g) + w := 
want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8Unary tests the simd unary method f against the expected behavior generated by want +func testFloat64x8Unary(t *testing.T, f func(_ simd.Float64x8) simd.Float64x8, want func(_ []float64) []float64) { + n := 8 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := simd.LoadFloat64x8Slice(x) + g := make([]float64, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt8x16ConvertToInt32(t *testing.T, f func(x simd.Int8x16) simd.Int32x16, want func(x []int8) []int32) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := simd.LoadInt8x16Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testInt16x8ConvertToInt32(t *testing.T, f func(x simd.Int16x8) simd.Int32x8, want func(x []int16) []int32) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := simd.LoadInt16x8Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt32x4ConvertToInt32(t *testing.T, f func(x simd.Int32x4) simd.Int32x4, want func(x []int32) []int32) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := simd.LoadInt32x4Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint8x16ConvertToInt32(t *testing.T, f func(x simd.Uint8x16) simd.Int32x16, want func(x []uint8) []int32) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := simd.LoadUint8x16Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testUint16x8ConvertToInt32(t *testing.T, f func(x simd.Uint16x8) simd.Int32x8, want func(x []uint16) []int32) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := simd.LoadUint16x8Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint32x4ConvertToInt32(t *testing.T, f func(x simd.Uint32x4) simd.Int32x4, want func(x []uint32) []int32) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := simd.LoadUint32x4Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testFloat32x4ConvertToInt32(t *testing.T, f func(x simd.Float32x4) simd.Int32x4, want func(x []float32) []int32) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x4Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testInt16x16ConvertToInt32(t *testing.T, f func(x simd.Int16x16) simd.Int32x16, want func(x []int16) []int32) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := simd.LoadInt16x16Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt32x8ConvertToInt32(t *testing.T, f func(x simd.Int32x8) simd.Int32x8, want func(x []int32) []int32) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := simd.LoadInt32x8Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt64x4ConvertToInt32(t *testing.T, f func(x simd.Int64x4) simd.Int32x4, want func(x []int64) []int32) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := simd.LoadInt64x4Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testUint16x16ConvertToInt32(t *testing.T, f func(x simd.Uint16x16) simd.Int32x16, want func(x []uint16) []int32) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := simd.LoadUint16x16Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint32x8ConvertToInt32(t *testing.T, f func(x simd.Uint32x8) simd.Int32x8, want func(x []uint32) []int32) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := simd.LoadUint32x8Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint64x4ConvertToInt32(t *testing.T, f func(x simd.Uint64x4) simd.Int32x4, want func(x []uint64) []int32) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := simd.LoadUint64x4Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testFloat32x8ConvertToInt32(t *testing.T, f func(x simd.Float32x8) simd.Int32x8, want func(x []float32) []int32) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x8Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testFloat64x4ConvertToInt32(t *testing.T, f func(x simd.Float64x4) simd.Int32x4, want func(x []float64) []int32) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := simd.LoadFloat64x4Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt32x16ConvertToInt32(t *testing.T, f func(x simd.Int32x16) simd.Int32x16, want func(x []int32) []int32) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := simd.LoadInt32x16Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testInt64x8ConvertToInt32(t *testing.T, f func(x simd.Int64x8) simd.Int32x8, want func(x []int64) []int32) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := simd.LoadInt64x8Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint32x16ConvertToInt32(t *testing.T, f func(x simd.Uint32x16) simd.Int32x16, want func(x []uint32) []int32) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := simd.LoadUint32x16Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint64x8ConvertToInt32(t *testing.T, f func(x simd.Uint64x8) simd.Int32x8, want func(x []uint64) []int32) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := simd.LoadUint64x8Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testFloat32x16ConvertToInt32(t *testing.T, f func(x simd.Float32x16) simd.Int32x16, want func(x []float32) []int32) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x16Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testFloat64x8ConvertToInt32(t *testing.T, f func(x simd.Float64x8) simd.Int32x8, want func(x []float64) []int32) { + n := 8 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := simd.LoadFloat64x8Slice(x) + g := make([]int32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt8x16ConvertToUint32(t *testing.T, f func(x simd.Int8x16) simd.Uint32x16, want func(x []int8) []uint32) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := simd.LoadInt8x16Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testInt16x8ConvertToUint32(t *testing.T, f func(x simd.Int16x8) simd.Uint32x8, want func(x []int16) []uint32) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := simd.LoadInt16x8Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt32x4ConvertToUint32(t *testing.T, f func(x simd.Int32x4) simd.Uint32x4, want func(x []int32) []uint32) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := simd.LoadInt32x4Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint8x16ConvertToUint32(t *testing.T, f func(x simd.Uint8x16) simd.Uint32x16, want func(x []uint8) []uint32) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := simd.LoadUint8x16Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testUint16x8ConvertToUint32(t *testing.T, f func(x simd.Uint16x8) simd.Uint32x8, want func(x []uint16) []uint32) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := simd.LoadUint16x8Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint32x4ConvertToUint32(t *testing.T, f func(x simd.Uint32x4) simd.Uint32x4, want func(x []uint32) []uint32) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := simd.LoadUint32x4Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testFloat32x4ConvertToUint32(t *testing.T, f func(x simd.Float32x4) simd.Uint32x4, want func(x []float32) []uint32) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x4Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testInt16x16ConvertToUint32(t *testing.T, f func(x simd.Int16x16) simd.Uint32x16, want func(x []int16) []uint32) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := simd.LoadInt16x16Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt32x8ConvertToUint32(t *testing.T, f func(x simd.Int32x8) simd.Uint32x8, want func(x []int32) []uint32) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := simd.LoadInt32x8Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt64x4ConvertToUint32(t *testing.T, f func(x simd.Int64x4) simd.Uint32x4, want func(x []int64) []uint32) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := simd.LoadInt64x4Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testUint16x16ConvertToUint32(t *testing.T, f func(x simd.Uint16x16) simd.Uint32x16, want func(x []uint16) []uint32) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := simd.LoadUint16x16Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint32x8ConvertToUint32(t *testing.T, f func(x simd.Uint32x8) simd.Uint32x8, want func(x []uint32) []uint32) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := simd.LoadUint32x8Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint64x4ConvertToUint32(t *testing.T, f func(x simd.Uint64x4) simd.Uint32x4, want func(x []uint64) []uint32) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := simd.LoadUint64x4Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testFloat32x8ConvertToUint32(t *testing.T, f func(x simd.Float32x8) simd.Uint32x8, want func(x []float32) []uint32) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x8Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testFloat64x4ConvertToUint32(t *testing.T, f func(x simd.Float64x4) simd.Uint32x4, want func(x []float64) []uint32) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := simd.LoadFloat64x4Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt32x16ConvertToUint32(t *testing.T, f func(x simd.Int32x16) simd.Uint32x16, want func(x []int32) []uint32) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := simd.LoadInt32x16Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testInt64x8ConvertToUint32(t *testing.T, f func(x simd.Int64x8) simd.Uint32x8, want func(x []int64) []uint32) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := simd.LoadInt64x8Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint32x16ConvertToUint32(t *testing.T, f func(x simd.Uint32x16) simd.Uint32x16, want func(x []uint32) []uint32) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := simd.LoadUint32x16Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint64x8ConvertToUint32(t *testing.T, f func(x simd.Uint64x8) simd.Uint32x8, want func(x []uint64) []uint32) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := simd.LoadUint64x8Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testFloat32x16ConvertToUint32(t *testing.T, f func(x simd.Float32x16) simd.Uint32x16, want func(x []float32) []uint32) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x16Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testFloat64x8ConvertToUint32(t *testing.T, f func(x simd.Float64x8) simd.Uint32x8, want func(x []float64) []uint32) { + n := 8 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := simd.LoadFloat64x8Slice(x) + g := make([]uint32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt8x16ConvertToUint16(t *testing.T, f func(x simd.Int8x16) simd.Uint16x16, want func(x []int8) []uint16) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := simd.LoadInt8x16Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testInt16x8ConvertToUint16(t *testing.T, f func(x simd.Int16x8) simd.Uint16x8, want func(x []int16) []uint16) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := simd.LoadInt16x8Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint8x16ConvertToUint16(t *testing.T, f func(x simd.Uint8x16) simd.Uint16x16, want func(x []uint8) []uint16) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := simd.LoadUint8x16Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint16x8ConvertToUint16(t *testing.T, f func(x simd.Uint16x8) simd.Uint16x8, want func(x []uint16) []uint16) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := simd.LoadUint16x8Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testInt8x32ConvertToUint16(t *testing.T, f func(x simd.Int8x32) simd.Uint16x32, want func(x []int8) []uint16) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := simd.LoadInt8x32Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt16x16ConvertToUint16(t *testing.T, f func(x simd.Int16x16) simd.Uint16x16, want func(x []int16) []uint16) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := simd.LoadInt16x16Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt32x8ConvertToUint16(t *testing.T, f func(x simd.Int32x8) simd.Uint16x8, want func(x []int32) []uint16) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := simd.LoadInt32x8Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testUint8x32ConvertToUint16(t *testing.T, f func(x simd.Uint8x32) simd.Uint16x32, want func(x []uint8) []uint16) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := simd.LoadUint8x32Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint16x16ConvertToUint16(t *testing.T, f func(x simd.Uint16x16) simd.Uint16x16, want func(x []uint16) []uint16) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := simd.LoadUint16x16Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint32x8ConvertToUint16(t *testing.T, f func(x simd.Uint32x8) simd.Uint16x8, want func(x []uint32) []uint16) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := simd.LoadUint32x8Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testFloat32x8ConvertToUint16(t *testing.T, f func(x simd.Float32x8) simd.Uint16x8, want func(x []float32) []uint16) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x8Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt16x32ConvertToUint16(t *testing.T, f func(x simd.Int16x32) simd.Uint16x32, want func(x []int16) []uint16) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := simd.LoadInt16x32Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testInt32x16ConvertToUint16(t *testing.T, f func(x simd.Int32x16) simd.Uint16x16, want func(x []int32) []uint16) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := simd.LoadInt32x16Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testInt64x8ConvertToUint16(t *testing.T, f func(x simd.Int64x8) simd.Uint16x8, want func(x []int64) []uint16) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := simd.LoadInt64x8Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint16x32ConvertToUint16(t *testing.T, f func(x simd.Uint16x32) simd.Uint16x32, want func(x []uint16) []uint16) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := simd.LoadUint16x32Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testUint32x16ConvertToUint16(t *testing.T, f func(x simd.Uint32x16) simd.Uint16x16, want func(x []uint32) []uint16) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := simd.LoadUint32x16Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testUint64x8ConvertToUint16(t *testing.T, f func(x simd.Uint64x8) simd.Uint16x8, want func(x []uint64) []uint16) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := simd.LoadUint64x8Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +func testFloat32x16ConvertToUint16(t *testing.T, f func(x simd.Float32x16) simd.Uint16x16, want func(x []float32) []uint16) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x16Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
+func testFloat64x8ConvertToUint16(t *testing.T, f func(x simd.Float64x8) simd.Uint16x8, want func(x []float64) []uint16) { + n := 8 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := simd.LoadFloat64x8Slice(x) + g := make([]uint16, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4UnaryFlaky tests the simd unary method f against the expected behavior generated by want, +// but using a flakiness parameter because we haven't exactly figured out how simd floating point works +func testFloat32x4UnaryFlaky(t *testing.T, f func(x simd.Float32x4) simd.Float32x4, want func(x []float32) []float32, flakiness float64) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x4Slice(x) + g := make([]float32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2UnaryFlaky tests the simd unary method f against the expected behavior generated by want, +// but using a flakiness parameter because we haven't exactly figured out how simd floating point works +func testFloat64x2UnaryFlaky(t *testing.T, f func(x simd.Float64x2) simd.Float64x2, want func(x []float64) []float64, flakiness float64) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := simd.LoadFloat64x2Slice(x) + g := make([]float64, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8UnaryFlaky tests the simd unary method f against the expected behavior generated by want, +// but using a flakiness parameter because we haven't exactly figured out how simd floating point works +func testFloat32x8UnaryFlaky(t *testing.T, f func(x simd.Float32x8) simd.Float32x8, want func(x []float32) 
[]float32, flakiness float64) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x8Slice(x) + g := make([]float32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x4UnaryFlaky tests the simd unary method f against the expected behavior generated by want, +// but using a flakiness parameter because we haven't exactly figured out how simd floating point works +func testFloat64x4UnaryFlaky(t *testing.T, f func(x simd.Float64x4) simd.Float64x4, want func(x []float64) []float64, flakiness float64) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := simd.LoadFloat64x4Slice(x) + g := make([]float64, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16UnaryFlaky tests the simd unary method f against the expected behavior generated by want, +// but using a flakiness parameter because we haven't exactly figured out how simd floating point works +func testFloat32x16UnaryFlaky(t *testing.T, f func(x simd.Float32x16) simd.Float32x16, want func(x []float32) []float32, flakiness float64) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := simd.LoadFloat32x16Slice(x) + g := make([]float32, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8UnaryFlaky tests the simd unary method f against the expected behavior generated by want, +// but using a flakiness parameter because we haven't exactly figured out how simd floating point works +func testFloat64x8UnaryFlaky(t *testing.T, f func(x simd.Float64x8) simd.Float64x8, want func(x []float64) []float64, flakiness float64) { + n := 8 + t.Helper() + forSlice(t, float64s, n, 
func(x []float64) bool { + t.Helper() + a := simd.LoadFloat64x8Slice(x) + g := make([]float64, n) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} diff --git a/src/simd/internal/simd_test/unary_test.go b/src/simd/internal/simd_test/unary_test.go new file mode 100644 index 0000000000..ea4c114992 --- /dev/null +++ b/src/simd/internal/simd_test/unary_test.go @@ -0,0 +1,137 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && amd64 + +package simd_test + +import ( + "math" + "simd" + "testing" +) + +func TestCeil(t *testing.T) { + testFloat32x4Unary(t, simd.Float32x4.Ceil, ceilSlice[float32]) + testFloat32x8Unary(t, simd.Float32x8.Ceil, ceilSlice[float32]) + testFloat64x2Unary(t, simd.Float64x2.Ceil, ceilSlice[float64]) + testFloat64x4Unary(t, simd.Float64x4.Ceil, ceilSlice[float64]) + if simd.X86.AVX512() { + // testFloat32x16Unary(t, simd.Float32x16.Ceil, ceilSlice[float32]) // missing + // testFloat64x8Unary(t, simd.Float64x8.Ceil, ceilSlice[float64]) // missing + } +} + +func TestFloor(t *testing.T) { + testFloat32x4Unary(t, simd.Float32x4.Floor, floorSlice[float32]) + testFloat32x8Unary(t, simd.Float32x8.Floor, floorSlice[float32]) + testFloat64x2Unary(t, simd.Float64x2.Floor, floorSlice[float64]) + testFloat64x4Unary(t, simd.Float64x4.Floor, floorSlice[float64]) + if simd.X86.AVX512() { + // testFloat32x16Unary(t, simd.Float32x16.Floor, floorSlice[float32]) // missing + // testFloat64x8Unary(t, simd.Float64x8.Floor, floorSlice[float64]) // missing + } +} + +func TestTrunc(t *testing.T) { + testFloat32x4Unary(t, simd.Float32x4.Trunc, truncSlice[float32]) + testFloat32x8Unary(t, simd.Float32x8.Trunc, truncSlice[float32]) + testFloat64x2Unary(t, simd.Float64x2.Trunc, truncSlice[float64]) + testFloat64x4Unary(t, simd.Float64x4.Trunc, 
truncSlice[float64]) + if simd.X86.AVX512() { + // testFloat32x16Unary(t, simd.Float32x16.Trunc, truncSlice[float32]) // missing + // testFloat64x8Unary(t, simd.Float64x8.Trunc, truncSlice[float64]) // missing + } +} + +func TestRound(t *testing.T) { + testFloat32x4Unary(t, simd.Float32x4.RoundToEven, roundSlice[float32]) + testFloat32x8Unary(t, simd.Float32x8.RoundToEven, roundSlice[float32]) + testFloat64x2Unary(t, simd.Float64x2.RoundToEven, roundSlice[float64]) + testFloat64x4Unary(t, simd.Float64x4.RoundToEven, roundSlice[float64]) + if simd.X86.AVX512() { + // testFloat32x16Unary(t, simd.Float32x16.Round, roundSlice[float32]) // missing + // testFloat64x8Unary(t, simd.Float64x8.Round, roundSlice[float64]) // missing + } +} + +func TestSqrt(t *testing.T) { + testFloat32x4Unary(t, simd.Float32x4.Sqrt, sqrtSlice[float32]) + testFloat32x8Unary(t, simd.Float32x8.Sqrt, sqrtSlice[float32]) + testFloat64x2Unary(t, simd.Float64x2.Sqrt, sqrtSlice[float64]) + testFloat64x4Unary(t, simd.Float64x4.Sqrt, sqrtSlice[float64]) + if simd.X86.AVX512() { + testFloat32x16Unary(t, simd.Float32x16.Sqrt, sqrtSlice[float32]) + testFloat64x8Unary(t, simd.Float64x8.Sqrt, sqrtSlice[float64]) + } +} + +func TestNot(t *testing.T) { + testInt8x16Unary(t, simd.Int8x16.Not, map1[int8](not)) + testInt8x32Unary(t, simd.Int8x32.Not, map1[int8](not)) + testInt16x8Unary(t, simd.Int16x8.Not, map1[int16](not)) + testInt16x16Unary(t, simd.Int16x16.Not, map1[int16](not)) + testInt32x4Unary(t, simd.Int32x4.Not, map1[int32](not)) + testInt32x8Unary(t, simd.Int32x8.Not, map1[int32](not)) +} + +func TestAbsolute(t *testing.T) { + testInt8x16Unary(t, simd.Int8x16.Abs, map1[int8](abs)) + testInt8x32Unary(t, simd.Int8x32.Abs, map1[int8](abs)) + testInt16x8Unary(t, simd.Int16x8.Abs, map1[int16](abs)) + testInt16x16Unary(t, simd.Int16x16.Abs, map1[int16](abs)) + testInt32x4Unary(t, simd.Int32x4.Abs, map1[int32](abs)) + testInt32x8Unary(t, simd.Int32x8.Abs, map1[int32](abs)) + if simd.X86.AVX512() { + 
testInt8x64Unary(t, simd.Int8x64.Abs, map1[int8](abs)) + testInt16x32Unary(t, simd.Int16x32.Abs, map1[int16](abs)) + testInt32x16Unary(t, simd.Int32x16.Abs, map1[int32](abs)) + testInt64x2Unary(t, simd.Int64x2.Abs, map1[int64](abs)) + testInt64x4Unary(t, simd.Int64x4.Abs, map1[int64](abs)) + testInt64x8Unary(t, simd.Int64x8.Abs, map1[int64](abs)) + } +} + +func TestCeilScaledResidue(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Needs AVX512") + } + testFloat64x8UnaryFlaky(t, + func(x simd.Float64x8) simd.Float64x8 { return x.CeilScaledResidue(0) }, + map1(ceilResidueForPrecision[float64](0)), + 0.001) + testFloat64x8UnaryFlaky(t, + func(x simd.Float64x8) simd.Float64x8 { return x.CeilScaledResidue(1) }, + map1(ceilResidueForPrecision[float64](1)), + 0.001) + testFloat64x8Unary(t, + func(x simd.Float64x8) simd.Float64x8 { return x.Sub(x.CeilScaled(0)) }, + map1[float64](func(x float64) float64 { return x - math.Ceil(x) })) +} + +func TestToUint32(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Needs AVX512") + } + testFloat32x4ConvertToUint32(t, simd.Float32x4.ConvertToUint32, map1[float32](toUint32)) + testFloat32x8ConvertToUint32(t, simd.Float32x8.ConvertToUint32, map1[float32](toUint32)) + testFloat32x16ConvertToUint32(t, simd.Float32x16.ConvertToUint32, map1[float32](toUint32)) +} + +func TestToInt32(t *testing.T) { + testFloat32x4ConvertToInt32(t, simd.Float32x4.ConvertToInt32, map1[float32](toInt32)) + testFloat32x8ConvertToInt32(t, simd.Float32x8.ConvertToInt32, map1[float32](toInt32)) +} + +func TestConverts(t *testing.T) { + testUint8x16ConvertToUint16(t, simd.Uint8x16.ExtendToUint16, map1[uint8](toUint16)) + testUint16x8ConvertToUint32(t, simd.Uint16x8.ExtendToUint32, map1[uint16](toUint32)) +} + +func TestConvertsAVX512(t *testing.T) { + if !simd.X86.AVX512() { + t.Skip("Needs AVX512") + } + testUint8x32ConvertToUint16(t, simd.Uint8x32.ExtendToUint16, map1[uint8](toUint16)) +} diff --git a/src/simd/internal/test_helpers/checkslices.go 
b/src/simd/internal/test_helpers/checkslices.go new file mode 100644 index 0000000000..54453798a2 --- /dev/null +++ b/src/simd/internal/test_helpers/checkslices.go @@ -0,0 +1,123 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && amd64 + +package test_helpers + +import ( + "math" + "testing" +) + +type signed interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 +} + +type integer interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr +} + +type float interface { + ~float32 | ~float64 +} + +type number interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr | ~float32 | ~float64 +} + +func CheckSlices[T number](t *testing.T, got, want []T) bool { + t.Helper() + return CheckSlicesLogInput[T](t, got, want, 0.0, nil) +} + +// CheckSlicesLogInput compares two slices for equality, +// reporting a test error if there is a problem, +// and also consumes the two slices so that a +// test/benchmark won't be dead-code eliminated. 
+func CheckSlicesLogInput[T number](t *testing.T, got, want []T, flakiness float64, logInput func()) bool { + t.Helper() + var z T + for i := range want { + if got[i] != want[i] { + var ia any = got[i] + var ib any = want[i] + switch x := ia.(type) { + case float32: + y := ib.(float32) + if math.IsNaN(float64(x)) && math.IsNaN(float64(y)) { + continue + } + if flakiness > 0 { + if y == 0 { + if math.Abs(float64(x)) < flakiness { + continue + } + } else { + if math.Abs(float64((x-y)/y)) < flakiness { + continue + } + } + } + case float64: + y := ib.(float64) + if math.IsNaN(x) && math.IsNaN(y) { + continue + } + if flakiness > 0 { + if y == 0 { + if math.Abs(x) < flakiness { + continue + } + } else if math.Abs((x-y)/y) < flakiness { + continue + } + } + + default: + } + + t.Logf("For %T vector elements:", z) + t.Logf("got =%v", got) + t.Logf("want=%v", want) + if logInput != nil { + logInput() + } + t.Errorf("at index %d, got=%v, want=%v", i, got[i], want[i]) + return false + } else if got[i] == 0 { // for floating point, 0.0 == -0.0 but a bitwise check can see the difference + var ia any = got[i] + var ib any = want[i] + switch x := ia.(type) { + case float32: + y := ib.(float32) + if math.Float32bits(x) != math.Float32bits(y) { + t.Logf("For %T vector elements:", z) + t.Logf("got =%v", got) + t.Logf("want=%v", want) + if logInput != nil { + logInput() + } + t.Errorf("at index %d, different signs of zero", i) + return false + } + case float64: + y := ib.(float64) + if math.Float64bits(x) != math.Float64bits(y) { + t.Logf("For %T vector elements:", z) + t.Logf("got =%v", got) + t.Logf("want=%v", want) + if logInput != nil { + logInput() + } + t.Errorf("at index %d, different signs of zero", i) + return false + } + default: + } + + } + } + return true +} |
