diff options
| author | David Chase <drchase@google.com> | 2025-09-20 16:52:07 -0400 |
|---|---|---|
| committer | David Chase <drchase@google.com> | 2025-09-26 13:11:24 -0700 |
| commit | ea3b2ecd2878a694f9f42011eccb1312feb82bca (patch) | |
| tree | 88b9d9075fadcb19dc676e070232b74d3e6b1d0e /src/simd | |
| parent | 25c36b95d1523f22d4c46ec237acc03e00540e0a (diff) | |
| download | go-ea3b2ecd2878a694f9f42011eccb1312feb82bca.tar.xz | |
[dev.simd] cmd/compile, simd: add 64-bit select-from-pair methods
These are in the same style as the 32-bit select-from-pair methods,
including the grouped variant. This does not quite capture
the full awesome power of VSHUFPD, which can select
differently in each group; that will be some other,
more complex method.
Change-Id: I807ddd7c1256103b5b0d7c5d60bd70b185e3aaf0
Reviewed-on: https://go-review.googlesource.com/c/go/+/705695
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Diffstat (limited to 'src/simd')
| -rw-r--r-- | src/simd/internal/simd_test/simd_test.go | 120 | ||||
| -rw-r--r-- | src/simd/pkginternal_test.go | 112 | ||||
| -rw-r--r-- | src/simd/shuffles_amd64.go | 798 |
3 files changed, 717 insertions, 313 deletions
diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go index e38f7eea01..d00fcf5dd3 100644 --- a/src/simd/internal/simd_test/simd_test.go +++ b/src/simd/internal/simd_test/simd_test.go @@ -595,7 +595,7 @@ func TestIsZero(t *testing.T) { } } -func TestSelectFromPairConst(t *testing.T) { +func TestSelect4FromPairConst(t *testing.T) { x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3}) y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7}) @@ -652,7 +652,7 @@ func selectFromPairInt32x4(x simd.Int32x4, a, b, c, d uint8, y simd.Int32x4) sim return x.SelectFromPair(a, b, c, d, y) } -func TestSelectFromPairVar(t *testing.T) { +func TestSelect4FromPairVar(t *testing.T) { x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3}) y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7}) @@ -704,7 +704,7 @@ func TestSelectFromPairVar(t *testing.T) { foo(hllh, 4, 0, 1, 5) } -func TestSelectFromPairConstGroupedFloat32x8(t *testing.T) { +func TestSelect4FromPairConstGrouped(t *testing.T) { x := simd.LoadFloat32x8Slice([]float32{0, 1, 2, 3, 10, 11, 12, 13}) y := simd.LoadFloat32x8Slice([]float32{4, 5, 6, 7, 14, 15, 16, 17}) @@ -887,5 +887,119 @@ func TestSelect128FromPairVar(t *testing.T) { foo(cd, 2, 3) foo(da, 3, 0) foo(dc, 3, 2) +} + +func TestSelect2FromPairConst(t *testing.T) { + x := simd.LoadUint64x2Slice([]uint64{0, 1}) + y := simd.LoadUint64x2Slice([]uint64{2, 3}) + + ll := x.SelectFromPair(0, 1, y) + hh := x.SelectFromPair(3, 2, y) + lh := x.SelectFromPair(0, 3, y) + hl := x.SelectFromPair(2, 1, y) + + r := make([]uint64, 2, 2) + + foo := func(v simd.Uint64x2, a, b uint64) { + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, b}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func TestSelect2FromPairConstGroupedUint(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 10, 11}) + y := simd.LoadUint64x4Slice([]uint64{2, 3, 12, 13}) + + ll := x.SelectFromPairGrouped(0, 1, y) + hh := x.SelectFromPairGrouped(3, 2, y) + lh 
:= x.SelectFromPairGrouped(0, 3, y) + hl := x.SelectFromPairGrouped(2, 1, y) + + r := make([]uint64, 4, 4) + + foo := func(v simd.Uint64x4, a, b uint64) { + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, b, a + 10, b + 10}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func TestSelect2FromPairConstGroupedFloat(t *testing.T) { + x := simd.LoadFloat64x4Slice([]float64{0, 1, 10, 11}) + y := simd.LoadFloat64x4Slice([]float64{2, 3, 12, 13}) + + ll := x.SelectFromPairGrouped(0, 1, y) + hh := x.SelectFromPairGrouped(3, 2, y) + lh := x.SelectFromPairGrouped(0, 3, y) + hl := x.SelectFromPairGrouped(2, 1, y) + + r := make([]float64, 4, 4) + + foo := func(v simd.Float64x4, a, b float64) { + v.StoreSlice(r) + checkSlices[float64](t, r, []float64{a, b, a + 10, b + 10}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func TestSelect2FromPairConstGroupedInt(t *testing.T) { + x := simd.LoadInt64x4Slice([]int64{0, 1, 10, 11}) + y := simd.LoadInt64x4Slice([]int64{2, 3, 12, 13}) + + ll := x.SelectFromPairGrouped(0, 1, y) + hh := x.SelectFromPairGrouped(3, 2, y) + lh := x.SelectFromPairGrouped(0, 3, y) + hl := x.SelectFromPairGrouped(2, 1, y) + + r := make([]int64, 4, 4) + + foo := func(v simd.Int64x4, a, b int64) { + v.StoreSlice(r) + checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func TestSelect2FromPairConstGroupedInt512(t *testing.T) { + if !simd.HasAVX512() { + t.Skip("Test requires HasAVX512, not available on this hardware") + return + } + + x := simd.LoadInt64x8Slice([]int64{0, 1, 10, 11, 20, 21, 30, 31}) + y := simd.LoadInt64x8Slice([]int64{2, 3, 12, 13, 22, 23, 32, 33}) + + ll := x.SelectFromPairGrouped(0, 1, y) + hh := x.SelectFromPairGrouped(3, 2, y) + lh := x.SelectFromPairGrouped(0, 3, y) + hl := x.SelectFromPairGrouped(2, 1, y) + + r := make([]int64, 8, 8) + + foo := func(v simd.Int64x8, a, b int64) { + v.StoreSlice(r) + 
checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10, a + 20, b + 20, a + 30, b + 30}) + } + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) } diff --git a/src/simd/pkginternal_test.go b/src/simd/pkginternal_test.go index 557a0537b4..632e24d9d9 100644 --- a/src/simd/pkginternal_test.go +++ b/src/simd/pkginternal_test.go @@ -99,53 +99,53 @@ func select2x4x32(x Int32x4, a, b, c, d uint8, y Int32x4) Int32x4 { switch pattern { case _LLLL: - return x.concatSelectedConstant(cscimm(a, b, c, d), x) + return x.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstant(cscimm(a, b, c, d), y) + return y.concatSelectedConstant(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstant(cscimm(a, b, c, d), y) + return x.concatSelectedConstant(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstant(cscimm(a, b, c, d), x) + return y.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return 
x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstant(cscimm(a, c, b, d), y) - return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(a, c, b, d), y) + return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstant(cscimm(b, d, a, c), y) - return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(b, d, a, c), y) + return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstant(cscimm(b, c, a, d), y) - return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(b, c, a, d), y) + return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstant(cscimm(a, d, b, c), y) - return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(a, d, b, c), y) + return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } 
panic("missing case, switch should be exhaustive") } @@ -180,53 +180,53 @@ func select2x8x32Grouped(x Int32x8, a, b, c, d uint8, y Int32x8) Int32x8 { switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + 
return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, 
c), y) + return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } diff --git a/src/simd/shuffles_amd64.go b/src/simd/shuffles_amd64.go index 68c840730b..c46a2d06fe 100644 --- a/src/simd/shuffles_amd64.go +++ b/src/simd/shuffles_amd64.go @@ -44,6 +44,16 @@ const ( _HHHH // a:y, b:y, c:y, d:y ) +// These constants represent the source pattern for the two parameters +// (a, b) passed to SelectFromPair and SelectFromPairGrouped for +// two-element vectors. +const ( + _LL = iota + _HL + _LH + _HH +) + // SelectFromPair returns the selection of four elements from the two // vectors x and y, where selector values in the range 0-3 specify // elements from x and values in the range 4-7 specify the 0-3 elements @@ -72,53 +82,53 @@ func (x Int32x4) SelectFromPair(a, b, c, d uint8, y Int32x4) Int32x4 { switch pattern { case _LLLL: - return x.concatSelectedConstant(cscimm(a, b, c, d), x) + return x.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstant(cscimm(a, b, c, d), y) + return y.concatSelectedConstant(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstant(cscimm(a, b, c, d), y) + return x.concatSelectedConstant(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstant(cscimm(a, b, c, d), x) + return y.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return 
z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstant(cscimm(a, c, b, d), y) - return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(a, c, b, d), y) + return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstant(cscimm(b, d, a, c), y) - return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(b, d, a, c), y) + return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstant(cscimm(b, c, a, d), y) - return 
z.concatSelectedConstant(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(b, c, a, d), y) + return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstant(cscimm(a, d, b, c), y) - return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(a, d, b, c), y) + return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -144,53 +154,53 @@ func (x Uint32x4) SelectFromPair(a, b, c, d uint8, y Uint32x4) Uint32x4 { switch pattern { case _LLLL: - return x.concatSelectedConstant(cscimm(a, b, c, d), x) + return x.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstant(cscimm(a, b, c, d), y) + return y.concatSelectedConstant(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstant(cscimm(a, b, c, d), y) + return x.concatSelectedConstant(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstant(cscimm(a, b, c, d), x) + return y.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + 
z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstant(cscimm(a, c, b, d), y) - return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(a, c, b, d), y) + return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstant(cscimm(b, d, a, c), y) - return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(b, d, a, c), y) + return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstant(cscimm(b, c, a, d), y) - return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(b, c, a, d), y) + return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstant(cscimm(a, d, b, c), y) - return 
z.concatSelectedConstant(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(a, d, b, c), y) + return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -216,53 +226,53 @@ func (x Float32x4) SelectFromPair(a, b, c, d uint8, y Float32x4) Float32x4 { switch pattern { case _LLLL: - return x.concatSelectedConstant(cscimm(a, b, c, d), x) + return x.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstant(cscimm(a, b, c, d), y) + return y.concatSelectedConstant(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstant(cscimm(a, b, c, d), y) + return x.concatSelectedConstant(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstant(cscimm(a, b, c, d), x) + return y.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := 
x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstant(cscimm(a, c, b, d), y) - return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(a, c, b, d), y) + return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstant(cscimm(b, d, a, c), y) - return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(b, d, a, c), y) + return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstant(cscimm(b, c, a, d), y) - return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(b, c, a, d), y) + return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstant(cscimm(a, d, b, c), y) - return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(a, d, b, c), y) + return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ 
-291,53 +301,53 @@ func (x Int32x8) SelectFromPairGrouped(a, b, c, d uint8, y Int32x8) Int32x8 { switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 
0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) + return 
z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -366,53 +376,53 @@ func (x Uint32x8) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x8) Uint32x8 { switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return 
x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return 
z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) + return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -441,53 +451,53 @@ func (x Float32x8) SelectFromPairGrouped(a, b, c, d uint8, y Float32x8) Float32x switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return 
z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return 
z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) + return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -511,53 +521,53 @@ func (x Int32x16) SelectFromPairGrouped(a, b, c, d uint8, y Int32x16) Int32x16 { switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - 
return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 
/* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) + return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -581,53 +591,53 @@ func (x Uint32x16) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x16) Uint32x1 switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return 
z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, 
z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) + return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -651,59 +661,339 @@ func (x Float32x16) SelectFromPairGrouped(a, b, c, d uint8, y Float32x16) Float3 switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return 
z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := 
x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) + return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } -// cscimm converts the 4 vector element indices into a single +// cscimm4 converts the 4 vector element indices into a single // uint8 for use as an immediate. -func cscimm(a, b, c, d uint8) uint8 { +func cscimm4(a, b, c, d uint8) uint8 { return uint8(a + b<<2 + c<<4 + d<<6) } + +// cscimm2 converts the 2 vector element indices into a single +// uint8 for use as an immediate. +func cscimm2(a, b uint8) uint8 { + return uint8(a + b<<1) +} + +// cscimm2g2 converts the 2 vector element indices into a single +// uint8 for use as an immediate, but duplicated for VSHUFPD +// to emulate grouped behavior of VSHUFPS +func cscimm2g2(a, b uint8) uint8 { + g := cscimm2(a, b) + return g + g<<2 +} + +// cscimm2g4 converts the 2 vector element indices into a single +// uint8 for use as an immediate, but with four copies for VSHUFPD +// to emulate grouped behavior of VSHUFPS +func cscimm2g4(a, b uint8) uint8 { + g := cscimm2g2(a, b) + return g + g<<4 +} + +// SelectFromPair returns the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. 
When the selectors are constants the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant this will translate to a function +// call. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Uint64x2) SelectFromPair(a, b uint8, y Uint64x2) Uint64x2 { + pattern := (a&2)>>1 + (b & 2) + + a, b = a&1, b&1 + + switch pattern { + case _LL: + return x.concatSelectedConstant(cscimm2(a, b), x) + case _HH: + return y.concatSelectedConstant(cscimm2(a, b), y) + case _LH: + return x.concatSelectedConstant(cscimm2(a, b), y) + case _HL: + return y.concatSelectedConstant(cscimm2(a, b), x) + } + panic("missing case, switch should be exhaustive") +} + +// SelectFromPairGrouped returns, for each of the two 128-bit halves of +// the vectors x and y, the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant this will translate to a function +// call. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Uint64x4) SelectFromPairGrouped(a, b uint8, y Uint64x4) Uint64x4 { + pattern := (a&2)>>1 + (b & 2) + + a, b = a&1, b&1 + + switch pattern { + case _LL: + return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x) + case _HH: + return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y) + case _LH: + return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y) + case _HL: + return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x) + } + panic("missing case, switch should be exhaustive") +} + +// SelectFromPairGrouped returns, for each of the four 128-bit subvectors +// of the vectors x and y, the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. 
When the selectors are constants the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant this will translate to a function +// call. +// +// Asm: VSHUFPD, CPU Feature: AVX512 +func (x Uint64x8) SelectFromPairGrouped(a, b uint8, y Uint64x8) Uint64x8 { + pattern := (a&2)>>1 + (b & 2) + + a, b = a&1, b&1 + + switch pattern { + case _LL: + return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x) + case _HH: + return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y) + case _LH: + return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y) + case _HL: + return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x) + } + panic("missing case, switch should be exhaustive") +} + +// SelectFromPair returns the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant this will translate to a function +// call. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Float64x2) SelectFromPair(a, b uint8, y Float64x2) Float64x2 { + pattern := (a&2)>>1 + (b & 2) + + a, b = a&1, b&1 + + switch pattern { + case _LL: + return x.concatSelectedConstant(cscimm2(a, b), x) + case _HH: + return y.concatSelectedConstant(cscimm2(a, b), y) + case _LH: + return x.concatSelectedConstant(cscimm2(a, b), y) + case _HL: + return y.concatSelectedConstant(cscimm2(a, b), x) + } + panic("missing case, switch should be exhaustive") +} + +// SelectFromPairGrouped returns, for each of the two 128-bit halves of +// the vectors x and y, the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. 
When the selectors are constants the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant this will translate to a function +// call. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Float64x4) SelectFromPairGrouped(a, b uint8, y Float64x4) Float64x4 { + pattern := (a&2)>>1 + (b & 2) + + a, b = a&1, b&1 + + switch pattern { + case _LL: + return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x) + case _HH: + return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y) + case _LH: + return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y) + case _HL: + return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x) + } + panic("missing case, switch should be exhaustive") +} + +// SelectFromPairGrouped returns, for each of the four 128-bit subvectors +// of the vectors x and y, the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant this will translate to a function +// call. +// +// Asm: VSHUFPD, CPU Feature: AVX512 +func (x Float64x8) SelectFromPairGrouped(a, b uint8, y Float64x8) Float64x8 { + pattern := (a&2)>>1 + (b & 2) + + a, b = a&1, b&1 + + switch pattern { + case _LL: + return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x) + case _HH: + return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y) + case _LH: + return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y) + case _HL: + return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x) + } + panic("missing case, switch should be exhaustive") +} + +// SelectFromPair returns the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. 
When the selectors are constants the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant this will translate to a function +// call. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Int64x2) SelectFromPair(a, b uint8, y Int64x2) Int64x2 { + pattern := (a&2)>>1 + (b & 2) + + a, b = a&1, b&1 + + switch pattern { + case _LL: + return x.concatSelectedConstant(cscimm2(a, b), x) + case _HH: + return y.concatSelectedConstant(cscimm2(a, b), y) + case _LH: + return x.concatSelectedConstant(cscimm2(a, b), y) + case _HL: + return y.concatSelectedConstant(cscimm2(a, b), x) + } + panic("missing case, switch should be exhaustive") +} + +// SelectFromPairGrouped returns, for each of the two 128-bit halves of +// the vectors x and y, the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant this will translate to a function +// call. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Int64x4) SelectFromPairGrouped(a, b uint8, y Int64x4) Int64x4 { + pattern := (a&2)>>1 + (b & 2) + + a, b = a&1, b&1 + + switch pattern { + case _LL: + return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x) + case _HH: + return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y) + case _LH: + return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y) + case _HL: + return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x) + } + panic("missing case, switch should be exhaustive") +} + +// SelectFromPairGrouped returns, for each of the four 128-bit subvectors +// of the vectors x and y, the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. 
When the selectors are constants the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant this will translate to a function +// call. +// +// Asm: VSHUFPD, CPU Feature: AVX512 +func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 { + pattern := (a&2)>>1 + (b & 2) + + a, b = a&1, b&1 + + switch pattern { + case _LL: + return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x) + case _HH: + return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y) + case _LH: + return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y) + case _HL: + return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x) + } + panic("missing case, switch should be exhaustive") +} |
