[dev.simd] cmd/compile, simd: complete AVX2? u?int shuffles

The naming follows this convention: - If the indices come from a constant, append "Constant" to the name. - If the indices are used by multiple groups, append "Grouped" to the name. - If it indexes only the low part, append "Lo"; similarly, append "Hi" for the high part. Change-Id: I6a58f5dae54c882ebd59f39b5288f6f3f14d957f Reviewed-on: https://go-review.googlesource.com/c/go/+/698296 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
author: Junyang Shao <shaojunyang@google.com> 2025-08-21 20:37:57 +0000
committer: Junyang Shao <shaojunyang@google.com> 2025-08-22 09:10:28 -0700
commit: baea0c700b70d90331be3370f89991d7428d92aa (patch)
tree: cd1843d625da8ffc686625ab4828e98f5dbc2dfc /src/simd
parent: fa1e78c9adf6377fd2797ee50cb8210f0bd34781 (diff)
download: go-baea0c700b70d90331be3370f89991d7428d92aa.tar.xz
3 files changed, 384 insertions, 2 deletions
diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml
index a576829e8f..556562b51a 100644
--- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml
@@ -74,4 +74,32 @@
   commutative: false
   documentation: !string |-
     // NAME copies element zero of its (128-bit) input to all elements of
-    // the 512-bit output vector.
-\ No newline at end of file
+    // the 512-bit output vector.
+- go: PermuteGrouped
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a grouped permutation of vector x using indices:
+- go: PermuteConstant
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a permutation of vector x using constant indices:
+- go: PermuteConstantGrouped
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a grouped permutation of vector x using constant indices:
+- go: PermuteConstantLo
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a permutation of vector x using constant indices:
+- go: PermuteConstantLoGrouped
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a grouped permutation of vector x using constant indices:
+- go: PermuteConstantHi
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a permutation of vector x using constant indices:
+- go: PermuteConstantHiGrouped
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a grouped permutation of vector x using constant indices:
+\ No newline at end of file
diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml
index 3cdb9efe27..3d471ec480 100644
--- a/src/simd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml
@@ -432,4 +432,98 @@
     go: $t
     name: indices
   out:
-  - *128any
-\ No newline at end of file
+  - *128any
+- go: PermuteGrouped
+  asm: VPSHUFB
+  addDoc: !string |-
+    // result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+    // Only the needed bits to represent the index of a group of x are used in indices' elements.
+    // However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
+    // Each group is of size 128-bit.
+  in:
+  - &256Or512any
+    bits: "256|512"
+    go: $t
+  - bits: "256|512"
+    go: $t
+    name: indices
+  out:
+  - *256Or512any
+
+- go: PermuteConstant
+  asm: VPSHUFD
+  addDoc: !string |-
+    // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+    // Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+  in:
+  - *128any
+  - class: immediate
+    immOffset: 0
+    name: indices
+  out:
+  - *128any
+- go: PermuteConstantGrouped
+  asm: VPSHUFD
+  addDoc: !string |-
+    // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+    // Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+    // Each group is of size 128-bit.
+  in:
+  - *256Or512any
+  - class: immediate
+    immOffset: 0
+    name: indices
+  out:
+  - *256Or512any
+
+- go: PermuteConstantLo
+  asm: VPSHUFLW # low-word shuffle; VPSHUFHW is the high-word form and belongs to PermuteConstantHi
+  addDoc: !string |-
+    // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+    // Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+  in:
+  - *128any
+  - class: immediate
+    immOffset: 0
+    name: indices
+  out:
+  - *128any
+- go: PermuteConstantLoGrouped
+  asm: VPSHUFLW # low-word shuffle; VPSHUFHW is the high-word form and belongs to PermuteConstantHiGrouped
+  addDoc: !string |-
+    // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+    // Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+    // Each group is of size 128-bit.
+  in:
+  - *256Or512any
+  - class: immediate
+    immOffset: 0
+    name: indices
+  out:
+  - *256Or512any
+
+- go: PermuteConstantHi
+  asm: VPSHUFHW
+  addDoc: !string |-
+    // result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+    // Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+  in:
+  - *128any
+  - class: immediate
+    immOffset: 0
+    name: indices
+  out:
+  - *128any
+- go: PermuteConstantHiGrouped
+  asm: VPSHUFHW
+  addDoc: !string |-
+    // result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
+    // Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+    // Each group is of size 128-bit.
+  in:
+  - *256Or512any
+  - class: immediate
+    immOffset: 0
+    name: indices
+  out:
+  - *256Or512any
+\ No newline at end of file
diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go
index e0e580bd27..e600f7c1a0 100644
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@@ -4564,6 +4564,266 @@ func (x Int64x8) Permute2(y Int64x8, indices Uint64x8) Int64x8
 // Asm: VPERMI2Q, CPU Feature: AVX512
 func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8
 
+/* PermuteConstant */
+
+// PermuteConstant performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Int32x4) PermuteConstant(indices uint8) Int32x4
+
+// PermuteConstant performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Uint32x4) PermuteConstant(indices uint8) Uint32x4
+
+/* PermuteConstantGrouped */
+
+// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Int32x8) PermuteConstantGrouped(indices uint8) Int32x8
+
+// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Int32x16) PermuteConstantGrouped(indices uint8) Int32x16
+
+// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Uint32x8) PermuteConstantGrouped(indices uint8) Uint32x8
+
+// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Uint32x16) PermuteConstantGrouped(indices uint8) Uint32x16
+
+/* PermuteConstantHi */
+
+// PermuteConstantHi performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x8) PermuteConstantHi(indices uint8) Int16x8
+
+// PermuteConstantHi performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX
+func (x Int32x4) PermuteConstantHi(indices uint8) Int32x4
+
+// PermuteConstantHi performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x8) PermuteConstantHi(indices uint8) Uint16x8
+
+// PermuteConstantHi performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX
+func (x Uint32x4) PermuteConstantHi(indices uint8) Uint32x4
+
+/* PermuteConstantHiGrouped */
+
+// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Int16x16) PermuteConstantHiGrouped(indices uint8) Int16x16
+
+// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x32) PermuteConstantHiGrouped(indices uint8) Int16x32
+
+// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Uint16x16) PermuteConstantHiGrouped(indices uint8) Uint16x16
+
+// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x32) PermuteConstantHiGrouped(indices uint8) Uint16x32
+
+/* PermuteConstantLo */
+
+// PermuteConstantLo performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x8) PermuteConstantLo(indices uint8) Int16x8
+
+// PermuteConstantLo performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX
+func (x Int32x4) PermuteConstantLo(indices uint8) Int32x4
+
+// PermuteConstantLo performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x8) PermuteConstantLo(indices uint8) Uint16x8
+
+// PermuteConstantLo performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX
+func (x Uint32x4) PermuteConstantLo(indices uint8) Uint32x4
+
+/* PermuteConstantLoGrouped */
+
+// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Int16x16) PermuteConstantLoGrouped(indices uint8) Int16x16
+
+// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x32) PermuteConstantLoGrouped(indices uint8) Int16x32
+
+// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Uint16x16) PermuteConstantLoGrouped(indices uint8) Uint16x16
+
+// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices holds four 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x32) PermuteConstantLoGrouped(indices uint8) Uint16x32
+
+/* PermuteGrouped */
+
+// PermuteGrouped performs a grouped permutation of vector x using indices:
+// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// Only the needed bits to represent the index of a group of x are used in indices' elements.
+// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
+// Each group is of size 128-bit.
+//
+// Asm: VPSHUFB, CPU Feature: AVX2
+func (x Int8x32) PermuteGrouped(indices Int8x32) Int8x32
+
+// PermuteGrouped performs a grouped permutation of vector x using indices:
+// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// Only the needed bits to represent the index of a group of x are used in indices' elements.
+// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
+// Each group is of size 128-bit.
+//
+// Asm: VPSHUFB, CPU Feature: AVX512
+func (x Int8x64) PermuteGrouped(indices Int8x64) Int8x64
+
+// PermuteGrouped performs a grouped permutation of vector x using indices:
+// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// Only the needed bits to represent the index of a group of x are used in indices' elements.
+// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
+// Each group is of size 128-bit.
+//
+// Asm: VPSHUFB, CPU Feature: AVX2
+func (x Uint8x32) PermuteGrouped(indices Uint8x32) Uint8x32
+
+// PermuteGrouped performs a grouped permutation of vector x using indices:
+// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// Only the needed bits to represent the index of a group of x are used in indices' elements.
+// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
+// Each group is of size 128-bit.
+//
+// Asm: VPSHUFB, CPU Feature: AVX512
+func (x Uint8x64) PermuteGrouped(indices Uint8x64) Uint8x64
+
 /* Reciprocal */
 
 // Reciprocal computes an approximate reciprocal of each element.
author	Junyang Shao <shaojunyang@google.com>	2025-08-21 20:37:57 +0000
committer	Junyang Shao <shaojunyang@google.com>	2025-08-22 09:10:28 -0700
commit	baea0c700b70d90331be3370f89991d7428d92aa (patch)
tree	cd1843d625da8ffc686625ab4828e98f5dbc2dfc /src/simd
parent	fa1e78c9adf6377fd2797ee50cb8210f0bd34781 (diff)
download	go-baea0c700b70d90331be3370f89991d7428d92aa.tar.xz