diff options
| author | Junyang Shao <shaojunyang@google.com> | 2025-08-21 20:37:57 +0000 |
|---|---|---|
| committer | Junyang Shao <shaojunyang@google.com> | 2025-08-22 09:10:28 -0700 |
| commit | baea0c700b70d90331be3370f89991d7428d92aa (patch) | |
| tree | cd1843d625da8ffc686625ab4828e98f5dbc2dfc /src/simd | |
| parent | fa1e78c9adf6377fd2797ee50cb8210f0bd34781 (diff) | |
| download | go-baea0c700b70d90331be3370f89991d7428d92aa.tar.xz | |
[dev.simd] cmd/compile, simd: complete AVX2? u?int shuffles
The names follow this convention:
- If its indices come from a constant, append "Constant" to the name.
- If its indices are used by multiple groups, append "Grouped" to the
name.
- If it indexes only the low part, append "Lo"; similarly, append "Hi" for the high part.
Change-Id: I6a58f5dae54c882ebd59f39b5288f6f3f14d957f
Reviewed-on: https://go-review.googlesource.com/c/go/+/698296
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Diffstat (limited to 'src/simd')
| -rw-r--r-- | src/simd/_gen/simdgen/ops/Moves/categories.yaml | 30 | ||||
| -rw-r--r-- | src/simd/_gen/simdgen/ops/Moves/go.yaml | 96 | ||||
| -rw-r--r-- | src/simd/ops_amd64.go | 260 |
3 files changed, 384 insertions, 2 deletions
diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml index a576829e8f..556562b51a 100644 --- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml @@ -74,4 +74,32 @@ commutative: false documentation: !string |- // NAME copies element zero of its (128-bit) input to all elements of - // the 512-bit output vector.
\ No newline at end of file + // the 512-bit output vector. +- go: PermuteGrouped + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a grouped permutation of vector x using indices: +- go: PermuteConstant + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a permutation of vector x using constant indices: +- go: PermuteConstantGrouped + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a grouped permutation of vector x using constant indices: +- go: PermuteConstantLo + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a permutation of vector x using constant indices: +- go: PermuteConstantLoGrouped + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a grouped permutation of vector x using constant indices: +- go: PermuteConstantHi + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a permutation of vector x using constant indices: +- go: PermuteConstantHiGrouped + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a grouped permutation of vector x using constant indices:
\ No newline at end of file diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index 3cdb9efe27..3d471ec480 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -432,4 +432,98 @@ go: $t name: indices out: - - *128any
\ No newline at end of file + - *128any +- go: PermuteGrouped + asm: VPSHUFB + addDoc: !string |- + // result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} + // Only the needed bits to represent the index of a group of x are used in indices' elements. + // However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. + // Each group is of size 128-bit. + in: + - &256Or512any + bits: "256|512" + go: $t + - bits: "256|512" + go: $t + name: indices + out: + - *256Or512any + +- go: PermuteConstant + asm: VPSHUFD + addDoc: !string |- + // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} + // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + in: + - *128any + - class: immediate + immOffset: 0 + name: indices + out: + - *128any +- go: PermuteConstantGrouped + asm: VPSHUFD + addDoc: !string |- + // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} + // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // Each group is of size 128-bit. + in: + - *256Or512any + - class: immediate + immOffset: 0 + name: indices + out: + - *256Or512any + +- go: PermuteConstantLo + asm: VPSHUFHW + addDoc: !string |- + // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} + // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. 
+ in: + - *128any + - class: immediate + immOffset: 0 + name: indices + out: + - *128any +- go: PermuteConstantLoGrouped + asm: VPSHUFHW + addDoc: !string |- + // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} + // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // Each group is of size 128-bit. + in: + - *256Or512any + - class: immediate + immOffset: 0 + name: indices + out: + - *256Or512any + +- go: PermuteConstantHi + asm: VPSHUFHW + addDoc: !string |- + // result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} + // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + in: + - *128any + - class: immediate + immOffset: 0 + name: indices + out: + - *128any +- go: PermuteConstantHiGrouped + asm: VPSHUFHW + addDoc: !string |- + // result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} + // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // Each group is of size 128-bit. + in: + - *256Or512any + - class: immediate + immOffset: 0 + name: indices + out: + - *256Or512any
\ No newline at end of file diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index e0e580bd27..e600f7c1a0 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -4564,6 +4564,266 @@ func (x Int64x8) Permute2(y Int64x8, indices Uint64x8) Int64x8 // Asm: VPERMI2Q, CPU Feature: AVX512 func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8 +/* PermuteConstant */ + +// PermuteConstant performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX +func (x Int32x4) PermuteConstant(indices uint8) Int32x4 + +// PermuteConstant performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX +func (x Uint32x4) PermuteConstant(indices uint8) Uint32x4 + +/* PermuteConstantGrouped */ + +// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFD, CPU Feature: AVX2 +func (x Int32x8) PermuteConstantGrouped(indices uint8) Int32x8 + +// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX512 +func (x Int32x16) PermuteConstantGrouped(indices uint8) Int32x16 + +// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX2 +func (x Uint32x8) PermuteConstantGrouped(indices uint8) Uint32x8 + +// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFD, CPU Feature: AVX512 +func (x Uint32x16) PermuteConstantGrouped(indices uint8) Uint32x16 + +/* PermuteConstantHi */ + +// PermuteConstantHi performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x8) PermuteConstantHi(indices uint8) Int16x8 + +// PermuteConstantHi performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX +func (x Int32x4) PermuteConstantHi(indices uint8) Int32x4 + +// PermuteConstantHi performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x8) PermuteConstantHi(indices uint8) Uint16x8 + +// PermuteConstantHi performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. 
+// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX +func (x Uint32x4) PermuteConstantHi(indices uint8) Uint32x4 + +/* PermuteConstantHiGrouped */ + +// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Int16x16) PermuteConstantHiGrouped(indices uint8) Int16x16 + +// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x32) PermuteConstantHiGrouped(indices uint8) Int16x32 + +// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. 
+// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Uint16x16) PermuteConstantHiGrouped(indices uint8) Uint16x16 + +// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x32) PermuteConstantHiGrouped(indices uint8) Uint16x32 + +/* PermuteConstantLo */ + +// PermuteConstantLo performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x8) PermuteConstantLo(indices uint8) Int16x8 + +// PermuteConstantLo performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFHW, CPU Feature: AVX +func (x Int32x4) PermuteConstantLo(indices uint8) Int32x4 + +// PermuteConstantLo performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x8) PermuteConstantLo(indices uint8) Uint16x8 + +// PermuteConstantLo performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX +func (x Uint32x4) PermuteConstantLo(indices uint8) Uint32x4 + +/* PermuteConstantLoGrouped */ + +// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Int16x16) PermuteConstantLoGrouped(indices uint8) Int16x16 + +// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x32) PermuteConstantLoGrouped(indices uint8) Int16x32 + +// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Uint16x16) PermuteConstantLoGrouped(indices uint8) Uint16x16 + +// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x32) PermuteConstantLoGrouped(indices uint8) Uint16x32 + +/* PermuteGrouped */ + +// PermuteGrouped performs a grouped permutation of vector x using indices: +// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// Only the needed bits to represent the index of a group of x are used in indices' elements. +// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// Each group is of size 128-bit. +// +// Asm: VPSHUFB, CPU Feature: AVX2 +func (x Int8x32) PermuteGrouped(indices Int8x32) Int8x32 + +// PermuteGrouped performs a grouped permutation of vector x using indices: +// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// Only the needed bits to represent the index of a group of x are used in indices' elements. +// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// Each group is of size 128-bit. +// +// Asm: VPSHUFB, CPU Feature: AVX512 +func (x Int8x64) PermuteGrouped(indices Int8x64) Int8x64 + +// PermuteGrouped performs a grouped permutation of vector x using indices: +// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// Only the needed bits to represent the index of a group of x are used in indices' elements. +// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// Each group is of size 128-bit. 
+// +// Asm: VPSHUFB, CPU Feature: AVX2 +func (x Uint8x32) PermuteGrouped(indices Uint8x32) Uint8x32 + +// PermuteGrouped performs a grouped permutation of vector x using indices: +// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// Only the needed bits to represent the index of a group of x are used in indices' elements. +// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// Each group is of size 128-bit. +// +// Asm: VPSHUFB, CPU Feature: AVX512 +func (x Uint8x64) PermuteGrouped(indices Uint8x64) Uint8x64 + /* Reciprocal */ // Reciprocal computes an approximate reciprocal of each element. |
