aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Chase <drchase@google.com>2025-07-24 10:31:46 -0400
committerDavid Chase <drchase@google.com>2025-08-01 14:27:47 -0700
commitc25e5c86b2da8117b2d5c934b368ecbcf8e2efd5 (patch)
tree7ea310c0b603638cfd812326728ceca430423747
parent1ac5f3533f9dccb0f2fd9f21f833a76e68378ea7 (diff)
downloadgo-c25e5c86b2da8117b2d5c934b368ecbcf8e2efd5.tar.xz
[dev.simd] cmd/compile: generated code for K-mask-register slice load/stores
plus slice-part load, store and test for a single type. Generated by arch/internal/simdgen CL 690315 Change-Id: I58052728b544c4a772a2870ac68f3c832813e1ea Reviewed-on: https://go-review.googlesource.com/c/go/+/690336 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Junyang Shao <shaojunyang@google.com>
-rw-r--r--src/cmd/compile/internal/ssagen/simdintrinsics.go28
-rw-r--r--src/simd/slicepart_amd64.go45
-rw-r--r--src/simd/slicepart_test.go47
-rw-r--r--src/simd/types_amd64.go232
4 files changed, 352 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index dddfab5b71..a30144cbd1 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -2148,26 +2148,54 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Float32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+ addF(simdPackage, "LoadMaskedFloat32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+ addF(simdPackage, "Float32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Float64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Float64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+ addF(simdPackage, "LoadMaskedFloat64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+ addF(simdPackage, "Float64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+ addF(simdPackage, "LoadMaskedInt8x64", simdMaskedLoad(ssa.OpLoadMasked8), sys.AMD64)
+ addF(simdPackage, "Int8x64.StoreMasked", simdMaskedStore(ssa.OpStoreMasked8), sys.AMD64)
+ addF(simdPackage, "LoadMaskedInt16x32", simdMaskedLoad(ssa.OpLoadMasked16), sys.AMD64)
+ addF(simdPackage, "Int16x32.StoreMasked", simdMaskedStore(ssa.OpStoreMasked16), sys.AMD64)
addF(simdPackage, "LoadMaskedInt32x4", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Int32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedInt32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Int32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+ addF(simdPackage, "LoadMaskedInt32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+ addF(simdPackage, "Int32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedInt64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Int64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedInt64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Int64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+ addF(simdPackage, "LoadMaskedInt64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+ addF(simdPackage, "Int64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+ addF(simdPackage, "LoadMaskedUint8x64", simdMaskedLoad(ssa.OpLoadMasked8), sys.AMD64)
+ addF(simdPackage, "Uint8x64.StoreMasked", simdMaskedStore(ssa.OpStoreMasked8), sys.AMD64)
+ addF(simdPackage, "LoadMaskedUint16x32", simdMaskedLoad(ssa.OpLoadMasked16), sys.AMD64)
+ addF(simdPackage, "Uint16x32.StoreMasked", simdMaskedStore(ssa.OpStoreMasked16), sys.AMD64)
addF(simdPackage, "LoadMaskedUint32x4", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Uint32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedUint32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Uint32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+ addF(simdPackage, "LoadMaskedUint32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+ addF(simdPackage, "Uint32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedUint64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Uint64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedUint64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Uint64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+ addF(simdPackage, "LoadMaskedUint64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+ addF(simdPackage, "Uint64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+ addF(simdPackage, "LoadMaskedMask8x64", simdMaskedLoad(ssa.OpLoadMasked8), sys.AMD64)
+ addF(simdPackage, "Mask8x64.StoreMasked", simdMaskedStore(ssa.OpStoreMasked8), sys.AMD64)
+ addF(simdPackage, "LoadMaskedMask16x32", simdMaskedLoad(ssa.OpLoadMasked16), sys.AMD64)
+ addF(simdPackage, "Mask16x32.StoreMasked", simdMaskedStore(ssa.OpStoreMasked16), sys.AMD64)
+ addF(simdPackage, "LoadMaskedMask32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+ addF(simdPackage, "Mask32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+ addF(simdPackage, "LoadMaskedMask64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+ addF(simdPackage, "Mask64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "Mask8x16.AsInt8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x16.AsMask8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
diff --git a/src/simd/slicepart_amd64.go b/src/simd/slicepart_amd64.go
index 00025775be..3fcfc6255b 100644
--- a/src/simd/slicepart_amd64.go
+++ b/src/simd/slicepart_amd64.go
@@ -419,6 +419,24 @@ func paInt64x4(s []int64) *[4]int64 {
return (*[4]int64)(unsafe.Pointer(&s[0]))
}
+// For 512-bit masked loads/stores
+
+func paInt64x8(s []int64) *[8]int64 {
+ return (*[8]int64)(unsafe.Pointer(&s[0]))
+}
+
+func paInt32x16(s []int32) *[16]int32 {
+ return (*[16]int32)(unsafe.Pointer(&s[0]))
+}
+
+func paInt16x32(s []int16) *[32]int16 {
+ return (*[32]int16)(unsafe.Pointer(&s[0]))
+}
+
+func paInt8x64(s []int8) *[64]int8 {
+ return (*[64]int8)(unsafe.Pointer(&s[0]))
+}
+
/* 32 and 64-bit slice-part loads for AVX2 (128 and 256 bit) */
// LoadInt32x4SlicePart loads a Int32x4 from the slice s.
@@ -742,3 +760,30 @@ func (x Float64x4) StoreSlicePart(s []float64) {
t := unsafe.Slice((*int64)(unsafe.Pointer(&s[0])), len(s))
x.AsInt64x4().StoreSlicePart(t)
}
+
+func LoadInt64x8SlicePart(s []int64) Int64x8 {
+ l := len(s)
+ if l >= 8 {
+ return LoadInt64x8Slice(s)
+ }
+ if l == 0 {
+ var x Int64x8
+ return x
+ }
+
+ mask := Mask64x8FromBits(0xff >> (8 - l))
+ return LoadMaskedInt64x8(paInt64x8(s), mask)
+}
+
+func (x Int64x8) StoreSlicePart(s []int64) {
+ l := len(s)
+ if l >= 8 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := Mask64x8FromBits(0xff >> (8 - l))
+ x.StoreMasked(paInt64x8(s), mask)
+}
diff --git a/src/simd/slicepart_test.go b/src/simd/slicepart_test.go
index cfdb7581d9..c9492bea1b 100644
--- a/src/simd/slicepart_test.go
+++ b/src/simd/slicepart_test.go
@@ -341,3 +341,50 @@ func TestSlicePartFloat32(t *testing.T) {
}
}
}
+
+// 512-bit load
+
+func TestSlicePartInt64(t *testing.T) {
+ if !simd.HasAVX512() {
+ t.Skip("Test requires HasAVX512, not available on this hardware")
+ return
+ }
+
+ L := 8
+ c := []int64{1, 2, 3, 4, 5, 6, 7, 8, 86, 86, 86, 86}
+ a := c[:L+1]
+ for i := range a {
+ // Test the load first
+ // e is a partial slice.
+ e := a[i:]
+ v := simd.LoadInt64x8SlicePart(e)
+ // d contains what a ought to contain
+ d := make([]int64, L)
+ for j := 0; j < len(e) && j < len(d); j++ {
+ d[j] = e[j]
+ }
+
+ b := make([]int64, L)
+ v.StoreSlice(b)
+ // test the load
+ checkSlicesLogInput(t, b, d, func() { t.Helper(); t.Logf("Len(e)=%d", len(e)) })
+
+ // Test the store
+ f := make([]int64, L+1)
+ for i := range f {
+ f[i] = 99
+ }
+
+ v.StoreSlicePart(f[:len(e)])
+ if len(e) < len(b) {
+ checkSlices(t, f, b[:len(e)])
+ } else {
+ checkSlices(t, f, b)
+ }
+ for i := len(e); i < len(f); i++ {
+ if f[i] != 99 {
+ t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", i, f[i])
+ }
+ }
+ }
+}
diff --git a/src/simd/types_amd64.go b/src/simd/types_amd64.go
index 252da021e2..ac8cf3c210 100644
--- a/src/simd/types_amd64.go
+++ b/src/simd/types_amd64.go
@@ -31,12 +31,16 @@ func (x Float32x4) Store(y *[4]float32)
// LoadMaskedFloat32x4 loads a Float32x4 from an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
//go:noescape
func LoadMaskedFloat32x4(y *[4]float32, mask Mask32x4) Float32x4
// StoreMasked stores a Float32x4 to an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
//go:noescape
func (x Float32x4) StoreMasked(y *[4]float32, mask Mask32x4)
@@ -62,12 +66,16 @@ func (x Float64x2) Store(y *[2]float64)
// LoadMaskedFloat64x2 loads a Float64x2 from an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
//go:noescape
func LoadMaskedFloat64x2(y *[2]float64, mask Mask64x2) Float64x2
// StoreMasked stores a Float64x2 to an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
//go:noescape
func (x Float64x2) StoreMasked(y *[2]float64, mask Mask64x2)
@@ -131,12 +139,16 @@ func (x Int32x4) Store(y *[4]int32)
// LoadMaskedInt32x4 loads a Int32x4 from an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
//go:noescape
func LoadMaskedInt32x4(y *[4]int32, mask Mask32x4) Int32x4
// StoreMasked stores a Int32x4 to an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
//go:noescape
func (x Int32x4) StoreMasked(y *[4]int32, mask Mask32x4)
@@ -162,12 +174,16 @@ func (x Int64x2) Store(y *[2]int64)
// LoadMaskedInt64x2 loads a Int64x2 from an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
//go:noescape
func LoadMaskedInt64x2(y *[2]int64, mask Mask64x2) Int64x2
// StoreMasked stores a Int64x2 to an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
//go:noescape
func (x Int64x2) StoreMasked(y *[2]int64, mask Mask64x2)
@@ -231,12 +247,16 @@ func (x Uint32x4) Store(y *[4]uint32)
// LoadMaskedUint32x4 loads a Uint32x4 from an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
//go:noescape
func LoadMaskedUint32x4(y *[4]uint32, mask Mask32x4) Uint32x4
// StoreMasked stores a Uint32x4 to an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
//go:noescape
func (x Uint32x4) StoreMasked(y *[4]uint32, mask Mask32x4)
@@ -262,12 +282,16 @@ func (x Uint64x2) Store(y *[2]uint64)
// LoadMaskedUint64x2 loads a Uint64x2 from an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
//go:noescape
func LoadMaskedUint64x2(y *[2]uint64, mask Mask64x2) Uint64x2
// StoreMasked stores a Uint64x2 to an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
//go:noescape
func (x Uint64x2) StoreMasked(y *[2]uint64, mask Mask64x2)
@@ -295,6 +319,8 @@ func (x Mask8x16) StoreToBits(y *uint64)
// Mask8x16FromBits constructs a Mask8x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
+//
+// Asm: KMOVB, CPU Feature: AVX512
func Mask8x16FromBits(y uint16) Mask8x16
// Mask16x8 is a 128-bit SIMD vector of 8 int16
@@ -321,6 +347,8 @@ func (x Mask16x8) StoreToBits(y *uint64)
// Mask16x8FromBits constructs a Mask16x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
+//
+// Asm: KMOVW, CPU Feature: AVX512
func Mask16x8FromBits(y uint8) Mask16x8
// Mask32x4 is a 128-bit SIMD vector of 4 int32
@@ -347,6 +375,8 @@ func (x Mask32x4) StoreToBits(y *uint64)
// Mask32x4FromBits constructs a Mask32x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
+//
+// Asm: KMOVD, CPU Feature: AVX512
func Mask32x4FromBits(y uint8) Mask32x4
// Mask64x2 is a 128-bit SIMD vector of 2 int64
@@ -373,6 +403,8 @@ func (x Mask64x2) StoreToBits(y *uint64)
// Mask64x2FromBits constructs a Mask64x2 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
+//
+// Asm: KMOVQ, CPU Feature: AVX512
func Mask64x2FromBits(y uint8) Mask64x2
// v256 is a tag type that tells the compiler that this is really 256-bit SIMD
@@ -402,12 +434,16 @@ func (x Float32x8) Store(y *[8]float32)
// LoadMaskedFloat32x8 loads a Float32x8 from an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
//go:noescape
func LoadMaskedFloat32x8(y *[8]float32, mask Mask32x8) Float32x8
// StoreMasked stores a Float32x8 to an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
//go:noescape
func (x Float32x8) StoreMasked(y *[8]float32, mask Mask32x8)
@@ -433,12 +469,16 @@ func (x Float64x4) Store(y *[4]float64)
// LoadMaskedFloat64x4 loads a Float64x4 from an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
//go:noescape
func LoadMaskedFloat64x4(y *[4]float64, mask Mask64x4) Float64x4
// StoreMasked stores a Float64x4 to an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
//go:noescape
func (x Float64x4) StoreMasked(y *[4]float64, mask Mask64x4)
@@ -502,12 +542,16 @@ func (x Int32x8) Store(y *[8]int32)
// LoadMaskedInt32x8 loads a Int32x8 from an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
//go:noescape
func LoadMaskedInt32x8(y *[8]int32, mask Mask32x8) Int32x8
// StoreMasked stores a Int32x8 to an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
//go:noescape
func (x Int32x8) StoreMasked(y *[8]int32, mask Mask32x8)
@@ -533,12 +577,16 @@ func (x Int64x4) Store(y *[4]int64)
// LoadMaskedInt64x4 loads a Int64x4 from an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
//go:noescape
func LoadMaskedInt64x4(y *[4]int64, mask Mask64x4) Int64x4
// StoreMasked stores a Int64x4 to an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
//go:noescape
func (x Int64x4) StoreMasked(y *[4]int64, mask Mask64x4)
@@ -602,12 +650,16 @@ func (x Uint32x8) Store(y *[8]uint32)
// LoadMaskedUint32x8 loads a Uint32x8 from an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
//go:noescape
func LoadMaskedUint32x8(y *[8]uint32, mask Mask32x8) Uint32x8
// StoreMasked stores a Uint32x8 to an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
//go:noescape
func (x Uint32x8) StoreMasked(y *[8]uint32, mask Mask32x8)
@@ -633,12 +685,16 @@ func (x Uint64x4) Store(y *[4]uint64)
// LoadMaskedUint64x4 loads a Uint64x4 from an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
//go:noescape
func LoadMaskedUint64x4(y *[4]uint64, mask Mask64x4) Uint64x4
// StoreMasked stores a Uint64x4 to an array,
// at those elements enabled by mask
//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
//go:noescape
func (x Uint64x4) StoreMasked(y *[4]uint64, mask Mask64x4)
@@ -666,6 +722,8 @@ func (x Mask8x32) StoreToBits(y *uint64)
// Mask8x32FromBits constructs a Mask8x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
+//
+// Asm: KMOVB, CPU Feature: AVX512
func Mask8x32FromBits(y uint32) Mask8x32
// Mask16x16 is a 256-bit SIMD vector of 16 int16
@@ -692,6 +750,8 @@ func (x Mask16x16) StoreToBits(y *uint64)
// Mask16x16FromBits constructs a Mask16x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
+//
+// Asm: KMOVW, CPU Feature: AVX512
func Mask16x16FromBits(y uint16) Mask16x16
// Mask32x8 is a 256-bit SIMD vector of 8 int32
@@ -718,6 +778,8 @@ func (x Mask32x8) StoreToBits(y *uint64)
// Mask32x8FromBits constructs a Mask32x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
+//
+// Asm: KMOVD, CPU Feature: AVX512
func Mask32x8FromBits(y uint8) Mask32x8
// Mask64x4 is a 256-bit SIMD vector of 4 int64
@@ -744,6 +806,8 @@ func (x Mask64x4) StoreToBits(y *uint64)
// Mask64x4FromBits constructs a Mask64x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
+//
+// Asm: KMOVQ, CPU Feature: AVX512
func Mask64x4FromBits(y uint8) Mask64x4
// v512 is a tag type that tells the compiler that this is really 512-bit SIMD
@@ -770,6 +834,22 @@ func LoadFloat32x16(y *[16]float32) Float32x16
//go:noescape
func (x Float32x16) Store(y *[16]float32)
+// LoadMaskedFloat32x16 loads a Float32x16 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedFloat32x16(y *[16]float32, mask Mask32x16) Float32x16
+
+// StoreMasked stores a Float32x16 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32, CPU Feature: AVX512
+//
+//go:noescape
+func (x Float32x16) StoreMasked(y *[16]float32, mask Mask32x16)
+
// Float64x8 is a 512-bit SIMD vector of 8 float64
type Float64x8 struct {
float64x8 v512
@@ -789,6 +869,22 @@ func LoadFloat64x8(y *[8]float64) Float64x8
//go:noescape
func (x Float64x8) Store(y *[8]float64)
+// LoadMaskedFloat64x8 loads a Float64x8 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedFloat64x8(y *[8]float64, mask Mask64x8) Float64x8
+
+// StoreMasked stores a Float64x8 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64, CPU Feature: AVX512
+//
+//go:noescape
+func (x Float64x8) StoreMasked(y *[8]float64, mask Mask64x8)
+
// Int8x64 is a 512-bit SIMD vector of 64 int8
type Int8x64 struct {
int8x64 v512
@@ -808,6 +904,22 @@ func LoadInt8x64(y *[64]int8) Int8x64
//go:noescape
func (x Int8x64) Store(y *[64]int8)
+// LoadMaskedInt8x64 loads a Int8x64 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU8.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedInt8x64(y *[64]int8, mask Mask8x64) Int8x64
+
+// StoreMasked stores a Int8x64 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU8, CPU Feature: AVX512
+//
+//go:noescape
+func (x Int8x64) StoreMasked(y *[64]int8, mask Mask8x64)
+
// Int16x32 is a 512-bit SIMD vector of 32 int16
type Int16x32 struct {
int16x32 v512
@@ -827,6 +939,22 @@ func LoadInt16x32(y *[32]int16) Int16x32
//go:noescape
func (x Int16x32) Store(y *[32]int16)
+// LoadMaskedInt16x32 loads a Int16x32 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU16.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedInt16x32(y *[32]int16, mask Mask16x32) Int16x32
+
+// StoreMasked stores a Int16x32 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU16, CPU Feature: AVX512
+//
+//go:noescape
+func (x Int16x32) StoreMasked(y *[32]int16, mask Mask16x32)
+
// Int32x16 is a 512-bit SIMD vector of 16 int32
type Int32x16 struct {
int32x16 v512
@@ -846,6 +974,22 @@ func LoadInt32x16(y *[16]int32) Int32x16
//go:noescape
func (x Int32x16) Store(y *[16]int32)
+// LoadMaskedInt32x16 loads a Int32x16 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedInt32x16(y *[16]int32, mask Mask32x16) Int32x16
+
+// StoreMasked stores a Int32x16 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32, CPU Feature: AVX512
+//
+//go:noescape
+func (x Int32x16) StoreMasked(y *[16]int32, mask Mask32x16)
+
// Int64x8 is a 512-bit SIMD vector of 8 int64
type Int64x8 struct {
int64x8 v512
@@ -865,6 +1009,22 @@ func LoadInt64x8(y *[8]int64) Int64x8
//go:noescape
func (x Int64x8) Store(y *[8]int64)
+// LoadMaskedInt64x8 loads a Int64x8 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedInt64x8(y *[8]int64, mask Mask64x8) Int64x8
+
+// StoreMasked stores a Int64x8 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64, CPU Feature: AVX512
+//
+//go:noescape
+func (x Int64x8) StoreMasked(y *[8]int64, mask Mask64x8)
+
// Uint8x64 is a 512-bit SIMD vector of 64 uint8
type Uint8x64 struct {
uint8x64 v512
@@ -884,6 +1044,22 @@ func LoadUint8x64(y *[64]uint8) Uint8x64
//go:noescape
func (x Uint8x64) Store(y *[64]uint8)
+// LoadMaskedUint8x64 loads a Uint8x64 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU8.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedUint8x64(y *[64]uint8, mask Mask8x64) Uint8x64
+
+// StoreMasked stores a Uint8x64 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU8, CPU Feature: AVX512
+//
+//go:noescape
+func (x Uint8x64) StoreMasked(y *[64]uint8, mask Mask8x64)
+
// Uint16x32 is a 512-bit SIMD vector of 32 uint16
type Uint16x32 struct {
uint16x32 v512
@@ -903,6 +1079,22 @@ func LoadUint16x32(y *[32]uint16) Uint16x32
//go:noescape
func (x Uint16x32) Store(y *[32]uint16)
+// LoadMaskedUint16x32 loads a Uint16x32 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU16.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedUint16x32(y *[32]uint16, mask Mask16x32) Uint16x32
+
+// StoreMasked stores a Uint16x32 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU16, CPU Feature: AVX512
+//
+//go:noescape
+func (x Uint16x32) StoreMasked(y *[32]uint16, mask Mask16x32)
+
// Uint32x16 is a 512-bit SIMD vector of 16 uint32
type Uint32x16 struct {
uint32x16 v512
@@ -922,6 +1114,22 @@ func LoadUint32x16(y *[16]uint32) Uint32x16
//go:noescape
func (x Uint32x16) Store(y *[16]uint32)
+// LoadMaskedUint32x16 loads a Uint32x16 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedUint32x16(y *[16]uint32, mask Mask32x16) Uint32x16
+
+// StoreMasked stores a Uint32x16 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32, CPU Feature: AVX512
+//
+//go:noescape
+func (x Uint32x16) StoreMasked(y *[16]uint32, mask Mask32x16)
+
// Uint64x8 is a 512-bit SIMD vector of 8 uint64
type Uint64x8 struct {
uint64x8 v512
@@ -941,6 +1149,22 @@ func LoadUint64x8(y *[8]uint64) Uint64x8
//go:noescape
func (x Uint64x8) Store(y *[8]uint64)
+// LoadMaskedUint64x8 loads a Uint64x8 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedUint64x8(y *[8]uint64, mask Mask64x8) Uint64x8
+
+// StoreMasked stores a Uint64x8 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64, CPU Feature: AVX512
+//
+//go:noescape
+func (x Uint64x8) StoreMasked(y *[8]uint64, mask Mask64x8)
+
// Mask8x64 is a 512-bit SIMD vector of 64 int8
type Mask8x64 struct {
int8x64 v512
@@ -965,6 +1189,8 @@ func (x Mask8x64) StoreToBits(y *uint64)
// Mask8x64FromBits constructs a Mask8x64 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
+//
+// Asm: KMOVB, CPU Feature: AVX512
func Mask8x64FromBits(y uint64) Mask8x64
// Mask16x32 is a 512-bit SIMD vector of 32 int16
@@ -991,6 +1217,8 @@ func (x Mask16x32) StoreToBits(y *uint64)
// Mask16x32FromBits constructs a Mask16x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
+//
+// Asm: KMOVW, CPU Feature: AVX512
func Mask16x32FromBits(y uint32) Mask16x32
// Mask32x16 is a 512-bit SIMD vector of 16 int32
@@ -1017,6 +1245,8 @@ func (x Mask32x16) StoreToBits(y *uint64)
// Mask32x16FromBits constructs a Mask32x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
+//
+// Asm: KMOVD, CPU Feature: AVX512
func Mask32x16FromBits(y uint16) Mask32x16
// Mask64x8 is a 512-bit SIMD vector of 8 int64
@@ -1043,4 +1273,6 @@ func (x Mask64x8) StoreToBits(y *uint64)
// Mask64x8FromBits constructs a Mask64x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
+//
+// Asm: KMOVQ, CPU Feature: AVX512
func Mask64x8FromBits(y uint8) Mask64x8