aboutsummaryrefslogtreecommitdiff
path: root/src/simd
diff options
context:
space:
mode:
authorCherry Mui <cherryyz@google.com>2026-01-05 12:56:08 -0500
committerCherry Mui <cherryyz@google.com>2026-01-05 12:22:51 -0800
commit9b2e3b9a02bd8872bdbf5c6086674fa6b4bc8ef9 (patch)
tree51a43dfc235315fbbc88226f0e52ab77cb628e46 /src/simd
parentf8ee0f84753b22254d217bf28ce8ecca7db7025c (diff)
downloadgo-9b2e3b9a02bd8872bdbf5c6086674fa6b4bc8ef9.tar.xz
simd/archsimd: use V(P)MOVMSK for mask ToBits if possible
VPMOVMSKB, VMOVMSKPS, and VMOVMSKPD move AVX1/2-style masks to integer registers, similar to VPMOV[BWDQ]2M (which moves to mask registers). The former are available on AVX1/2; the latter requires AVX512. So use the former if it is supported, i.e. for 128- and 256-bit vectors with 8-, 32-, and 64-bit elements (16-bit elements always require AVX512). Change-Id: I972195116617ed2faaf95cee5cd6b250e671496c Reviewed-on: https://go-review.googlesource.com/c/go/+/734060 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Junyang Shao <shaojunyang@google.com>
Diffstat (limited to 'src/simd')
-rw-r--r--src/simd/archsimd/_gen/simdgen/gen_simdTypes.go29
-rw-r--r--src/simd/archsimd/internal/simd_test/simd_test.go77
-rw-r--r--src/simd/archsimd/types_amd64.go12
3 files changed, 106 insertions, 12 deletions
diff --git a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go
index d8c4481296..dd3a75eb44 100644
--- a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go
+++ b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go
@@ -93,6 +93,33 @@ func (x simdType) MaskedStoreDoc() string {
}
}
+func (x simdType) ToBitsDoc() string {
+ if x.Size == 512 || x.ElemBits() == 16 {
+ return fmt.Sprintf("// Asm: KMOV%s, CPU Features: AVX512", x.IntelSizeSuffix())
+ }
+ // 128/256 bit vectors with 8, 32, 64 bit elements
+ var asm string
+ var feat string
+ switch x.ElemBits() {
+ case 8:
+ asm = "VPMOVMSKB"
+ if x.Size == 256 {
+ feat = "AVX2"
+ } else {
+ feat = "AVX"
+ }
+ case 32:
+ asm = "VMOVMSKPS"
+ feat = "AVX"
+ case 64:
+ asm = "VMOVMSKPD"
+ feat = "AVX"
+ default:
+ panic("unexpected ElemBits")
+ }
+ return fmt.Sprintf("// Asm: %s, CPU Features: %s", asm, feat)
+}
+
func compareSimdTypes(x, y simdType) int {
// "vreg" then "mask"
if c := -compareNatural(x.Type, y.Type); c != 0 {
@@ -210,7 +237,7 @@ func {{.Name}}FromBits(y uint{{.LanesContainer}}) {{.Name}}
// Only the lower {{.Lanes}} bits of y are used.
{{- end}}
//
-// Asm: KMOV{{.IntelSizeSuffix}}, CPU Features: AVX512
+{{.ToBitsDoc}}
func (x {{.Name}}) ToBits() uint{{.LanesContainer}}
`
diff --git a/src/simd/archsimd/internal/simd_test/simd_test.go b/src/simd/archsimd/internal/simd_test/simd_test.go
index 671ec05e79..36bde92455 100644
--- a/src/simd/archsimd/internal/simd_test/simd_test.go
+++ b/src/simd/archsimd/internal/simd_test/simd_test.go
@@ -379,12 +379,79 @@ func TestBitMaskFromBitsLoad(t *testing.T) {
}
func TestBitMaskToBits(t *testing.T) {
- if !archsimd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
+ int8s := []int8{
+ 0, 1, 1, 0, 0, 1, 0, 1,
+ 1, 0, 1, 1, 0, 0, 1, 0,
+ 1, 0, 0, 1, 1, 0, 1, 0,
+ 0, 1, 1, 0, 0, 1, 0, 1,
+ 1, 0, 0, 1, 0, 1, 1, 0,
+ 0, 1, 0, 1, 1, 0, 0, 1,
+ 1, 0, 1, 0, 0, 1, 1, 0,
+ 0, 1, 1, 0, 1, 0, 0, 1,
+ }
+ int16s := make([]int16, 32)
+ for i := range int16s {
+ int16s[i] = int16(int8s[i])
+ }
+ int32s := make([]int32, 16)
+ for i := range int32s {
+ int32s[i] = int32(int8s[i])
+ }
+ int64s := make([]int64, 8)
+ for i := range int64s {
+ int64s[i] = int64(int8s[i])
+ }
+ want64 := uint64(0)
+ for i := range int8s {
+ want64 |= uint64(int8s[i]) << i
+ }
+ want32 := uint32(want64)
+ want16 := uint16(want64)
+ want8 := uint8(want64)
+ want4 := want8 & 0b1111
+ want2 := want4 & 0b11
+
+ if v := archsimd.LoadInt8x16Slice(int8s[:16]).ToMask().ToBits(); v != want16 {
+ t.Errorf("want %b, got %b", want16, v)
+ }
+ if v := archsimd.LoadInt32x4Slice(int32s[:4]).ToMask().ToBits(); v != want4 {
+ t.Errorf("want %b, got %b", want4, v)
+ }
+ if v := archsimd.LoadInt32x8Slice(int32s[:8]).ToMask().ToBits(); v != want8 {
+ t.Errorf("want %b, got %b", want8, v)
+ }
+ if v := archsimd.LoadInt64x2Slice(int64s[:2]).ToMask().ToBits(); v != want2 {
+ t.Errorf("want %b, got %b", want2, v)
+ }
+ if v := archsimd.LoadInt64x4Slice(int64s[:4]).ToMask().ToBits(); v != want4 {
+ t.Errorf("want %b, got %b", want4, v)
}
- if v := archsimd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits(); v != 0b101 {
- t.Errorf("Want 0b101, got %b", v)
+
+ if archsimd.X86.AVX2() {
+ if v := archsimd.LoadInt8x32Slice(int8s[:32]).ToMask().ToBits(); v != want32 {
+ t.Errorf("want %b, got %b", want32, v)
+ }
+ }
+
+ if archsimd.X86.AVX512() {
+ if v := archsimd.LoadInt8x64Slice(int8s).ToMask().ToBits(); v != want64 {
+ t.Errorf("want %b, got %b", want64, v)
+ }
+ if v := archsimd.LoadInt16x8Slice(int16s[:8]).ToMask().ToBits(); v != want8 {
+ t.Errorf("want %b, got %b", want8, v)
+ }
+ if v := archsimd.LoadInt16x16Slice(int16s[:16]).ToMask().ToBits(); v != want16 {
+ t.Errorf("want %b, got %b", want16, v)
+ }
+ if v := archsimd.LoadInt16x32Slice(int16s).ToMask().ToBits(); v != want32 {
+ t.Errorf("want %b, got %b", want32, v)
+ }
+ if v := archsimd.LoadInt32x16Slice(int32s).ToMask().ToBits(); v != want16 {
+ t.Errorf("want %b, got %b", want16, v)
+ }
+ if v := archsimd.LoadInt64x8Slice(int64s).ToMask().ToBits(); v != want8 {
+ t.Errorf("want %b, got %b", want8, v)
+ }
}
}
diff --git a/src/simd/archsimd/types_amd64.go b/src/simd/archsimd/types_amd64.go
index f39549c705..3d0a49dc09 100644
--- a/src/simd/archsimd/types_amd64.go
+++ b/src/simd/archsimd/types_amd64.go
@@ -308,7 +308,7 @@ func Mask8x16FromBits(y uint16) Mask8x16
// ToBits constructs a bitmap from a Mask8x16, where 1 means set for the indexed element, 0 means unset.
//
-// Asm: KMOVB, CPU Features: AVX512
+// Asm: VPMOVMSKB, CPU Features: AVX
func (x Mask8x16) ToBits() uint16
// Mask16x8 is a mask for a SIMD vector of 8 16-bit elements.
@@ -342,7 +342,7 @@ func Mask32x4FromBits(y uint8) Mask32x4
// ToBits constructs a bitmap from a Mask32x4, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
-// Asm: KMOVD, CPU Features: AVX512
+// Asm: VMOVMSKPS, CPU Features: AVX
func (x Mask32x4) ToBits() uint8
// Mask64x2 is a mask for a SIMD vector of 2 64-bit elements.
@@ -360,7 +360,7 @@ func Mask64x2FromBits(y uint8) Mask64x2
// ToBits constructs a bitmap from a Mask64x2, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
-// Asm: KMOVQ, CPU Features: AVX512
+// Asm: VMOVMSKPD, CPU Features: AVX
func (x Mask64x2) ToBits() uint8
// v256 is a tag type that tells the compiler that this is really 256-bit SIMD
@@ -667,7 +667,7 @@ func Mask8x32FromBits(y uint32) Mask8x32
// ToBits constructs a bitmap from a Mask8x32, where 1 means set for the indexed element, 0 means unset.
//
-// Asm: KMOVB, CPU Features: AVX512
+// Asm: VPMOVMSKB, CPU Features: AVX2
func (x Mask8x32) ToBits() uint32
// Mask16x16 is a mask for a SIMD vector of 16 16-bit elements.
@@ -699,7 +699,7 @@ func Mask32x8FromBits(y uint8) Mask32x8
// ToBits constructs a bitmap from a Mask32x8, where 1 means set for the indexed element, 0 means unset.
//
-// Asm: KMOVD, CPU Features: AVX512
+// Asm: VMOVMSKPS, CPU Features: AVX
func (x Mask32x8) ToBits() uint8
// Mask64x4 is a mask for a SIMD vector of 4 64-bit elements.
@@ -717,7 +717,7 @@ func Mask64x4FromBits(y uint8) Mask64x4
// ToBits constructs a bitmap from a Mask64x4, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
-// Asm: KMOVQ, CPU Features: AVX512
+// Asm: VMOVMSKPD, CPU Features: AVX
func (x Mask64x4) ToBits() uint8
// v512 is a tag type that tells the compiler that this is really 512-bit SIMD