simd/archsimd: use V(P)MOVMSK for mask ToBits if possible

VPMOVMSKB, VMOVMSKPS, and VMOVMSKPD move AVX1/2-style masks to integer registers, similar to VPMOV[BWDQ]2M (which move to mask registers). The former are available on AVX1/2; the latter require AVX512. So use the former where they are supported, i.e. for 128- and 256-bit vectors with 8-, 32-, and 64-bit elements (16-bit elements always require AVX512).

Change-Id: I972195116617ed2faaf95cee5cd6b250e671496c
Reviewed-on: https://go-review.googlesource.com/c/go/+/734060
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
author: Cherry Mui <cherryyz@google.com> 2026-01-05 12:56:08 -0500
committer: Cherry Mui <cherryyz@google.com> 2026-01-05 12:22:51 -0800
commit: 9b2e3b9a02bd8872bdbf5c6086674fa6b4bc8ef9 (patch)
tree: 51a43dfc235315fbbc88226f0e52ab77cb628e46 /src/cmd
parent: f8ee0f84753b22254d217bf28ce8ecca7db7025c (diff)
download: go-9b2e3b9a02bd8872bdbf5c6086674fa6b4bc8ef9.tar.xz
5 files changed, 136 insertions, 139 deletions
diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 5ddcb84c59..e9a566d759 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -1845,7 +1845,13 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		ssa.OpAMD64VPMOVVec32x16ToM,
 		ssa.OpAMD64VPMOVVec64x2ToM,
 		ssa.OpAMD64VPMOVVec64x4ToM,
-		ssa.OpAMD64VPMOVVec64x8ToM:
+		ssa.OpAMD64VPMOVVec64x8ToM,
+		ssa.OpAMD64VPMOVMSKB128,
+		ssa.OpAMD64VPMOVMSKB256,
+		ssa.OpAMD64VMOVMSKPS128,
+		ssa.OpAMD64VMOVMSKPS256,
+		ssa.OpAMD64VMOVMSKPD128,
+		ssa.OpAMD64VMOVMSKPD256:
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
 		p.From.Reg = simdReg(v.Args[0])
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
index 9cd23c6286..b49e85b53c 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -1679,21 +1679,21 @@
 (Cvt8toMask64x8 <t> x) => (VPMOVMToVec64x8 <types.TypeVec512> (KMOVBk <t> x))
 
 // masks to integers
-(CvtMask8x16to16 <t> x) => (KMOVWi <t> (VPMOVVec8x16ToM <types.TypeMask> x))
-(CvtMask8x32to32 <t> x) => (KMOVDi <t> (VPMOVVec8x32ToM <types.TypeMask> x))
-(CvtMask8x64to64 <t> x) => (KMOVQi <t> (VPMOVVec8x64ToM <types.TypeMask> x))
+(CvtMask8x16to16 ...) => (VPMOVMSKB128 ...)
+(CvtMask8x32to32 ...) => (VPMOVMSKB256 ...)
+(CvtMask8x64to64 x) => (KMOVQi (VPMOVVec8x64ToM <types.TypeMask> x))
 
-(CvtMask16x8to8 <t> x) => (KMOVBi <t> (VPMOVVec16x8ToM <types.TypeMask> x))
-(CvtMask16x16to16 <t> x) => (KMOVWi <t> (VPMOVVec16x16ToM <types.TypeMask> x))
-(CvtMask16x32to32 <t> x) => (KMOVDi <t> (VPMOVVec16x32ToM <types.TypeMask> x))
+(CvtMask16x8to8 x) => (KMOVBi (VPMOVVec16x8ToM <types.TypeMask> x))
+(CvtMask16x16to16 x) => (KMOVWi (VPMOVVec16x16ToM <types.TypeMask> x))
+(CvtMask16x32to32 x) => (KMOVDi (VPMOVVec16x32ToM <types.TypeMask> x))
 
-(CvtMask32x4to8 <t> x) => (KMOVBi <t> (VPMOVVec32x4ToM <types.TypeMask> x))
-(CvtMask32x8to8 <t> x) => (KMOVBi <t> (VPMOVVec32x8ToM <types.TypeMask> x))
-(CvtMask32x16to16 <t> x) => (KMOVWi <t> (VPMOVVec32x16ToM <types.TypeMask> x))
+(CvtMask32x4to8 ...) => (VMOVMSKPS128 ...)
+(CvtMask32x8to8 ...) => (VMOVMSKPS256 ...)
+(CvtMask32x16to16 x) => (KMOVWi (VPMOVVec32x16ToM <types.TypeMask> x))
 
-(CvtMask64x2to8 <t> x) => (KMOVBi <t> (VPMOVVec64x2ToM <types.TypeMask> x))
-(CvtMask64x4to8 <t> x) => (KMOVBi <t> (VPMOVVec64x4ToM <types.TypeMask> x))
-(CvtMask64x8to8 <t> x) => (KMOVBi <t> (VPMOVVec64x8ToM <types.TypeMask> x))
+(CvtMask64x2to8 ...) => (VMOVMSKPD128 ...)
+(CvtMask64x4to8 ...) => (VMOVMSKPD256 ...)
+(CvtMask64x8to8 x) => (KMOVBi (VPMOVVec64x8ToM <types.TypeMask> x))
 
 // optimizations
 (MOVBstore [off] {sym} ptr (KMOVBi mask) mem) => (KMOVBstore [off] {sym} ptr mask mem)
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
index 2fb4fdfc96..b13eb5aa21 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
@@ -1368,6 +1368,7 @@ func init() {
 		{name: "VPMASK64load512", argLength: 3, reg: vloadk, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},    // load from arg0+auxint+aux, arg1=k mask, arg2 = mem
 		{name: "VPMASK64store512", argLength: 4, reg: vstorek, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=k mask, arg3 = mem
 
+		// AVX512 moves between int-vector and mask registers
 		{name: "VPMOVMToVec8x16", argLength: 1, reg: kv, asm: "VPMOVM2B"},
 		{name: "VPMOVMToVec8x32", argLength: 1, reg: kv, asm: "VPMOVM2B"},
 		{name: "VPMOVMToVec8x64", argLength: 1, reg: kw, asm: "VPMOVM2B"},
@@ -1400,6 +1401,14 @@ func init() {
 		{name: "VPMOVVec64x4ToM", argLength: 1, reg: vk, asm: "VPMOVQ2M"},
 		{name: "VPMOVVec64x8ToM", argLength: 1, reg: wk, asm: "VPMOVQ2M"},
 
+		// AVX1/2 moves from int-vector to bitmask (extracting sign bits)
+		{name: "VPMOVMSKB128", argLength: 1, reg: vgp, asm: "VPMOVMSKB"},
+		{name: "VPMOVMSKB256", argLength: 1, reg: vgp, asm: "VPMOVMSKB"},
+		{name: "VMOVMSKPS128", argLength: 1, reg: vgp, asm: "VMOVMSKPS"},
+		{name: "VMOVMSKPS256", argLength: 1, reg: vgp, asm: "VMOVMSKPS"},
+		{name: "VMOVMSKPD128", argLength: 1, reg: vgp, asm: "VMOVMSKPD"},
+		{name: "VMOVMSKPD256", argLength: 1, reg: vgp, asm: "VMOVMSKPD"},
+
 		// X15 is the zero register up to 128-bit. For larger values, we zero it on the fly.
 		{name: "Zero128", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
 		{name: "Zero256", argLength: 0, reg: v01, asm: "VPXOR"},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index abaf7911d4..7b70dc2686 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1214,6 +1214,12 @@ const (
 	OpAMD64VPMOVVec64x2ToM
 	OpAMD64VPMOVVec64x4ToM
 	OpAMD64VPMOVVec64x8ToM
+	OpAMD64VPMOVMSKB128
+	OpAMD64VPMOVMSKB256
+	OpAMD64VMOVMSKPS128
+	OpAMD64VMOVMSKPS256
+	OpAMD64VMOVMSKPD128
+	OpAMD64VMOVMSKPD256
 	OpAMD64Zero128
 	OpAMD64Zero256
 	OpAMD64Zero512
@@ -20352,6 +20358,84 @@ var opcodeTable = [...]opInfo{
 		},
 	},
 	{
+		name:   "VPMOVMSKB128",
+		argLen: 1,
+		asm:    x86.AVPMOVMSKB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+			},
+		},
+	},
+	{
+		name:   "VPMOVMSKB256",
+		argLen: 1,
+		asm:    x86.AVPMOVMSKB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+			},
+		},
+	},
+	{
+		name:   "VMOVMSKPS128",
+		argLen: 1,
+		asm:    x86.AVMOVMSKPS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+			},
+		},
+	},
+	{
+		name:   "VMOVMSKPS256",
+		argLen: 1,
+		asm:    x86.AVMOVMSKPS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+			},
+		},
+	},
+	{
+		name:   "VMOVMSKPD128",
+		argLen: 1,
+		asm:    x86.AVMOVMSKPD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+			},
+		},
+	},
+	{
+		name:   "VMOVMSKPD256",
+		argLen: 1,
+		asm:    x86.AVMOVMSKPD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+			},
+		},
+	},
+	{
 		name:      "Zero128",
 		argLen:    0,
 		zeroWidth: true,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 3eb2a6278b..e84bf19c83 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -3050,19 +3050,25 @@ func rewriteValueAMD64(v *Value) bool {
 	case OpCvtMask32x16to16:
 		return rewriteValueAMD64_OpCvtMask32x16to16(v)
 	case OpCvtMask32x4to8:
-		return rewriteValueAMD64_OpCvtMask32x4to8(v)
+		v.Op = OpAMD64VMOVMSKPS128
+		return true
 	case OpCvtMask32x8to8:
-		return rewriteValueAMD64_OpCvtMask32x8to8(v)
+		v.Op = OpAMD64VMOVMSKPS256
+		return true
 	case OpCvtMask64x2to8:
-		return rewriteValueAMD64_OpCvtMask64x2to8(v)
+		v.Op = OpAMD64VMOVMSKPD128
+		return true
 	case OpCvtMask64x4to8:
-		return rewriteValueAMD64_OpCvtMask64x4to8(v)
+		v.Op = OpAMD64VMOVMSKPD256
+		return true
 	case OpCvtMask64x8to8:
 		return rewriteValueAMD64_OpCvtMask64x8to8(v)
 	case OpCvtMask8x16to16:
-		return rewriteValueAMD64_OpCvtMask8x16to16(v)
+		v.Op = OpAMD64VPMOVMSKB128
+		return true
 	case OpCvtMask8x32to32:
-		return rewriteValueAMD64_OpCvtMask8x32to32(v)
+		v.Op = OpAMD64VPMOVMSKB256
+		return true
 	case OpCvtMask8x64to64:
 		return rewriteValueAMD64_OpCvtMask8x64to64(v)
 	case OpDiv128u:
@@ -68722,13 +68728,11 @@ func rewriteValueAMD64_OpCvt8toMask64x8(v *Value) bool {
 func rewriteValueAMD64_OpCvtMask16x16to16(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
-	// match: (CvtMask16x16to16 <t> x)
-	// result: (KMOVWi <t> (VPMOVVec16x16ToM <types.TypeMask> x))
+	// match: (CvtMask16x16to16 x)
+	// result: (KMOVWi (VPMOVVec16x16ToM <types.TypeMask> x))
 	for {
-		t := v.Type
 		x := v_0
 		v.reset(OpAMD64KMOVWi)
-		v.Type = t
 		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
 		v0.AddArg(x)
 		v.AddArg(v0)
@@ -68738,13 +68742,11 @@ func rewriteValueAMD64_OpCvtMask16x16to16(v *Value) bool {
 func rewriteValueAMD64_OpCvtMask16x32to32(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
-	// match: (CvtMask16x32to32 <t> x)
-	// result: (KMOVDi <t> (VPMOVVec16x32ToM <types.TypeMask> x))
+	// match: (CvtMask16x32to32 x)
+	// result: (KMOVDi (VPMOVVec16x32ToM <types.TypeMask> x))
 	for {
-		t := v.Type
 		x := v_0
 		v.reset(OpAMD64KMOVDi)
-		v.Type = t
 		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
 		v0.AddArg(x)
 		v.AddArg(v0)
@@ -68754,13 +68756,11 @@ func rewriteValueAMD64_OpCvtMask16x32to32(v *Value) bool {
 func rewriteValueAMD64_OpCvtMask16x8to8(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
-	// match: (CvtMask16x8to8 <t> x)
-	// result: (KMOVBi <t> (VPMOVVec16x8ToM <types.TypeMask> x))
+	// match: (CvtMask16x8to8 x)
+	// result: (KMOVBi (VPMOVVec16x8ToM <types.TypeMask> x))
 	for {
-		t := v.Type
 		x := v_0
 		v.reset(OpAMD64KMOVBi)
-		v.Type = t
 		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
 		v0.AddArg(x)
 		v.AddArg(v0)
@@ -68770,141 +68770,39 @@ func rewriteValueAMD64_OpCvtMask16x8to8(v *Value) bool {
 func rewriteValueAMD64_OpCvtMask32x16to16(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
-	// match: (CvtMask32x16to16 <t> x)
-	// result: (KMOVWi <t> (VPMOVVec32x16ToM <types.TypeMask> x))
+	// match: (CvtMask32x16to16 x)
+	// result: (KMOVWi (VPMOVVec32x16ToM <types.TypeMask> x))
 	for {
-		t := v.Type
 		x := v_0
 		v.reset(OpAMD64KMOVWi)
-		v.Type = t
 		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
 		v0.AddArg(x)
 		v.AddArg(v0)
 		return true
 	}
 }
-func rewriteValueAMD64_OpCvtMask32x4to8(v *Value) bool {
-	v_0 := v.Args[0]
-	b := v.Block
-	// match: (CvtMask32x4to8 <t> x)
-	// result: (KMOVBi <t> (VPMOVVec32x4ToM <types.TypeMask> x))
-	for {
-		t := v.Type
-		x := v_0
-		v.reset(OpAMD64KMOVBi)
-		v.Type = t
-		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
-		v0.AddArg(x)
-		v.AddArg(v0)
-		return true
-	}
-}
-func rewriteValueAMD64_OpCvtMask32x8to8(v *Value) bool {
-	v_0 := v.Args[0]
-	b := v.Block
-	// match: (CvtMask32x8to8 <t> x)
-	// result: (KMOVBi <t> (VPMOVVec32x8ToM <types.TypeMask> x))
-	for {
-		t := v.Type
-		x := v_0
-		v.reset(OpAMD64KMOVBi)
-		v.Type = t
-		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
-		v0.AddArg(x)
-		v.AddArg(v0)
-		return true
-	}
-}
-func rewriteValueAMD64_OpCvtMask64x2to8(v *Value) bool {
-	v_0 := v.Args[0]
-	b := v.Block
-	// match: (CvtMask64x2to8 <t> x)
-	// result: (KMOVBi <t> (VPMOVVec64x2ToM <types.TypeMask> x))
-	for {
-		t := v.Type
-		x := v_0
-		v.reset(OpAMD64KMOVBi)
-		v.Type = t
-		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
-		v0.AddArg(x)
-		v.AddArg(v0)
-		return true
-	}
-}
-func rewriteValueAMD64_OpCvtMask64x4to8(v *Value) bool {
-	v_0 := v.Args[0]
-	b := v.Block
-	// match: (CvtMask64x4to8 <t> x)
-	// result: (KMOVBi <t> (VPMOVVec64x4ToM <types.TypeMask> x))
-	for {
-		t := v.Type
-		x := v_0
-		v.reset(OpAMD64KMOVBi)
-		v.Type = t
-		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
-		v0.AddArg(x)
-		v.AddArg(v0)
-		return true
-	}
-}
 func rewriteValueAMD64_OpCvtMask64x8to8(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
-	// match: (CvtMask64x8to8 <t> x)
-	// result: (KMOVBi <t> (VPMOVVec64x8ToM <types.TypeMask> x))
+	// match: (CvtMask64x8to8 x)
+	// result: (KMOVBi (VPMOVVec64x8ToM <types.TypeMask> x))
 	for {
-		t := v.Type
 		x := v_0
 		v.reset(OpAMD64KMOVBi)
-		v.Type = t
 		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
 		v0.AddArg(x)
 		v.AddArg(v0)
 		return true
 	}
 }
-func rewriteValueAMD64_OpCvtMask8x16to16(v *Value) bool {
-	v_0 := v.Args[0]
-	b := v.Block
-	// match: (CvtMask8x16to16 <t> x)
-	// result: (KMOVWi <t> (VPMOVVec8x16ToM <types.TypeMask> x))
-	for {
-		t := v.Type
-		x := v_0
-		v.reset(OpAMD64KMOVWi)
-		v.Type = t
-		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
-		v0.AddArg(x)
-		v.AddArg(v0)
-		return true
-	}
-}
-func rewriteValueAMD64_OpCvtMask8x32to32(v *Value) bool {
-	v_0 := v.Args[0]
-	b := v.Block
-	// match: (CvtMask8x32to32 <t> x)
-	// result: (KMOVDi <t> (VPMOVVec8x32ToM <types.TypeMask> x))
-	for {
-		t := v.Type
-		x := v_0
-		v.reset(OpAMD64KMOVDi)
-		v.Type = t
-		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
-		v0.AddArg(x)
-		v.AddArg(v0)
-		return true
-	}
-}
 func rewriteValueAMD64_OpCvtMask8x64to64(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
-	// match: (CvtMask8x64to64 <t> x)
-	// result: (KMOVQi <t> (VPMOVVec8x64ToM <types.TypeMask> x))
+	// match: (CvtMask8x64to64 x)
+	// result: (KMOVQi (VPMOVVec8x64ToM <types.TypeMask> x))
 	for {
-		t := v.Type
 		x := v_0
 		v.reset(OpAMD64KMOVQi)
-		v.Type = t
 		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
 		v0.AddArg(x)
 		v.AddArg(v0)
author	Cherry Mui <cherryyz@google.com>	2026-01-05 12:56:08 -0500
committer	Cherry Mui <cherryyz@google.com>	2026-01-05 12:22:51 -0800
commit	9b2e3b9a02bd8872bdbf5c6086674fa6b4bc8ef9 (patch)
tree	51a43dfc235315fbbc88226f0e52ab77cb628e46 /src/cmd
parent	f8ee0f84753b22254d217bf28ce8ecca7db7025c (diff)
download	go-9b2e3b9a02bd8872bdbf5c6086674fa6b4bc8ef9.tar.xz