about | summary | refs | log | tree | commit | diff
path: root/src/cmd/compile/internal
diff options
context:
space:
mode:
author Junyang Shao <shaojunyang@google.com> 2025-09-08 19:38:56 +0000
committer Junyang Shao <shaojunyang@google.com> 2025-09-08 13:53:52 -0700
commit c39b2fdd1ec86f68668141a0901d5f3fc634854e (patch)
tree 104830fc4a0bfe2f1f7dd419bbd36a2c1621a55f /src/cmd/compile/internal
parent 832c1f76dc665f0e211eec12dd77c17fa2ceedd7 (diff)
download go-c39b2fdd1ec86f68668141a0901d5f3fc634854e.tar.xz
[dev.simd] cmd/compile, simd: add VPLZCNT[DQ]
Change-Id: Ifd6d8c12deac9c41722fdf2511d860a334e83438
Reviewed-on: https://go-review.googlesource.com/c/go/+/701915
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Bypass: Junyang Shao <shaojunyang@google.com>
Diffstat (limited to 'src/cmd/compile/internal')
-rw-r--r-- src/cmd/compile/internal/amd64/simdssa.go | 18
-rw-r--r-- src/cmd/compile/internal/ssa/_gen/simdAMD64.rules | 14
-rw-r--r-- src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go | 12
-rw-r--r-- src/cmd/compile/internal/ssa/_gen/simdgenericOps.go | 12
-rw-r--r-- src/cmd/compile/internal/ssa/opGen.go | 246
-rw-r--r-- src/cmd/compile/internal/ssa/rewriteAMD64.go | 60
-rw-r--r-- src/cmd/compile/internal/ssagen/simdintrinsics.go | 12
7 files changed, 374 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 33f6669300..1c289507e1 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -110,6 +110,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPMOVZXBQ256,
ssa.OpAMD64VPMOVZXWQ256,
ssa.OpAMD64VPMOVZXBQ512,
+ ssa.OpAMD64VPLZCNTD128,
+ ssa.OpAMD64VPLZCNTD256,
+ ssa.OpAMD64VPLZCNTD512,
+ ssa.OpAMD64VPLZCNTQ128,
+ ssa.OpAMD64VPLZCNTQ256,
+ ssa.OpAMD64VPLZCNTQ512,
ssa.OpAMD64VPOPCNTB128,
ssa.OpAMD64VPOPCNTB256,
ssa.OpAMD64VPOPCNTB512,
@@ -863,6 +869,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPEXPANDQMasked128,
ssa.OpAMD64VPEXPANDQMasked256,
ssa.OpAMD64VPEXPANDQMasked512,
+ ssa.OpAMD64VPLZCNTDMasked128,
+ ssa.OpAMD64VPLZCNTDMasked256,
+ ssa.OpAMD64VPLZCNTDMasked512,
+ ssa.OpAMD64VPLZCNTQMasked128,
+ ssa.OpAMD64VPLZCNTQMasked256,
+ ssa.OpAMD64VPLZCNTQMasked512,
ssa.OpAMD64VPOPCNTBMasked128,
ssa.OpAMD64VPOPCNTBMasked256,
ssa.OpAMD64VPOPCNTBMasked512,
@@ -1581,6 +1593,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VGF2P8MULBMasked128,
ssa.OpAMD64VGF2P8MULBMasked256,
ssa.OpAMD64VGF2P8MULBMasked512,
+ ssa.OpAMD64VPLZCNTDMasked128,
+ ssa.OpAMD64VPLZCNTDMasked256,
+ ssa.OpAMD64VPLZCNTDMasked512,
+ ssa.OpAMD64VPLZCNTQMasked128,
+ ssa.OpAMD64VPLZCNTQMasked256,
+ ssa.OpAMD64VPLZCNTQMasked512,
ssa.OpAMD64VMAXPSMasked128,
ssa.OpAMD64VMAXPSMasked256,
ssa.OpAMD64VMAXPSMasked512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 35ef1d35b6..bfedad1e9b 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -562,6 +562,18 @@
(IsNanFloat64x2 x y) => (VCMPPD128 [3] x y)
(IsNanFloat64x4 x y) => (VCMPPD256 [3] x y)
(IsNanFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [3] x y))
+(LeadingZerosInt32x4 ...) => (VPLZCNTD128 ...)
+(LeadingZerosInt32x8 ...) => (VPLZCNTD256 ...)
+(LeadingZerosInt32x16 ...) => (VPLZCNTD512 ...)
+(LeadingZerosInt64x2 ...) => (VPLZCNTQ128 ...)
+(LeadingZerosInt64x4 ...) => (VPLZCNTQ256 ...)
+(LeadingZerosInt64x8 ...) => (VPLZCNTQ512 ...)
+(LeadingZerosUint32x4 ...) => (VPLZCNTD128 ...)
+(LeadingZerosUint32x8 ...) => (VPLZCNTD256 ...)
+(LeadingZerosUint32x16 ...) => (VPLZCNTD512 ...)
+(LeadingZerosUint64x2 ...) => (VPLZCNTQ128 ...)
+(LeadingZerosUint64x4 ...) => (VPLZCNTQ256 ...)
+(LeadingZerosUint64x8 ...) => (VPLZCNTQ512 ...)
(LessFloat32x4 x y) => (VCMPPS128 [1] x y)
(LessFloat32x8 x y) => (VCMPPS256 [1] x y)
(LessFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [1] x y))
@@ -1334,6 +1346,8 @@
(VMOVDQU8Masked512 (VGF2P8AFFINEINVQB512 [a] x y) mask) => (VGF2P8AFFINEINVQBMasked512 [a] x y mask)
(VMOVDQU8Masked512 (VGF2P8AFFINEQB512 [a] x y) mask) => (VGF2P8AFFINEQBMasked512 [a] x y mask)
(VMOVDQU8Masked512 (VGF2P8MULB512 x y) mask) => (VGF2P8MULBMasked512 x y mask)
+(VMOVDQU32Masked512 (VPLZCNTD512 x) mask) => (VPLZCNTDMasked512 x mask)
+(VMOVDQU64Masked512 (VPLZCNTQ512 x) mask) => (VPLZCNTQMasked512 x mask)
(VMOVDQU32Masked512 (VMAXPS512 x y) mask) => (VMAXPSMasked512 x y mask)
(VMOVDQU64Masked512 (VMAXPD512 x y) mask) => (VMAXPDMasked512 x y mask)
(VMOVDQU8Masked512 (VPMAXSB512 x y) mask) => (VPMAXSBMasked512 x y mask)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
index 1448f8776a..9143f25bca 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -450,6 +450,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPHSUBSW256", argLength: 2, reg: v21, asm: "VPHSUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPHSUBW128", argLength: 2, reg: v21, asm: "VPHSUBW", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPHSUBW256", argLength: 2, reg: v21, asm: "VPHSUBW", commutative: false, typ: "Vec256", resultInArg0: false},
+ {name: "VPLZCNTD128", argLength: 1, reg: w11, asm: "VPLZCNTD", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VPLZCNTD256", argLength: 1, reg: w11, asm: "VPLZCNTD", commutative: false, typ: "Vec256", resultInArg0: false},
+ {name: "VPLZCNTD512", argLength: 1, reg: w11, asm: "VPLZCNTD", commutative: false, typ: "Vec512", resultInArg0: false},
+ {name: "VPLZCNTDMasked128", argLength: 2, reg: wkw, asm: "VPLZCNTD", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VPLZCNTDMasked256", argLength: 2, reg: wkw, asm: "VPLZCNTD", commutative: false, typ: "Vec256", resultInArg0: false},
+ {name: "VPLZCNTDMasked512", argLength: 2, reg: wkw, asm: "VPLZCNTD", commutative: false, typ: "Vec512", resultInArg0: false},
+ {name: "VPLZCNTQ128", argLength: 1, reg: w11, asm: "VPLZCNTQ", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VPLZCNTQ256", argLength: 1, reg: w11, asm: "VPLZCNTQ", commutative: false, typ: "Vec256", resultInArg0: false},
+ {name: "VPLZCNTQ512", argLength: 1, reg: w11, asm: "VPLZCNTQ", commutative: false, typ: "Vec512", resultInArg0: false},
+ {name: "VPLZCNTQMasked128", argLength: 2, reg: wkw, asm: "VPLZCNTQ", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VPLZCNTQMasked256", argLength: 2, reg: wkw, asm: "VPLZCNTQ", commutative: false, typ: "Vec256", resultInArg0: false},
+ {name: "VPLZCNTQMasked512", argLength: 2, reg: wkw, asm: "VPLZCNTQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPMADDUBSW128", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPMADDUBSW256", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPMADDUBSW512", argLength: 2, reg: w21, asm: "VPMADDUBSW", commutative: false, typ: "Vec512", resultInArg0: false},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 11c5785f7d..7ee4989d89 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -526,6 +526,18 @@ func simdGenericOps() []opData {
{name: "IsNanFloat64x2", argLength: 2, commutative: true},
{name: "IsNanFloat64x4", argLength: 2, commutative: true},
{name: "IsNanFloat64x8", argLength: 2, commutative: true},
+ {name: "LeadingZerosInt32x4", argLength: 1, commutative: false},
+ {name: "LeadingZerosInt32x8", argLength: 1, commutative: false},
+ {name: "LeadingZerosInt32x16", argLength: 1, commutative: false},
+ {name: "LeadingZerosInt64x2", argLength: 1, commutative: false},
+ {name: "LeadingZerosInt64x4", argLength: 1, commutative: false},
+ {name: "LeadingZerosInt64x8", argLength: 1, commutative: false},
+ {name: "LeadingZerosUint32x4", argLength: 1, commutative: false},
+ {name: "LeadingZerosUint32x8", argLength: 1, commutative: false},
+ {name: "LeadingZerosUint32x16", argLength: 1, commutative: false},
+ {name: "LeadingZerosUint64x2", argLength: 1, commutative: false},
+ {name: "LeadingZerosUint64x4", argLength: 1, commutative: false},
+ {name: "LeadingZerosUint64x8", argLength: 1, commutative: false},
{name: "LessEqualFloat32x4", argLength: 2, commutative: false},
{name: "LessEqualFloat32x8", argLength: 2, commutative: false},
{name: "LessEqualFloat32x16", argLength: 2, commutative: false},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 9fc6059865..8719602036 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1682,6 +1682,18 @@ const (
OpAMD64VPHSUBSW256
OpAMD64VPHSUBW128
OpAMD64VPHSUBW256
+ OpAMD64VPLZCNTD128
+ OpAMD64VPLZCNTD256
+ OpAMD64VPLZCNTD512
+ OpAMD64VPLZCNTDMasked128
+ OpAMD64VPLZCNTDMasked256
+ OpAMD64VPLZCNTDMasked512
+ OpAMD64VPLZCNTQ128
+ OpAMD64VPLZCNTQ256
+ OpAMD64VPLZCNTQ512
+ OpAMD64VPLZCNTQMasked128
+ OpAMD64VPLZCNTQMasked256
+ OpAMD64VPLZCNTQMasked512
OpAMD64VPMADDUBSW128
OpAMD64VPMADDUBSW256
OpAMD64VPMADDUBSW512
@@ -5343,6 +5355,18 @@ const (
OpIsNanFloat64x2
OpIsNanFloat64x4
OpIsNanFloat64x8
+ OpLeadingZerosInt32x4
+ OpLeadingZerosInt32x8
+ OpLeadingZerosInt32x16
+ OpLeadingZerosInt64x2
+ OpLeadingZerosInt64x4
+ OpLeadingZerosInt64x8
+ OpLeadingZerosUint32x4
+ OpLeadingZerosUint32x8
+ OpLeadingZerosUint32x16
+ OpLeadingZerosUint64x2
+ OpLeadingZerosUint64x4
+ OpLeadingZerosUint64x8
OpLessEqualFloat32x4
OpLessEqualFloat32x8
OpLessEqualFloat32x16
@@ -25898,6 +25922,168 @@ var opcodeTable = [...]opInfo{
},
},
{
+ name: "VPLZCNTD128",
+ argLen: 1,
+ asm: x86.AVPLZCNTD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPLZCNTD256",
+ argLen: 1,
+ asm: x86.AVPLZCNTD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPLZCNTD512",
+ argLen: 1,
+ asm: x86.AVPLZCNTD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPLZCNTDMasked128",
+ argLen: 2,
+ asm: x86.AVPLZCNTD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPLZCNTDMasked256",
+ argLen: 2,
+ asm: x86.AVPLZCNTD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPLZCNTDMasked512",
+ argLen: 2,
+ asm: x86.AVPLZCNTD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPLZCNTQ128",
+ argLen: 1,
+ asm: x86.AVPLZCNTQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPLZCNTQ256",
+ argLen: 1,
+ asm: x86.AVPLZCNTQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPLZCNTQ512",
+ argLen: 1,
+ asm: x86.AVPLZCNTQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPLZCNTQMasked128",
+ argLen: 2,
+ asm: x86.AVPLZCNTQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPLZCNTQMasked256",
+ argLen: 2,
+ asm: x86.AVPLZCNTQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPLZCNTQMasked512",
+ argLen: 2,
+ asm: x86.AVPLZCNTQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
name: "VPMADDUBSW128",
argLen: 2,
asm: x86.AVPMADDUBSW,
@@ -68573,6 +68759,66 @@ var opcodeTable = [...]opInfo{
generic: true,
},
{
+ name: "LeadingZerosInt32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "LeadingZerosInt32x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "LeadingZerosInt32x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "LeadingZerosInt64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "LeadingZerosInt64x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "LeadingZerosInt64x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "LeadingZerosUint32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "LeadingZerosUint32x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "LeadingZerosUint32x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "LeadingZerosUint64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "LeadingZerosUint64x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "LeadingZerosUint64x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
name: "LessEqualFloat32x4",
argLen: 2,
generic: true,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 236eed8629..06cafc8e6d 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2489,6 +2489,42 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpIsNonNil(v)
case OpIsSliceInBounds:
return rewriteValueAMD64_OpIsSliceInBounds(v)
+ case OpLeadingZerosInt32x16:
+ v.Op = OpAMD64VPLZCNTD512
+ return true
+ case OpLeadingZerosInt32x4:
+ v.Op = OpAMD64VPLZCNTD128
+ return true
+ case OpLeadingZerosInt32x8:
+ v.Op = OpAMD64VPLZCNTD256
+ return true
+ case OpLeadingZerosInt64x2:
+ v.Op = OpAMD64VPLZCNTQ128
+ return true
+ case OpLeadingZerosInt64x4:
+ v.Op = OpAMD64VPLZCNTQ256
+ return true
+ case OpLeadingZerosInt64x8:
+ v.Op = OpAMD64VPLZCNTQ512
+ return true
+ case OpLeadingZerosUint32x16:
+ v.Op = OpAMD64VPLZCNTD512
+ return true
+ case OpLeadingZerosUint32x4:
+ v.Op = OpAMD64VPLZCNTD128
+ return true
+ case OpLeadingZerosUint32x8:
+ v.Op = OpAMD64VPLZCNTD256
+ return true
+ case OpLeadingZerosUint64x2:
+ v.Op = OpAMD64VPLZCNTQ128
+ return true
+ case OpLeadingZerosUint64x4:
+ v.Op = OpAMD64VPLZCNTQ256
+ return true
+ case OpLeadingZerosUint64x8:
+ v.Op = OpAMD64VPLZCNTQ512
+ return true
case OpLeq16:
return rewriteValueAMD64_OpLeq16(v)
case OpLeq16U:
@@ -27364,6 +27400,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool {
v.AddArg3(x, y, mask)
return true
}
+ // match: (VMOVDQU32Masked512 (VPLZCNTD512 x) mask)
+ // result: (VPLZCNTDMasked512 x mask)
+ for {
+ if v_0.Op != OpAMD64VPLZCNTD512 {
+ break
+ }
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPLZCNTDMasked512)
+ v.AddArg2(x, mask)
+ return true
+ }
// match: (VMOVDQU32Masked512 (VMAXPS512 x y) mask)
// result: (VMAXPSMasked512 x y mask)
for {
@@ -28057,6 +28105,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool {
v.AddArg3(x, y, mask)
return true
}
+ // match: (VMOVDQU64Masked512 (VPLZCNTQ512 x) mask)
+ // result: (VPLZCNTQMasked512 x mask)
+ for {
+ if v_0.Op != OpAMD64VPLZCNTQ512 {
+ break
+ }
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPLZCNTQMasked512)
+ v.AddArg2(x, mask)
+ return true
+ }
// match: (VMOVDQU64Masked512 (VMAXPD512 x y) mask)
// result: (VMAXPDMasked512 x y mask)
for {
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index d75dc440d2..4f933de008 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -574,6 +574,18 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float64x2.IsNan", opLen2(ssa.OpIsNanFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.IsNan", opLen2(ssa.OpIsNanFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.IsNan", opLen2(ssa.OpIsNanFloat64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int32x4.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x8.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x16.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int64x2.LeadingZeros", opLen1(ssa.OpLeadingZerosInt64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x4.LeadingZeros", opLen1(ssa.OpLeadingZerosInt64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int64x8.LeadingZeros", opLen1(ssa.OpLeadingZerosInt64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint32x4.LeadingZeros", opLen1(ssa.OpLeadingZerosUint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x8.LeadingZeros", opLen1(ssa.OpLeadingZerosUint32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x16.LeadingZeros", opLen1(ssa.OpLeadingZerosUint32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint64x2.LeadingZeros", opLen1(ssa.OpLeadingZerosUint64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x4.LeadingZeros", opLen1(ssa.OpLeadingZerosUint64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint64x8.LeadingZeros", opLen1(ssa.OpLeadingZerosUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Less", opLen2(ssa.OpLessFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Less", opLen2(ssa.OpLessFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Less", opLen2(ssa.OpLessFloat32x16, types.TypeVec512), sys.AMD64)