diff options
| author | David Chase <drchase@google.com> | 2025-08-04 15:19:54 -0400 |
|---|---|---|
| committer | David Chase <drchase@google.com> | 2025-08-05 17:43:49 -0700 |
| commit | 7ca34599ec4df8a21b7d4580f7e1c716c44f7e0f (patch) | |
| tree | 77daf23bb69e1fe4f9f5804a755d3187d3a4734f /src/cmd/compile | |
| parent | 82d056ddd7378ee23ab073c7a195d92cfc4a59d6 (diff) | |
| download | go-7ca34599ec4df8a21b7d4580f7e1c716c44f7e0f.tar.xz | |
[dev.simd] simd, cmd/compile: generated files to add 'blend' and 'blendMasked'
Generated by arch/internal/simdgen CL 693175
These methods are not public because of simdgen-induced name/signature
issues, and because their addition was motivated by the need for
emulation tools.
The specific name/signature problems are:
1) one set of instructions has the "Masked" suffix (because of how
that is incorporated into names) and the other set does not (though I
suppose the operation could be renamed).
2) because the AVX2 instruction is bytes-only, getting the signature
right requires "OverwriteBase", but OverwriteBase also requires
OverwriteClass and "simdgen does not support [OverwriteClass] in
inputs".
3) the default operation order is false, true, but we want this in a
"x.Merged(y, mask)" that pairs with "x.Masked(mask)" where the true
case is x and the false case is y/zero, but the default ordering for
VPBLENDVB and VPBLENDMB is false->x and true->y.
4) VPBLENDVB only comes in byte width, which causes problems
for floats.
All this may get fixed in the future, for now it is just an
implementation detail.
Change-Id: I61b655c7011e2c33f8644f704f886133c89d2f15
Reviewed-on: https://go-review.googlesource.com/c/go/+/693155
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Diffstat (limited to 'src/cmd/compile')
| -rw-r--r-- | src/cmd/compile/internal/amd64/simdssa.go | 14 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/_gen/simdAMD64.rules | 6 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go | 6 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/_gen/simdgenericOps.go | 6 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/opGen.go | 132 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/rewriteAMD64.go | 86 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssagen/simdintrinsics.go | 6 |
7 files changed, 255 insertions, 1 deletions
diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index bd6af6221d..e0571d2cc3 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -589,7 +589,11 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPXORDMasked512, ssa.OpAMD64VPXORQMasked128, ssa.OpAMD64VPXORQMasked256, - ssa.OpAMD64VPXORQMasked512: + ssa.OpAMD64VPXORQMasked512, + ssa.OpAMD64VPBLENDMBMasked512, + ssa.OpAMD64VPBLENDMWMasked512, + ssa.OpAMD64VPBLENDMDMasked512, + ssa.OpAMD64VPBLENDMQMasked512: p = simdV2kv(s, v) case ssa.OpAMD64VPABSBMasked128, @@ -660,6 +664,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VSQRTPDMasked512: p = simdVkv(s, v) + case ssa.OpAMD64VPBLENDVB128, + ssa.OpAMD64VPBLENDVB256: + p = simdV31(s, v) + case ssa.OpAMD64VROUNDPS128, ssa.OpAMD64VROUNDPS256, ssa.OpAMD64VROUNDPD128, @@ -1552,6 +1560,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPXORQMasked128, ssa.OpAMD64VPXORQMasked256, ssa.OpAMD64VPXORQMasked512, + ssa.OpAMD64VPBLENDMBMasked512, + ssa.OpAMD64VPBLENDMWMasked512, + ssa.OpAMD64VPBLENDMDMasked512, + ssa.OpAMD64VPBLENDMQMasked512, ssa.OpAMD64VPSLLWMasked128const, ssa.OpAMD64VPSLLWMasked256const, ssa.OpAMD64VPSLLWMasked512const, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index b8bd0d9b4c..9a4c82c0af 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -1891,3 +1891,9 @@ (XorMaskedUint64x2 x y mask) => (VPXORQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask)) (XorMaskedUint64x4 x y mask) => (VPXORQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask)) (XorMaskedUint64x8 x y mask) => (VPXORQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask)) +(blendInt8x16 ...) => (VPBLENDVB128 ...) +(blendInt8x32 ...) => (VPBLENDVB256 ...) 
+(blendMaskedInt8x64 x y mask) => (VPBLENDMBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask)) +(blendMaskedInt16x32 x y mask) => (VPBLENDMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask)) +(blendMaskedInt32x16 x y mask) => (VPBLENDMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask)) +(blendMaskedInt64x8 x y mask) => (VPBLENDMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask)) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 8b7a7791bc..7860a0889e 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -227,6 +227,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPAVGWMasked128", argLength: 3, reg: w2kw, asm: "VPAVGW", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPAVGWMasked256", argLength: 3, reg: w2kw, asm: "VPAVGW", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPAVGWMasked512", argLength: 3, reg: w2kw, asm: "VPAVGW", commutative: true, typ: "Vec512", resultInArg0: false}, + {name: "VPBLENDMBMasked512", argLength: 3, reg: w2kw, asm: "VPBLENDMB", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPBLENDMDMasked512", argLength: 3, reg: w2kw, asm: "VPBLENDMD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPBLENDMQMasked512", argLength: 3, reg: w2kw, asm: "VPBLENDMQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPBLENDMWMasked512", argLength: 3, reg: w2kw, asm: "VPBLENDMW", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPBLENDVB128", argLength: 3, reg: v31, asm: "VPBLENDVB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPBLENDVB256", argLength: 3, reg: v31, asm: "VPBLENDVB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPCMPEQB128", argLength: 2, reg: v21, asm: "VPCMPEQB", commutative: true, typ: "Vec128", 
resultInArg0: false}, {name: "VPCMPEQB256", argLength: 2, reg: v21, asm: "VPCMPEQB", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPCMPEQB512", argLength: 2, reg: w2k, asm: "VPCMPEQB", commutative: true, typ: "Mask", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index ea52254413..bf85df5e6d 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -1558,6 +1558,12 @@ func simdGenericOps() []opData { {name: "XorUint64x2", argLength: 2, commutative: true}, {name: "XorUint64x4", argLength: 2, commutative: true}, {name: "XorUint64x8", argLength: 2, commutative: true}, + {name: "blendInt8x16", argLength: 3, commutative: false}, + {name: "blendInt8x32", argLength: 3, commutative: false}, + {name: "blendMaskedInt8x64", argLength: 3, commutative: false}, + {name: "blendMaskedInt16x32", argLength: 3, commutative: false}, + {name: "blendMaskedInt32x16", argLength: 3, commutative: false}, + {name: "blendMaskedInt64x8", argLength: 3, commutative: false}, {name: "CeilScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, {name: "CeilScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, {name: "CeilScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 15fcabbb8d..9ce9220901 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1446,6 +1446,12 @@ const ( OpAMD64VPAVGWMasked128 OpAMD64VPAVGWMasked256 OpAMD64VPAVGWMasked512 + OpAMD64VPBLENDMBMasked512 + OpAMD64VPBLENDMDMasked512 + OpAMD64VPBLENDMQMasked512 + OpAMD64VPBLENDMWMasked512 + OpAMD64VPBLENDVB128 + OpAMD64VPBLENDVB256 OpAMD64VPCMPEQB128 OpAMD64VPCMPEQB256 OpAMD64VPCMPEQB512 @@ -6109,6 +6115,12 @@ const ( OpXorUint64x2 OpXorUint64x4 OpXorUint64x8 + OpblendInt8x16 + 
OpblendInt8x32 + OpblendMaskedInt8x64 + OpblendMaskedInt16x32 + OpblendMaskedInt32x16 + OpblendMaskedInt64x8 OpCeilScaledFloat32x4 OpCeilScaledFloat32x8 OpCeilScaledFloat32x16 @@ -22711,6 +22723,96 @@ var opcodeTable = [...]opInfo{ }, }, { + name: "VPBLENDMBMasked512", + argLen: 3, + asm: x86.AVPBLENDMB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBLENDMDMasked512", + argLen: 3, + asm: x86.AVPBLENDMD, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBLENDMQMasked512", + argLen: 3, + asm: x86.AVPBLENDMQ, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBLENDMWMasked512", + argLen: 3, + asm: x86.AVPBLENDMW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBLENDVB128", + argLen: 3, + asm: x86.AVPBLENDVB, + reg: regInfo{ + 
inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBLENDVB256", + argLen: 3, + asm: x86.AVPBLENDVB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { name: "VPCMPEQB128", argLen: 2, commutative: true, @@ -70898,6 +71000,36 @@ var opcodeTable = [...]opInfo{ generic: true, }, { + name: "blendInt8x16", + argLen: 3, + generic: true, + }, + { + name: "blendInt8x32", + argLen: 3, + generic: true, + }, + { + name: "blendMaskedInt8x64", + argLen: 3, + generic: true, + }, + { + name: "blendMaskedInt16x32", + argLen: 3, + generic: true, + }, + { + name: "blendMaskedInt32x16", + argLen: 3, + generic: true, + }, + { + name: "blendMaskedInt64x8", + argLen: 3, + generic: true, + }, + { name: "CeilScaledFloat32x4", auxType: auxInt8, argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 2e564b0c30..e181798245 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -5659,6 +5659,20 @@ func rewriteValueAMD64(v *Value) bool { return true case OpZeroSIMD: return rewriteValueAMD64_OpZeroSIMD(v) + case OpblendInt8x16: + v.Op = OpAMD64VPBLENDVB128 + return true + case OpblendInt8x32: + v.Op = OpAMD64VPBLENDVB256 + return true + case OpblendMaskedInt16x32: + return rewriteValueAMD64_OpblendMaskedInt16x32(v) + case 
OpblendMaskedInt32x16: + return rewriteValueAMD64_OpblendMaskedInt32x16(v) + case OpblendMaskedInt64x8: + return rewriteValueAMD64_OpblendMaskedInt64x8(v) + case OpblendMaskedInt8x64: + return rewriteValueAMD64_OpblendMaskedInt8x64(v) } return false } @@ -57117,6 +57131,78 @@ func rewriteValueAMD64_OpZeroSIMD(v *Value) bool { } return false } +func rewriteValueAMD64_OpblendMaskedInt16x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (blendMaskedInt16x32 x y mask) + // result: (VPBLENDMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPBLENDMWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpblendMaskedInt32x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (blendMaskedInt32x16 x y mask) + // result: (VPBLENDMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPBLENDMDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpblendMaskedInt64x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (blendMaskedInt64x8 x y mask) + // result: (VPBLENDMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPBLENDMQMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpblendMaskedInt8x64(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (blendMaskedInt8x64 x y mask) + // result: (VPBLENDMBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask)) + 
for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPBLENDMBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} func rewriteBlockAMD64(b *Block) bool { typ := &b.Func.Config.Types switch b.Kind { diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 511974ffa1..fb68846347 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -1830,6 +1830,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x2.XorMasked", opLen3(ssa.OpXorMaskedUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.XorMasked", opLen3(ssa.OpXorMaskedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.XorMasked", opLen3(ssa.OpXorMaskedUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.blend", opLen3(ssa.OpblendInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.blend", opLen3(ssa.OpblendInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.blendMasked", opLen3(ssa.OpblendMaskedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x32.blendMasked", opLen3(ssa.OpblendMaskedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x16.blendMasked", opLen3(ssa.OpblendMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x8.blendMasked", opLen3(ssa.OpblendMaskedInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.AsFloat64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Float32x4.AsInt8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Float32x4.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) |
