aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCherry Mui <cherryyz@google.com>2026-01-08 11:57:28 -0500
committerJunyang Shao <shaojunyang@google.com>2026-01-08 09:44:00 -0800
commit8ac4477d83672af8c3d39399685731ee6b81ce2f (patch)
tree961ce0a2f55308fe144dca622f5f73e3d2115123
parent5facb3b24b1c388176572eb95239f94d6ed4017d (diff)
downloadgo-8ac4477d83672af8c3d39399685731ee6b81ce2f.tar.xz
simd/archsimd: rename Broadcast methods
Currently the Broadcast128/256/512 methods broadcast the lowest element of the input vector to a vector of the corresponding width. There are also variations of broadcast operations that broadcast the whole (128- or 256-bit) vector to a larger vector, which we don't yet support. Our current naming does not make clear which version is meant, though. Rename the current ones to Broadcast1ToN, to be clear that they broadcast one element. The vector version probably will be named BroadcastAllToN (not included in this CL). Change-Id: I47a21e367f948ec0b578d63706a40d20f5a9f46d Reviewed-on: https://go-review.googlesource.com/c/go/+/734840 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Junyang Shao <shaojunyang@google.com>
-rw-r--r--src/cmd/compile/internal/amd64/simdssa.go88
-rw-r--r--src/cmd/compile/internal/ssa/_gen/simdAMD64.rules82
-rw-r--r--src/cmd/compile/internal/ssa/_gen/simdgenericOps.go60
-rw-r--r--src/cmd/compile/internal/ssa/opGen.go120
-rw-r--r--src/cmd/compile/internal/ssa/rewriteAMD64.go112
-rw-r--r--src/cmd/compile/internal/ssagen/simdintrinsics.go60
-rw-r--r--src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml33
-rw-r--r--src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml79
-rw-r--r--src/simd/archsimd/_gen/tmplgen/main.go2
-rw-r--r--src/simd/archsimd/ops_amd64.go240
-rw-r--r--src/simd/archsimd/other_gen_amd64.go60
11 files changed, 490 insertions, 446 deletions
diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index c4d0fd69c6..a028cbe86d 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -25,23 +25,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPABSQ128,
ssa.OpAMD64VPABSQ256,
ssa.OpAMD64VPABSQ512,
- ssa.OpAMD64VBROADCASTSS128,
ssa.OpAMD64VPBROADCASTQ128,
- ssa.OpAMD64VPBROADCASTB128,
- ssa.OpAMD64VPBROADCASTW128,
+ ssa.OpAMD64VBROADCASTSS128,
+ ssa.OpAMD64VBROADCASTSD256,
ssa.OpAMD64VPBROADCASTD128,
+ ssa.OpAMD64VPBROADCASTQ256,
ssa.OpAMD64VBROADCASTSS256,
- ssa.OpAMD64VBROADCASTSD256,
- ssa.OpAMD64VPBROADCASTB256,
- ssa.OpAMD64VPBROADCASTW256,
+ ssa.OpAMD64VBROADCASTSD512,
+ ssa.OpAMD64VPBROADCASTW128,
ssa.OpAMD64VPBROADCASTD256,
- ssa.OpAMD64VPBROADCASTQ256,
+ ssa.OpAMD64VPBROADCASTQ512,
ssa.OpAMD64VBROADCASTSS512,
- ssa.OpAMD64VBROADCASTSD512,
- ssa.OpAMD64VPBROADCASTB512,
- ssa.OpAMD64VPBROADCASTW512,
+ ssa.OpAMD64VPBROADCASTB128,
+ ssa.OpAMD64VPBROADCASTW256,
ssa.OpAMD64VPBROADCASTD512,
- ssa.OpAMD64VPBROADCASTQ512,
+ ssa.OpAMD64VPBROADCASTB256,
+ ssa.OpAMD64VPBROADCASTW512,
+ ssa.OpAMD64VPBROADCASTB512,
ssa.OpAMD64VCVTPD2PSX128,
ssa.OpAMD64VCVTPD2PSY128,
ssa.OpAMD64VCVTPD2PS256,
@@ -832,23 +832,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPABSQMasked128,
ssa.OpAMD64VPABSQMasked256,
ssa.OpAMD64VPABSQMasked512,
- ssa.OpAMD64VBROADCASTSSMasked128,
ssa.OpAMD64VPBROADCASTQMasked128,
- ssa.OpAMD64VPBROADCASTBMasked128,
- ssa.OpAMD64VPBROADCASTWMasked128,
+ ssa.OpAMD64VBROADCASTSSMasked128,
+ ssa.OpAMD64VBROADCASTSDMasked256,
ssa.OpAMD64VPBROADCASTDMasked128,
+ ssa.OpAMD64VPBROADCASTQMasked256,
ssa.OpAMD64VBROADCASTSSMasked256,
- ssa.OpAMD64VBROADCASTSDMasked256,
- ssa.OpAMD64VPBROADCASTBMasked256,
- ssa.OpAMD64VPBROADCASTWMasked256,
+ ssa.OpAMD64VBROADCASTSDMasked512,
+ ssa.OpAMD64VPBROADCASTWMasked128,
ssa.OpAMD64VPBROADCASTDMasked256,
- ssa.OpAMD64VPBROADCASTQMasked256,
+ ssa.OpAMD64VPBROADCASTQMasked512,
ssa.OpAMD64VBROADCASTSSMasked512,
- ssa.OpAMD64VBROADCASTSDMasked512,
- ssa.OpAMD64VPBROADCASTBMasked512,
- ssa.OpAMD64VPBROADCASTWMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked128,
+ ssa.OpAMD64VPBROADCASTWMasked256,
ssa.OpAMD64VPBROADCASTDMasked512,
- ssa.OpAMD64VPBROADCASTQMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked256,
+ ssa.OpAMD64VPBROADCASTWMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked512,
ssa.OpAMD64VCOMPRESSPSMasked128,
ssa.OpAMD64VCOMPRESSPSMasked256,
ssa.OpAMD64VCOMPRESSPSMasked512,
@@ -2460,23 +2460,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPABSQMasked128Merging,
ssa.OpAMD64VPABSQMasked256Merging,
ssa.OpAMD64VPABSQMasked512Merging,
- ssa.OpAMD64VBROADCASTSSMasked128Merging,
ssa.OpAMD64VPBROADCASTQMasked128Merging,
- ssa.OpAMD64VPBROADCASTBMasked128Merging,
- ssa.OpAMD64VPBROADCASTWMasked128Merging,
+ ssa.OpAMD64VBROADCASTSSMasked128Merging,
+ ssa.OpAMD64VBROADCASTSDMasked256Merging,
ssa.OpAMD64VPBROADCASTDMasked128Merging,
+ ssa.OpAMD64VPBROADCASTQMasked256Merging,
ssa.OpAMD64VBROADCASTSSMasked256Merging,
- ssa.OpAMD64VBROADCASTSDMasked256Merging,
- ssa.OpAMD64VPBROADCASTBMasked256Merging,
- ssa.OpAMD64VPBROADCASTWMasked256Merging,
+ ssa.OpAMD64VBROADCASTSDMasked512Merging,
+ ssa.OpAMD64VPBROADCASTWMasked128Merging,
ssa.OpAMD64VPBROADCASTDMasked256Merging,
- ssa.OpAMD64VPBROADCASTQMasked256Merging,
+ ssa.OpAMD64VPBROADCASTQMasked512Merging,
ssa.OpAMD64VBROADCASTSSMasked512Merging,
- ssa.OpAMD64VBROADCASTSDMasked512Merging,
- ssa.OpAMD64VPBROADCASTBMasked512Merging,
- ssa.OpAMD64VPBROADCASTWMasked512Merging,
+ ssa.OpAMD64VPBROADCASTBMasked128Merging,
+ ssa.OpAMD64VPBROADCASTWMasked256Merging,
ssa.OpAMD64VPBROADCASTDMasked512Merging,
- ssa.OpAMD64VPBROADCASTQMasked512Merging,
+ ssa.OpAMD64VPBROADCASTBMasked256Merging,
+ ssa.OpAMD64VPBROADCASTWMasked512Merging,
+ ssa.OpAMD64VPBROADCASTBMasked512Merging,
ssa.OpAMD64VRNDSCALEPSMasked128Merging,
ssa.OpAMD64VRNDSCALEPSMasked256Merging,
ssa.OpAMD64VRNDSCALEPSMasked512Merging,
@@ -2817,23 +2817,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPAVGWMasked128,
ssa.OpAMD64VPAVGWMasked256,
ssa.OpAMD64VPAVGWMasked512,
- ssa.OpAMD64VBROADCASTSSMasked128,
ssa.OpAMD64VPBROADCASTQMasked128,
- ssa.OpAMD64VPBROADCASTBMasked128,
- ssa.OpAMD64VPBROADCASTWMasked128,
+ ssa.OpAMD64VBROADCASTSSMasked128,
+ ssa.OpAMD64VBROADCASTSDMasked256,
ssa.OpAMD64VPBROADCASTDMasked128,
+ ssa.OpAMD64VPBROADCASTQMasked256,
ssa.OpAMD64VBROADCASTSSMasked256,
- ssa.OpAMD64VBROADCASTSDMasked256,
- ssa.OpAMD64VPBROADCASTBMasked256,
- ssa.OpAMD64VPBROADCASTWMasked256,
+ ssa.OpAMD64VBROADCASTSDMasked512,
+ ssa.OpAMD64VPBROADCASTWMasked128,
ssa.OpAMD64VPBROADCASTDMasked256,
- ssa.OpAMD64VPBROADCASTQMasked256,
+ ssa.OpAMD64VPBROADCASTQMasked512,
ssa.OpAMD64VBROADCASTSSMasked512,
- ssa.OpAMD64VBROADCASTSDMasked512,
- ssa.OpAMD64VPBROADCASTBMasked512,
- ssa.OpAMD64VPBROADCASTWMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked128,
+ ssa.OpAMD64VPBROADCASTWMasked256,
ssa.OpAMD64VPBROADCASTDMasked512,
- ssa.OpAMD64VPBROADCASTQMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked256,
+ ssa.OpAMD64VPBROADCASTWMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked512,
ssa.OpAMD64VRNDSCALEPSMasked128,
ssa.OpAMD64VRNDSCALEPSMasked128load,
ssa.OpAMD64VRNDSCALEPSMasked256,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 5c83f39a1f..799461610d 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -140,36 +140,36 @@
(AverageUint16x8 ...) => (VPAVGW128 ...)
(AverageUint16x16 ...) => (VPAVGW256 ...)
(AverageUint16x32 ...) => (VPAVGW512 ...)
-(Broadcast128Float32x4 ...) => (VBROADCASTSS128 ...)
-(Broadcast128Float64x2 ...) => (VPBROADCASTQ128 ...)
-(Broadcast128Int8x16 ...) => (VPBROADCASTB128 ...)
-(Broadcast128Int16x8 ...) => (VPBROADCASTW128 ...)
-(Broadcast128Int32x4 ...) => (VPBROADCASTD128 ...)
-(Broadcast128Int64x2 ...) => (VPBROADCASTQ128 ...)
-(Broadcast128Uint8x16 ...) => (VPBROADCASTB128 ...)
-(Broadcast128Uint16x8 ...) => (VPBROADCASTW128 ...)
-(Broadcast128Uint32x4 ...) => (VPBROADCASTD128 ...)
-(Broadcast128Uint64x2 ...) => (VPBROADCASTQ128 ...)
-(Broadcast256Float32x4 ...) => (VBROADCASTSS256 ...)
-(Broadcast256Float64x2 ...) => (VBROADCASTSD256 ...)
-(Broadcast256Int8x16 ...) => (VPBROADCASTB256 ...)
-(Broadcast256Int16x8 ...) => (VPBROADCASTW256 ...)
-(Broadcast256Int32x4 ...) => (VPBROADCASTD256 ...)
-(Broadcast256Int64x2 ...) => (VPBROADCASTQ256 ...)
-(Broadcast256Uint8x16 ...) => (VPBROADCASTB256 ...)
-(Broadcast256Uint16x8 ...) => (VPBROADCASTW256 ...)
-(Broadcast256Uint32x4 ...) => (VPBROADCASTD256 ...)
-(Broadcast256Uint64x2 ...) => (VPBROADCASTQ256 ...)
-(Broadcast512Float32x4 ...) => (VBROADCASTSS512 ...)
-(Broadcast512Float64x2 ...) => (VBROADCASTSD512 ...)
-(Broadcast512Int8x16 ...) => (VPBROADCASTB512 ...)
-(Broadcast512Int16x8 ...) => (VPBROADCASTW512 ...)
-(Broadcast512Int32x4 ...) => (VPBROADCASTD512 ...)
-(Broadcast512Int64x2 ...) => (VPBROADCASTQ512 ...)
-(Broadcast512Uint8x16 ...) => (VPBROADCASTB512 ...)
-(Broadcast512Uint16x8 ...) => (VPBROADCASTW512 ...)
-(Broadcast512Uint32x4 ...) => (VPBROADCASTD512 ...)
-(Broadcast512Uint64x2 ...) => (VPBROADCASTQ512 ...)
+(Broadcast1To2Float64x2 ...) => (VPBROADCASTQ128 ...)
+(Broadcast1To2Int64x2 ...) => (VPBROADCASTQ128 ...)
+(Broadcast1To2Uint64x2 ...) => (VPBROADCASTQ128 ...)
+(Broadcast1To4Float32x4 ...) => (VBROADCASTSS128 ...)
+(Broadcast1To4Float64x2 ...) => (VBROADCASTSD256 ...)
+(Broadcast1To4Int32x4 ...) => (VPBROADCASTD128 ...)
+(Broadcast1To4Int64x2 ...) => (VPBROADCASTQ256 ...)
+(Broadcast1To4Uint32x4 ...) => (VPBROADCASTD128 ...)
+(Broadcast1To4Uint64x2 ...) => (VPBROADCASTQ256 ...)
+(Broadcast1To8Float32x4 ...) => (VBROADCASTSS256 ...)
+(Broadcast1To8Float64x2 ...) => (VBROADCASTSD512 ...)
+(Broadcast1To8Int16x8 ...) => (VPBROADCASTW128 ...)
+(Broadcast1To8Int32x4 ...) => (VPBROADCASTD256 ...)
+(Broadcast1To8Int64x2 ...) => (VPBROADCASTQ512 ...)
+(Broadcast1To8Uint16x8 ...) => (VPBROADCASTW128 ...)
+(Broadcast1To8Uint32x4 ...) => (VPBROADCASTD256 ...)
+(Broadcast1To8Uint64x2 ...) => (VPBROADCASTQ512 ...)
+(Broadcast1To16Float32x4 ...) => (VBROADCASTSS512 ...)
+(Broadcast1To16Int8x16 ...) => (VPBROADCASTB128 ...)
+(Broadcast1To16Int16x8 ...) => (VPBROADCASTW256 ...)
+(Broadcast1To16Int32x4 ...) => (VPBROADCASTD512 ...)
+(Broadcast1To16Uint8x16 ...) => (VPBROADCASTB128 ...)
+(Broadcast1To16Uint16x8 ...) => (VPBROADCASTW256 ...)
+(Broadcast1To16Uint32x4 ...) => (VPBROADCASTD512 ...)
+(Broadcast1To32Int8x16 ...) => (VPBROADCASTB256 ...)
+(Broadcast1To32Int16x8 ...) => (VPBROADCASTW512 ...)
+(Broadcast1To32Uint8x16 ...) => (VPBROADCASTB256 ...)
+(Broadcast1To32Uint16x8 ...) => (VPBROADCASTW512 ...)
+(Broadcast1To64Int8x16 ...) => (VPBROADCASTB512 ...)
+(Broadcast1To64Uint8x16 ...) => (VPBROADCASTB512 ...)
(CeilFloat32x4 x) => (VROUNDPS128 [2] x)
(CeilFloat32x8 x) => (VROUNDPS256 [2] x)
(CeilFloat64x2 x) => (VROUNDPD128 [2] x)
@@ -1424,23 +1424,23 @@
(VMOVDQU16Masked128 (VPAVGW128 x y) mask) => (VPAVGWMasked128 x y mask)
(VMOVDQU16Masked256 (VPAVGW256 x y) mask) => (VPAVGWMasked256 x y mask)
(VMOVDQU16Masked512 (VPAVGW512 x y) mask) => (VPAVGWMasked512 x y mask)
-(VMOVDQU32Masked128 (VBROADCASTSS128 x) mask) => (VBROADCASTSSMasked128 x mask)
(VMOVDQU64Masked128 (VPBROADCASTQ128 x) mask) => (VPBROADCASTQMasked128 x mask)
-(VMOVDQU8Masked128 (VPBROADCASTB128 x) mask) => (VPBROADCASTBMasked128 x mask)
-(VMOVDQU16Masked128 (VPBROADCASTW128 x) mask) => (VPBROADCASTWMasked128 x mask)
+(VMOVDQU32Masked128 (VBROADCASTSS128 x) mask) => (VBROADCASTSSMasked128 x mask)
+(VMOVDQU64Masked256 (VBROADCASTSD256 x) mask) => (VBROADCASTSDMasked256 x mask)
(VMOVDQU32Masked128 (VPBROADCASTD128 x) mask) => (VPBROADCASTDMasked128 x mask)
+(VMOVDQU64Masked256 (VPBROADCASTQ256 x) mask) => (VPBROADCASTQMasked256 x mask)
(VMOVDQU32Masked256 (VBROADCASTSS256 x) mask) => (VBROADCASTSSMasked256 x mask)
-(VMOVDQU64Masked256 (VBROADCASTSD256 x) mask) => (VBROADCASTSDMasked256 x mask)
-(VMOVDQU8Masked256 (VPBROADCASTB256 x) mask) => (VPBROADCASTBMasked256 x mask)
-(VMOVDQU16Masked256 (VPBROADCASTW256 x) mask) => (VPBROADCASTWMasked256 x mask)
+(VMOVDQU64Masked512 (VBROADCASTSD512 x) mask) => (VBROADCASTSDMasked512 x mask)
+(VMOVDQU16Masked128 (VPBROADCASTW128 x) mask) => (VPBROADCASTWMasked128 x mask)
(VMOVDQU32Masked256 (VPBROADCASTD256 x) mask) => (VPBROADCASTDMasked256 x mask)
-(VMOVDQU64Masked256 (VPBROADCASTQ256 x) mask) => (VPBROADCASTQMasked256 x mask)
+(VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask) => (VPBROADCASTQMasked512 x mask)
(VMOVDQU32Masked512 (VBROADCASTSS512 x) mask) => (VBROADCASTSSMasked512 x mask)
-(VMOVDQU64Masked512 (VBROADCASTSD512 x) mask) => (VBROADCASTSDMasked512 x mask)
-(VMOVDQU8Masked512 (VPBROADCASTB512 x) mask) => (VPBROADCASTBMasked512 x mask)
-(VMOVDQU16Masked512 (VPBROADCASTW512 x) mask) => (VPBROADCASTWMasked512 x mask)
+(VMOVDQU8Masked128 (VPBROADCASTB128 x) mask) => (VPBROADCASTBMasked128 x mask)
+(VMOVDQU16Masked256 (VPBROADCASTW256 x) mask) => (VPBROADCASTWMasked256 x mask)
(VMOVDQU32Masked512 (VPBROADCASTD512 x) mask) => (VPBROADCASTDMasked512 x mask)
-(VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask) => (VPBROADCASTQMasked512 x mask)
+(VMOVDQU8Masked256 (VPBROADCASTB256 x) mask) => (VPBROADCASTBMasked256 x mask)
+(VMOVDQU16Masked512 (VPBROADCASTW512 x) mask) => (VPBROADCASTWMasked512 x mask)
+(VMOVDQU8Masked512 (VPBROADCASTB512 x) mask) => (VPBROADCASTBMasked512 x mask)
(VMOVDQU32Masked128 (VRNDSCALEPS128 [a] x) mask) => (VRNDSCALEPSMasked128 [a] x mask)
(VMOVDQU32Masked256 (VRNDSCALEPS256 [a] x) mask) => (VRNDSCALEPSMasked256 [a] x mask)
(VMOVDQU32Masked512 (VRNDSCALEPS512 [a] x) mask) => (VRNDSCALEPSMasked512 [a] x mask)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 889ab0d84f..ff863a389f 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -143,36 +143,36 @@ func simdGenericOps() []opData {
{name: "AverageUint16x8", argLength: 2, commutative: true},
{name: "AverageUint16x16", argLength: 2, commutative: true},
{name: "AverageUint16x32", argLength: 2, commutative: true},
- {name: "Broadcast128Float32x4", argLength: 1, commutative: false},
- {name: "Broadcast128Float64x2", argLength: 1, commutative: false},
- {name: "Broadcast128Int8x16", argLength: 1, commutative: false},
- {name: "Broadcast128Int16x8", argLength: 1, commutative: false},
- {name: "Broadcast128Int32x4", argLength: 1, commutative: false},
- {name: "Broadcast128Int64x2", argLength: 1, commutative: false},
- {name: "Broadcast128Uint8x16", argLength: 1, commutative: false},
- {name: "Broadcast128Uint16x8", argLength: 1, commutative: false},
- {name: "Broadcast128Uint32x4", argLength: 1, commutative: false},
- {name: "Broadcast128Uint64x2", argLength: 1, commutative: false},
- {name: "Broadcast256Float32x4", argLength: 1, commutative: false},
- {name: "Broadcast256Float64x2", argLength: 1, commutative: false},
- {name: "Broadcast256Int8x16", argLength: 1, commutative: false},
- {name: "Broadcast256Int16x8", argLength: 1, commutative: false},
- {name: "Broadcast256Int32x4", argLength: 1, commutative: false},
- {name: "Broadcast256Int64x2", argLength: 1, commutative: false},
- {name: "Broadcast256Uint8x16", argLength: 1, commutative: false},
- {name: "Broadcast256Uint16x8", argLength: 1, commutative: false},
- {name: "Broadcast256Uint32x4", argLength: 1, commutative: false},
- {name: "Broadcast256Uint64x2", argLength: 1, commutative: false},
- {name: "Broadcast512Float32x4", argLength: 1, commutative: false},
- {name: "Broadcast512Float64x2", argLength: 1, commutative: false},
- {name: "Broadcast512Int8x16", argLength: 1, commutative: false},
- {name: "Broadcast512Int16x8", argLength: 1, commutative: false},
- {name: "Broadcast512Int32x4", argLength: 1, commutative: false},
- {name: "Broadcast512Int64x2", argLength: 1, commutative: false},
- {name: "Broadcast512Uint8x16", argLength: 1, commutative: false},
- {name: "Broadcast512Uint16x8", argLength: 1, commutative: false},
- {name: "Broadcast512Uint32x4", argLength: 1, commutative: false},
- {name: "Broadcast512Uint64x2", argLength: 1, commutative: false},
+ {name: "Broadcast1To2Float64x2", argLength: 1, commutative: false},
+ {name: "Broadcast1To2Int64x2", argLength: 1, commutative: false},
+ {name: "Broadcast1To2Uint64x2", argLength: 1, commutative: false},
+ {name: "Broadcast1To4Float32x4", argLength: 1, commutative: false},
+ {name: "Broadcast1To4Float64x2", argLength: 1, commutative: false},
+ {name: "Broadcast1To4Int32x4", argLength: 1, commutative: false},
+ {name: "Broadcast1To4Int64x2", argLength: 1, commutative: false},
+ {name: "Broadcast1To4Uint32x4", argLength: 1, commutative: false},
+ {name: "Broadcast1To4Uint64x2", argLength: 1, commutative: false},
+ {name: "Broadcast1To8Float32x4", argLength: 1, commutative: false},
+ {name: "Broadcast1To8Float64x2", argLength: 1, commutative: false},
+ {name: "Broadcast1To8Int16x8", argLength: 1, commutative: false},
+ {name: "Broadcast1To8Int32x4", argLength: 1, commutative: false},
+ {name: "Broadcast1To8Int64x2", argLength: 1, commutative: false},
+ {name: "Broadcast1To8Uint16x8", argLength: 1, commutative: false},
+ {name: "Broadcast1To8Uint32x4", argLength: 1, commutative: false},
+ {name: "Broadcast1To8Uint64x2", argLength: 1, commutative: false},
+ {name: "Broadcast1To16Float32x4", argLength: 1, commutative: false},
+ {name: "Broadcast1To16Int8x16", argLength: 1, commutative: false},
+ {name: "Broadcast1To16Int16x8", argLength: 1, commutative: false},
+ {name: "Broadcast1To16Int32x4", argLength: 1, commutative: false},
+ {name: "Broadcast1To16Uint8x16", argLength: 1, commutative: false},
+ {name: "Broadcast1To16Uint16x8", argLength: 1, commutative: false},
+ {name: "Broadcast1To16Uint32x4", argLength: 1, commutative: false},
+ {name: "Broadcast1To32Int8x16", argLength: 1, commutative: false},
+ {name: "Broadcast1To32Int16x8", argLength: 1, commutative: false},
+ {name: "Broadcast1To32Uint8x16", argLength: 1, commutative: false},
+ {name: "Broadcast1To32Uint16x8", argLength: 1, commutative: false},
+ {name: "Broadcast1To64Int8x16", argLength: 1, commutative: false},
+ {name: "Broadcast1To64Uint8x16", argLength: 1, commutative: false},
{name: "CeilFloat32x4", argLength: 1, commutative: false},
{name: "CeilFloat32x8", argLength: 1, commutative: false},
{name: "CeilFloat64x2", argLength: 1, commutative: false},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 7b70dc2686..9e5fdb1fc1 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -6309,36 +6309,36 @@ const (
OpAverageUint16x8
OpAverageUint16x16
OpAverageUint16x32
- OpBroadcast128Float32x4
- OpBroadcast128Float64x2
- OpBroadcast128Int8x16
- OpBroadcast128Int16x8
- OpBroadcast128Int32x4
- OpBroadcast128Int64x2
- OpBroadcast128Uint8x16
- OpBroadcast128Uint16x8
- OpBroadcast128Uint32x4
- OpBroadcast128Uint64x2
- OpBroadcast256Float32x4
- OpBroadcast256Float64x2
- OpBroadcast256Int8x16
- OpBroadcast256Int16x8
- OpBroadcast256Int32x4
- OpBroadcast256Int64x2
- OpBroadcast256Uint8x16
- OpBroadcast256Uint16x8
- OpBroadcast256Uint32x4
- OpBroadcast256Uint64x2
- OpBroadcast512Float32x4
- OpBroadcast512Float64x2
- OpBroadcast512Int8x16
- OpBroadcast512Int16x8
- OpBroadcast512Int32x4
- OpBroadcast512Int64x2
- OpBroadcast512Uint8x16
- OpBroadcast512Uint16x8
- OpBroadcast512Uint32x4
- OpBroadcast512Uint64x2
+ OpBroadcast1To2Float64x2
+ OpBroadcast1To2Int64x2
+ OpBroadcast1To2Uint64x2
+ OpBroadcast1To4Float32x4
+ OpBroadcast1To4Float64x2
+ OpBroadcast1To4Int32x4
+ OpBroadcast1To4Int64x2
+ OpBroadcast1To4Uint32x4
+ OpBroadcast1To4Uint64x2
+ OpBroadcast1To8Float32x4
+ OpBroadcast1To8Float64x2
+ OpBroadcast1To8Int16x8
+ OpBroadcast1To8Int32x4
+ OpBroadcast1To8Int64x2
+ OpBroadcast1To8Uint16x8
+ OpBroadcast1To8Uint32x4
+ OpBroadcast1To8Uint64x2
+ OpBroadcast1To16Float32x4
+ OpBroadcast1To16Int8x16
+ OpBroadcast1To16Int16x8
+ OpBroadcast1To16Int32x4
+ OpBroadcast1To16Uint8x16
+ OpBroadcast1To16Uint16x8
+ OpBroadcast1To16Uint32x4
+ OpBroadcast1To32Int8x16
+ OpBroadcast1To32Int16x8
+ OpBroadcast1To32Uint8x16
+ OpBroadcast1To32Uint16x8
+ OpBroadcast1To64Int8x16
+ OpBroadcast1To64Uint8x16
OpCeilFloat32x4
OpCeilFloat32x8
OpCeilFloat64x2
@@ -89875,152 +89875,152 @@ var opcodeTable = [...]opInfo{
generic: true,
},
{
- name: "Broadcast128Float32x4",
+ name: "Broadcast1To2Float64x2",
argLen: 1,
generic: true,
},
{
- name: "Broadcast128Float64x2",
+ name: "Broadcast1To2Int64x2",
argLen: 1,
generic: true,
},
{
- name: "Broadcast128Int8x16",
+ name: "Broadcast1To2Uint64x2",
argLen: 1,
generic: true,
},
{
- name: "Broadcast128Int16x8",
+ name: "Broadcast1To4Float32x4",
argLen: 1,
generic: true,
},
{
- name: "Broadcast128Int32x4",
+ name: "Broadcast1To4Float64x2",
argLen: 1,
generic: true,
},
{
- name: "Broadcast128Int64x2",
+ name: "Broadcast1To4Int32x4",
argLen: 1,
generic: true,
},
{
- name: "Broadcast128Uint8x16",
+ name: "Broadcast1To4Int64x2",
argLen: 1,
generic: true,
},
{
- name: "Broadcast128Uint16x8",
+ name: "Broadcast1To4Uint32x4",
argLen: 1,
generic: true,
},
{
- name: "Broadcast128Uint32x4",
+ name: "Broadcast1To4Uint64x2",
argLen: 1,
generic: true,
},
{
- name: "Broadcast128Uint64x2",
+ name: "Broadcast1To8Float32x4",
argLen: 1,
generic: true,
},
{
- name: "Broadcast256Float32x4",
+ name: "Broadcast1To8Float64x2",
argLen: 1,
generic: true,
},
{
- name: "Broadcast256Float64x2",
+ name: "Broadcast1To8Int16x8",
argLen: 1,
generic: true,
},
{
- name: "Broadcast256Int8x16",
+ name: "Broadcast1To8Int32x4",
argLen: 1,
generic: true,
},
{
- name: "Broadcast256Int16x8",
+ name: "Broadcast1To8Int64x2",
argLen: 1,
generic: true,
},
{
- name: "Broadcast256Int32x4",
+ name: "Broadcast1To8Uint16x8",
argLen: 1,
generic: true,
},
{
- name: "Broadcast256Int64x2",
+ name: "Broadcast1To8Uint32x4",
argLen: 1,
generic: true,
},
{
- name: "Broadcast256Uint8x16",
+ name: "Broadcast1To8Uint64x2",
argLen: 1,
generic: true,
},
{
- name: "Broadcast256Uint16x8",
+ name: "Broadcast1To16Float32x4",
argLen: 1,
generic: true,
},
{
- name: "Broadcast256Uint32x4",
+ name: "Broadcast1To16Int8x16",
argLen: 1,
generic: true,
},
{
- name: "Broadcast256Uint64x2",
+ name: "Broadcast1To16Int16x8",
argLen: 1,
generic: true,
},
{
- name: "Broadcast512Float32x4",
+ name: "Broadcast1To16Int32x4",
argLen: 1,
generic: true,
},
{
- name: "Broadcast512Float64x2",
+ name: "Broadcast1To16Uint8x16",
argLen: 1,
generic: true,
},
{
- name: "Broadcast512Int8x16",
+ name: "Broadcast1To16Uint16x8",
argLen: 1,
generic: true,
},
{
- name: "Broadcast512Int16x8",
+ name: "Broadcast1To16Uint32x4",
argLen: 1,
generic: true,
},
{
- name: "Broadcast512Int32x4",
+ name: "Broadcast1To32Int8x16",
argLen: 1,
generic: true,
},
{
- name: "Broadcast512Int64x2",
+ name: "Broadcast1To32Int16x8",
argLen: 1,
generic: true,
},
{
- name: "Broadcast512Uint8x16",
+ name: "Broadcast1To32Uint8x16",
argLen: 1,
generic: true,
},
{
- name: "Broadcast512Uint16x8",
+ name: "Broadcast1To32Uint16x8",
argLen: 1,
generic: true,
},
{
- name: "Broadcast512Uint32x4",
+ name: "Broadcast1To64Int8x16",
argLen: 1,
generic: true,
},
{
- name: "Broadcast512Uint64x2",
+ name: "Broadcast1To64Uint8x16",
argLen: 1,
generic: true,
},
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index e84bf19c83..fe0005bb05 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2479,96 +2479,96 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpBitLen64(v)
case OpBitLen8:
return rewriteValueAMD64_OpBitLen8(v)
- case OpBroadcast128Float32x4:
- v.Op = OpAMD64VBROADCASTSS128
+ case OpBroadcast1To16Float32x4:
+ v.Op = OpAMD64VBROADCASTSS512
return true
- case OpBroadcast128Float64x2:
- v.Op = OpAMD64VPBROADCASTQ128
+ case OpBroadcast1To16Int16x8:
+ v.Op = OpAMD64VPBROADCASTW256
return true
- case OpBroadcast128Int16x8:
- v.Op = OpAMD64VPBROADCASTW128
+ case OpBroadcast1To16Int32x4:
+ v.Op = OpAMD64VPBROADCASTD512
return true
- case OpBroadcast128Int32x4:
- v.Op = OpAMD64VPBROADCASTD128
+ case OpBroadcast1To16Int8x16:
+ v.Op = OpAMD64VPBROADCASTB128
return true
- case OpBroadcast128Int64x2:
- v.Op = OpAMD64VPBROADCASTQ128
+ case OpBroadcast1To16Uint16x8:
+ v.Op = OpAMD64VPBROADCASTW256
return true
- case OpBroadcast128Int8x16:
+ case OpBroadcast1To16Uint32x4:
+ v.Op = OpAMD64VPBROADCASTD512
+ return true
+ case OpBroadcast1To16Uint8x16:
v.Op = OpAMD64VPBROADCASTB128
return true
- case OpBroadcast128Uint16x8:
- v.Op = OpAMD64VPBROADCASTW128
+ case OpBroadcast1To2Float64x2:
+ v.Op = OpAMD64VPBROADCASTQ128
return true
- case OpBroadcast128Uint32x4:
- v.Op = OpAMD64VPBROADCASTD128
+ case OpBroadcast1To2Int64x2:
+ v.Op = OpAMD64VPBROADCASTQ128
return true
- case OpBroadcast128Uint64x2:
+ case OpBroadcast1To2Uint64x2:
v.Op = OpAMD64VPBROADCASTQ128
return true
- case OpBroadcast128Uint8x16:
- v.Op = OpAMD64VPBROADCASTB128
+ case OpBroadcast1To32Int16x8:
+ v.Op = OpAMD64VPBROADCASTW512
return true
- case OpBroadcast256Float32x4:
- v.Op = OpAMD64VBROADCASTSS256
+ case OpBroadcast1To32Int8x16:
+ v.Op = OpAMD64VPBROADCASTB256
return true
- case OpBroadcast256Float64x2:
- v.Op = OpAMD64VBROADCASTSD256
+ case OpBroadcast1To32Uint16x8:
+ v.Op = OpAMD64VPBROADCASTW512
return true
- case OpBroadcast256Int16x8:
- v.Op = OpAMD64VPBROADCASTW256
+ case OpBroadcast1To32Uint8x16:
+ v.Op = OpAMD64VPBROADCASTB256
return true
- case OpBroadcast256Int32x4:
- v.Op = OpAMD64VPBROADCASTD256
+ case OpBroadcast1To4Float32x4:
+ v.Op = OpAMD64VBROADCASTSS128
return true
- case OpBroadcast256Int64x2:
- v.Op = OpAMD64VPBROADCASTQ256
+ case OpBroadcast1To4Float64x2:
+ v.Op = OpAMD64VBROADCASTSD256
return true
- case OpBroadcast256Int8x16:
- v.Op = OpAMD64VPBROADCASTB256
+ case OpBroadcast1To4Int32x4:
+ v.Op = OpAMD64VPBROADCASTD128
return true
- case OpBroadcast256Uint16x8:
- v.Op = OpAMD64VPBROADCASTW256
+ case OpBroadcast1To4Int64x2:
+ v.Op = OpAMD64VPBROADCASTQ256
return true
- case OpBroadcast256Uint32x4:
- v.Op = OpAMD64VPBROADCASTD256
+ case OpBroadcast1To4Uint32x4:
+ v.Op = OpAMD64VPBROADCASTD128
return true
- case OpBroadcast256Uint64x2:
+ case OpBroadcast1To4Uint64x2:
v.Op = OpAMD64VPBROADCASTQ256
return true
- case OpBroadcast256Uint8x16:
- v.Op = OpAMD64VPBROADCASTB256
+ case OpBroadcast1To64Int8x16:
+ v.Op = OpAMD64VPBROADCASTB512
return true
- case OpBroadcast512Float32x4:
- v.Op = OpAMD64VBROADCASTSS512
+ case OpBroadcast1To64Uint8x16:
+ v.Op = OpAMD64VPBROADCASTB512
+ return true
+ case OpBroadcast1To8Float32x4:
+ v.Op = OpAMD64VBROADCASTSS256
return true
- case OpBroadcast512Float64x2:
+ case OpBroadcast1To8Float64x2:
v.Op = OpAMD64VBROADCASTSD512
return true
- case OpBroadcast512Int16x8:
- v.Op = OpAMD64VPBROADCASTW512
+ case OpBroadcast1To8Int16x8:
+ v.Op = OpAMD64VPBROADCASTW128
return true
- case OpBroadcast512Int32x4:
- v.Op = OpAMD64VPBROADCASTD512
+ case OpBroadcast1To8Int32x4:
+ v.Op = OpAMD64VPBROADCASTD256
return true
- case OpBroadcast512Int64x2:
+ case OpBroadcast1To8Int64x2:
v.Op = OpAMD64VPBROADCASTQ512
return true
- case OpBroadcast512Int8x16:
- v.Op = OpAMD64VPBROADCASTB512
- return true
- case OpBroadcast512Uint16x8:
- v.Op = OpAMD64VPBROADCASTW512
+ case OpBroadcast1To8Uint16x8:
+ v.Op = OpAMD64VPBROADCASTW128
return true
- case OpBroadcast512Uint32x4:
- v.Op = OpAMD64VPBROADCASTD512
+ case OpBroadcast1To8Uint32x4:
+ v.Op = OpAMD64VPBROADCASTD256
return true
- case OpBroadcast512Uint64x2:
+ case OpBroadcast1To8Uint64x2:
v.Op = OpAMD64VPBROADCASTQ512
return true
- case OpBroadcast512Uint8x16:
- v.Op = OpAMD64VPBROADCASTB512
- return true
case OpBswap16:
return rewriteValueAMD64_OpBswap16(v)
case OpBswap32:
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index 4ad0c6032c..e50561845b 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -152,36 +152,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint16x8.Average", opLen2(ssa.OpAverageUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.Average", opLen2(ssa.OpAverageUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.Average", opLen2(ssa.OpAverageUint16x32, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float32x4.Broadcast128", opLen1(ssa.OpBroadcast128Float32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float64x2.Broadcast128", opLen1(ssa.OpBroadcast128Float64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int8x16.Broadcast128", opLen1(ssa.OpBroadcast128Int8x16, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x8.Broadcast128", opLen1(ssa.OpBroadcast128Int16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x4.Broadcast128", opLen1(ssa.OpBroadcast128Int32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int64x2.Broadcast128", opLen1(ssa.OpBroadcast128Int64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint8x16.Broadcast128", opLen1(ssa.OpBroadcast128Uint8x16, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint16x8.Broadcast128", opLen1(ssa.OpBroadcast128Uint16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x4.Broadcast128", opLen1(ssa.OpBroadcast128Uint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint64x2.Broadcast128", opLen1(ssa.OpBroadcast128Uint64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float32x4.Broadcast256", opLen1(ssa.OpBroadcast256Float32x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Float64x2.Broadcast256", opLen1(ssa.OpBroadcast256Float64x2, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int8x16.Broadcast256", opLen1(ssa.OpBroadcast256Int8x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x8.Broadcast256", opLen1(ssa.OpBroadcast256Int16x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x4.Broadcast256", opLen1(ssa.OpBroadcast256Int32x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int64x2.Broadcast256", opLen1(ssa.OpBroadcast256Int64x2, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint8x16.Broadcast256", opLen1(ssa.OpBroadcast256Uint8x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint16x8.Broadcast256", opLen1(ssa.OpBroadcast256Uint16x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x4.Broadcast256", opLen1(ssa.OpBroadcast256Uint32x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint64x2.Broadcast256", opLen1(ssa.OpBroadcast256Uint64x2, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Float32x4.Broadcast512", opLen1(ssa.OpBroadcast512Float32x4, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float64x2.Broadcast512", opLen1(ssa.OpBroadcast512Float64x2, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int8x16.Broadcast512", opLen1(ssa.OpBroadcast512Int8x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.Broadcast512", opLen1(ssa.OpBroadcast512Int16x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int32x4.Broadcast512", opLen1(ssa.OpBroadcast512Int32x4, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int64x2.Broadcast512", opLen1(ssa.OpBroadcast512Int64x2, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint8x16.Broadcast512", opLen1(ssa.OpBroadcast512Uint8x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint16x8.Broadcast512", opLen1(ssa.OpBroadcast512Uint16x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint32x4.Broadcast512", opLen1(ssa.OpBroadcast512Uint32x4, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint64x2.Broadcast512", opLen1(ssa.OpBroadcast512Uint64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Float64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Int64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Uint64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Float32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Float64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Float32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Float64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Float32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x8.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int16x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint16x8.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint16x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Int8x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x8.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Int16x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Uint8x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x8.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Uint16x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.Broadcast1To64", opLen1(ssa.OpBroadcast1To64Int8x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.Broadcast1To64", opLen1(ssa.OpBroadcast1To64Uint8x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Ceil", opLen1(ssa.OpCeilFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64)
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
index 38bc9374cc..3cba01ef95 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
@@ -69,21 +69,36 @@
documentation: !string |-
// NAME performs an expansion on a vector x whose elements are packed to lower parts.
// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-- go: Broadcast128
+- go: Broadcast1To2
commutative: false
documentation: !string |-
- // NAME copies element zero of its (128-bit) input to all elements of
- // the 128-bit output vector.
-- go: Broadcast256
+ // NAME copies the lowest element of its input to all 2 elements of
+ // the output vector.
+- go: Broadcast1To4
commutative: false
documentation: !string |-
- // NAME copies element zero of its (128-bit) input to all elements of
- // the 256-bit output vector.
-- go: Broadcast512
+ // NAME copies the lowest element of its input to all 4 elements of
+ // the output vector.
+- go: Broadcast1To8
commutative: false
documentation: !string |-
- // NAME copies element zero of its (128-bit) input to all elements of
- // the 512-bit output vector.
+ // NAME copies the lowest element of its input to all 8 elements of
+ // the output vector.
+- go: Broadcast1To16
+ commutative: false
+ documentation: !string |-
+ // NAME copies the lowest element of its input to all 16 elements of
+ // the output vector.
+- go: Broadcast1To32
+ commutative: false
+ documentation: !string |-
+ // NAME copies the lowest element of its input to all 32 elements of
+ // the output vector.
+- go: Broadcast1To64
+ commutative: false
+ documentation: !string |-
+ // NAME copies the lowest element of its input to all 64 elements of
+ // the output vector.
- go: PermuteOrZeroGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
index e1fd184ed7..02daa2ea1e 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
@@ -376,21 +376,21 @@
out:
- *any
-- go: Broadcast128
- asm: VPBROADCAST[BWDQ]
+- go: Broadcast1To2
+ asm: VPBROADCASTQ
in:
- class: vreg
bits: 128
- elemBits: $e
+ elemBits: 64
base: $b
out:
- class: vreg
bits: 128
- elemBits: $e
+ elemBits: 64
base: $b
# weirdly, this one case on AVX2 is memory-operand-only
-- go: Broadcast128
+- go: Broadcast1To2
asm: VPBROADCASTQ
in:
- class: vreg
@@ -405,71 +405,94 @@
base: int
OverwriteBase: float
-- go: Broadcast256
+- go: Broadcast1To4
asm: VPBROADCAST[BWDQ]
in:
- class: vreg
bits: 128
- elemBits: $e
base: $b
out:
- class: vreg
- bits: 256
- elemBits: $e
+ lanes: 4
base: $b
-- go: Broadcast512
+- go: Broadcast1To8
asm: VPBROADCAST[BWDQ]
in:
- class: vreg
bits: 128
- elemBits: $e
base: $b
out:
- class: vreg
- bits: 512
- elemBits: $e
+ lanes: 8
base: $b
-- go: Broadcast128
- asm: VBROADCASTS[SD]
+- go: Broadcast1To16
+ asm: VPBROADCAST[BWDQ]
in:
- class: vreg
bits: 128
- elemBits: $e
base: $b
out:
- class: vreg
- bits: 128
- elemBits: $e
+ lanes: 16
base: $b
-- go: Broadcast256
- asm: VBROADCASTS[SD]
+- go: Broadcast1To32
+ asm: VPBROADCAST[BWDQ]
in:
- class: vreg
bits: 128
- elemBits: $e
base: $b
out:
- class: vreg
- bits: 256
- elemBits: $e
+ lanes: 32
base: $b
-- go: Broadcast512
- asm: VBROADCASTS[SD]
+- go: Broadcast1To64
+ asm: VPBROADCASTB
in:
- class: vreg
bits: 128
- elemBits: $e
base: $b
out:
- class: vreg
- bits: 512
- elemBits: $e
+ lanes: 64
base: $b
+- go: Broadcast1To4
+ asm: VBROADCASTS[SD]
+ in:
+ - class: vreg
+ bits: 128
+ base: float
+ out:
+ - class: vreg
+ lanes: 4
+ base: float
+
+- go: Broadcast1To8
+ asm: VBROADCASTS[SD]
+ in:
+ - class: vreg
+ bits: 128
+ base: float
+ out:
+ - class: vreg
+ lanes: 8
+ base: float
+
+- go: Broadcast1To16
+ asm: VBROADCASTS[SD]
+ in:
+ - class: vreg
+ bits: 128
+ base: float
+ out:
+ - class: vreg
+ lanes: 16
+ base: float
+
# VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX)
- go: PermuteOrZero
asm: VPSHUFB
diff --git a/src/simd/archsimd/_gen/tmplgen/main.go b/src/simd/archsimd/_gen/tmplgen/main.go
index 8db185e1e0..45338b765d 100644
--- a/src/simd/archsimd/_gen/tmplgen/main.go
+++ b/src/simd/archsimd/_gen/tmplgen/main.go
@@ -873,7 +873,7 @@ var broadcastTemplate = templateOf("Broadcast functions", `
// Emulated, CPU Feature: {{.CPUfeatureBC}}
func Broadcast{{.VType}}(x {{.Etype}}) {{.VType}} {
var z {{.As128BitVec }}
- return z.SetElem(0, x).Broadcast{{.Vwidth}}()
+ return z.SetElem(0, x).Broadcast1To{{.Count}}()
}
`)
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index eba340c793..bb162c4ff9 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -805,191 +805,197 @@ func (x Uint16x16) Average(y Uint16x16) Uint16x16
// Asm: VPAVGW, CPU Feature: AVX512
func (x Uint16x32) Average(y Uint16x32) Uint16x32
-/* Broadcast128 */
+/* Broadcast1To2 */
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
//
-// Asm: VBROADCASTSS, CPU Feature: AVX2
-func (x Float32x4) Broadcast128() Float32x4
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Float64x2) Broadcast1To2() Float64x2
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
//
// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Float64x2) Broadcast128() Float64x2
+func (x Int64x2) Broadcast1To2() Int64x2
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Int8x16) Broadcast128() Int8x16
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Uint64x2) Broadcast1To2() Uint64x2
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Int16x8) Broadcast128() Int16x8
+/* Broadcast1To4 */
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Int32x4) Broadcast128() Int32x4
+// Asm: VBROADCASTSS, CPU Feature: AVX2
+func (x Float32x4) Broadcast1To4() Float32x4
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Int64x2) Broadcast128() Int64x2
+// Asm: VBROADCASTSD, CPU Feature: AVX2
+func (x Float64x2) Broadcast1To4() Float64x4
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Uint8x16) Broadcast128() Uint8x16
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Int32x4) Broadcast1To4() Int32x4
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Uint16x8) Broadcast128() Uint16x8
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Int64x2) Broadcast1To4() Int64x4
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
//
// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Uint32x4) Broadcast128() Uint32x4
+func (x Uint32x4) Broadcast1To4() Uint32x4
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
//
// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Uint64x2) Broadcast128() Uint64x2
+func (x Uint64x2) Broadcast1To4() Uint64x4
-/* Broadcast256 */
+/* Broadcast1To8 */
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
//
// Asm: VBROADCASTSS, CPU Feature: AVX2
-func (x Float32x4) Broadcast256() Float32x8
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VBROADCASTSD, CPU Feature: AVX2
-func (x Float64x2) Broadcast256() Float64x4
+func (x Float32x4) Broadcast1To8() Float32x8
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Int8x16) Broadcast256() Int8x32
+// Asm: VBROADCASTSD, CPU Feature: AVX512
+func (x Float64x2) Broadcast1To8() Float64x8
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
//
// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Int16x8) Broadcast256() Int16x16
+func (x Int16x8) Broadcast1To8() Int16x8
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
//
// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Int32x4) Broadcast256() Int32x8
+func (x Int32x4) Broadcast1To8() Int32x8
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Int64x2) Broadcast256() Int64x4
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Uint8x16) Broadcast256() Uint8x32
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) Broadcast1To8() Int64x8
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
//
// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Uint16x8) Broadcast256() Uint16x16
+func (x Uint16x8) Broadcast1To8() Uint16x8
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
//
// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Uint32x4) Broadcast256() Uint32x8
+func (x Uint32x4) Broadcast1To8() Uint32x8
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Uint64x2) Broadcast256() Uint64x4
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) Broadcast1To8() Uint64x8
-/* Broadcast512 */
+/* Broadcast1To16 */
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
//
// Asm: VBROADCASTSS, CPU Feature: AVX512
-func (x Float32x4) Broadcast512() Float32x16
+func (x Float32x4) Broadcast1To16() Float32x16
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
//
-// Asm: VBROADCASTSD, CPU Feature: AVX512
-func (x Float64x2) Broadcast512() Float64x8
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Int8x16) Broadcast1To16() Int8x16
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
//
-// Asm: VPBROADCASTB, CPU Feature: AVX512
-func (x Int8x16) Broadcast512() Int8x64
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Int16x8) Broadcast1To16() Int16x16
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
//
-// Asm: VPBROADCASTW, CPU Feature: AVX512
-func (x Int16x8) Broadcast512() Int16x32
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) Broadcast1To16() Int32x16
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Uint8x16) Broadcast1To16() Uint8x16
+
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Uint16x8) Broadcast1To16() Uint16x16
+
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
//
// Asm: VPBROADCASTD, CPU Feature: AVX512
-func (x Int32x4) Broadcast512() Int32x16
+func (x Uint32x4) Broadcast1To16() Uint32x16
+
+/* Broadcast1To32 */
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
//
-// Asm: VPBROADCASTQ, CPU Feature: AVX512
-func (x Int64x2) Broadcast512() Int64x8
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Int8x16) Broadcast1To32() Int8x32
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
//
-// Asm: VPBROADCASTB, CPU Feature: AVX512
-func (x Uint8x16) Broadcast512() Uint8x64
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) Broadcast1To32() Int16x32
+
+// Broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Uint8x16) Broadcast1To32() Uint8x32
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
//
// Asm: VPBROADCASTW, CPU Feature: AVX512
-func (x Uint16x8) Broadcast512() Uint16x32
+func (x Uint16x8) Broadcast1To32() Uint16x32
+
+/* Broadcast1To64 */
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To64 copies the lowest element of its input to all 64 elements of
+// the output vector.
//
-// Asm: VPBROADCASTD, CPU Feature: AVX512
-func (x Uint32x4) Broadcast512() Uint32x16
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) Broadcast1To64() Int8x64
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To64 copies the lowest element of its input to all 64 elements of
+// the output vector.
//
-// Asm: VPBROADCASTQ, CPU Feature: AVX512
-func (x Uint64x2) Broadcast512() Uint64x8
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) Broadcast1To64() Uint8x64
/* Ceil */
diff --git a/src/simd/archsimd/other_gen_amd64.go b/src/simd/archsimd/other_gen_amd64.go
index 647001acce..c250dc2436 100644
--- a/src/simd/archsimd/other_gen_amd64.go
+++ b/src/simd/archsimd/other_gen_amd64.go
@@ -10,7 +10,7 @@ package archsimd
// Emulated, CPU Feature: AVX2
func BroadcastInt8x16(x int8) Int8x16 {
var z Int8x16
- return z.SetElem(0, x).Broadcast128()
+ return z.SetElem(0, x).Broadcast1To16()
}
// BroadcastInt16x8 returns a vector with the input
@@ -19,7 +19,7 @@ func BroadcastInt8x16(x int8) Int8x16 {
// Emulated, CPU Feature: AVX2
func BroadcastInt16x8(x int16) Int16x8 {
var z Int16x8
- return z.SetElem(0, x).Broadcast128()
+ return z.SetElem(0, x).Broadcast1To8()
}
// BroadcastInt32x4 returns a vector with the input
@@ -28,7 +28,7 @@ func BroadcastInt16x8(x int16) Int16x8 {
// Emulated, CPU Feature: AVX2
func BroadcastInt32x4(x int32) Int32x4 {
var z Int32x4
- return z.SetElem(0, x).Broadcast128()
+ return z.SetElem(0, x).Broadcast1To4()
}
// BroadcastInt64x2 returns a vector with the input
@@ -37,7 +37,7 @@ func BroadcastInt32x4(x int32) Int32x4 {
// Emulated, CPU Feature: AVX2
func BroadcastInt64x2(x int64) Int64x2 {
var z Int64x2
- return z.SetElem(0, x).Broadcast128()
+ return z.SetElem(0, x).Broadcast1To2()
}
// BroadcastUint8x16 returns a vector with the input
@@ -46,7 +46,7 @@ func BroadcastInt64x2(x int64) Int64x2 {
// Emulated, CPU Feature: AVX2
func BroadcastUint8x16(x uint8) Uint8x16 {
var z Uint8x16
- return z.SetElem(0, x).Broadcast128()
+ return z.SetElem(0, x).Broadcast1To16()
}
// BroadcastUint16x8 returns a vector with the input
@@ -55,7 +55,7 @@ func BroadcastUint8x16(x uint8) Uint8x16 {
// Emulated, CPU Feature: AVX2
func BroadcastUint16x8(x uint16) Uint16x8 {
var z Uint16x8
- return z.SetElem(0, x).Broadcast128()
+ return z.SetElem(0, x).Broadcast1To8()
}
// BroadcastUint32x4 returns a vector with the input
@@ -64,7 +64,7 @@ func BroadcastUint16x8(x uint16) Uint16x8 {
// Emulated, CPU Feature: AVX2
func BroadcastUint32x4(x uint32) Uint32x4 {
var z Uint32x4
- return z.SetElem(0, x).Broadcast128()
+ return z.SetElem(0, x).Broadcast1To4()
}
// BroadcastUint64x2 returns a vector with the input
@@ -73,7 +73,7 @@ func BroadcastUint32x4(x uint32) Uint32x4 {
// Emulated, CPU Feature: AVX2
func BroadcastUint64x2(x uint64) Uint64x2 {
var z Uint64x2
- return z.SetElem(0, x).Broadcast128()
+ return z.SetElem(0, x).Broadcast1To2()
}
// BroadcastFloat32x4 returns a vector with the input
@@ -82,7 +82,7 @@ func BroadcastUint64x2(x uint64) Uint64x2 {
// Emulated, CPU Feature: AVX2
func BroadcastFloat32x4(x float32) Float32x4 {
var z Float32x4
- return z.SetElem(0, x).Broadcast128()
+ return z.SetElem(0, x).Broadcast1To4()
}
// BroadcastFloat64x2 returns a vector with the input
@@ -91,7 +91,7 @@ func BroadcastFloat32x4(x float32) Float32x4 {
// Emulated, CPU Feature: AVX2
func BroadcastFloat64x2(x float64) Float64x2 {
var z Float64x2
- return z.SetElem(0, x).Broadcast128()
+ return z.SetElem(0, x).Broadcast1To2()
}
// BroadcastInt8x32 returns a vector with the input
@@ -100,7 +100,7 @@ func BroadcastFloat64x2(x float64) Float64x2 {
// Emulated, CPU Feature: AVX2
func BroadcastInt8x32(x int8) Int8x32 {
var z Int8x16
- return z.SetElem(0, x).Broadcast256()
+ return z.SetElem(0, x).Broadcast1To32()
}
// BroadcastInt16x16 returns a vector with the input
@@ -109,7 +109,7 @@ func BroadcastInt8x32(x int8) Int8x32 {
// Emulated, CPU Feature: AVX2
func BroadcastInt16x16(x int16) Int16x16 {
var z Int16x8
- return z.SetElem(0, x).Broadcast256()
+ return z.SetElem(0, x).Broadcast1To16()
}
// BroadcastInt32x8 returns a vector with the input
@@ -118,7 +118,7 @@ func BroadcastInt16x16(x int16) Int16x16 {
// Emulated, CPU Feature: AVX2
func BroadcastInt32x8(x int32) Int32x8 {
var z Int32x4
- return z.SetElem(0, x).Broadcast256()
+ return z.SetElem(0, x).Broadcast1To8()
}
// BroadcastInt64x4 returns a vector with the input
@@ -127,7 +127,7 @@ func BroadcastInt32x8(x int32) Int32x8 {
// Emulated, CPU Feature: AVX2
func BroadcastInt64x4(x int64) Int64x4 {
var z Int64x2
- return z.SetElem(0, x).Broadcast256()
+ return z.SetElem(0, x).Broadcast1To4()
}
// BroadcastUint8x32 returns a vector with the input
@@ -136,7 +136,7 @@ func BroadcastInt64x4(x int64) Int64x4 {
// Emulated, CPU Feature: AVX2
func BroadcastUint8x32(x uint8) Uint8x32 {
var z Uint8x16
- return z.SetElem(0, x).Broadcast256()
+ return z.SetElem(0, x).Broadcast1To32()
}
// BroadcastUint16x16 returns a vector with the input
@@ -145,7 +145,7 @@ func BroadcastUint8x32(x uint8) Uint8x32 {
// Emulated, CPU Feature: AVX2
func BroadcastUint16x16(x uint16) Uint16x16 {
var z Uint16x8
- return z.SetElem(0, x).Broadcast256()
+ return z.SetElem(0, x).Broadcast1To16()
}
// BroadcastUint32x8 returns a vector with the input
@@ -154,7 +154,7 @@ func BroadcastUint16x16(x uint16) Uint16x16 {
// Emulated, CPU Feature: AVX2
func BroadcastUint32x8(x uint32) Uint32x8 {
var z Uint32x4
- return z.SetElem(0, x).Broadcast256()
+ return z.SetElem(0, x).Broadcast1To8()
}
// BroadcastUint64x4 returns a vector with the input
@@ -163,7 +163,7 @@ func BroadcastUint32x8(x uint32) Uint32x8 {
// Emulated, CPU Feature: AVX2
func BroadcastUint64x4(x uint64) Uint64x4 {
var z Uint64x2
- return z.SetElem(0, x).Broadcast256()
+ return z.SetElem(0, x).Broadcast1To4()
}
// BroadcastFloat32x8 returns a vector with the input
@@ -172,7 +172,7 @@ func BroadcastUint64x4(x uint64) Uint64x4 {
// Emulated, CPU Feature: AVX2
func BroadcastFloat32x8(x float32) Float32x8 {
var z Float32x4
- return z.SetElem(0, x).Broadcast256()
+ return z.SetElem(0, x).Broadcast1To8()
}
// BroadcastFloat64x4 returns a vector with the input
@@ -181,7 +181,7 @@ func BroadcastFloat32x8(x float32) Float32x8 {
// Emulated, CPU Feature: AVX2
func BroadcastFloat64x4(x float64) Float64x4 {
var z Float64x2
- return z.SetElem(0, x).Broadcast256()
+ return z.SetElem(0, x).Broadcast1To4()
}
// BroadcastInt8x64 returns a vector with the input
@@ -190,7 +190,7 @@ func BroadcastFloat64x4(x float64) Float64x4 {
// Emulated, CPU Feature: AVX512BW
func BroadcastInt8x64(x int8) Int8x64 {
var z Int8x16
- return z.SetElem(0, x).Broadcast512()
+ return z.SetElem(0, x).Broadcast1To64()
}
// BroadcastInt16x32 returns a vector with the input
@@ -199,7 +199,7 @@ func BroadcastInt8x64(x int8) Int8x64 {
// Emulated, CPU Feature: AVX512BW
func BroadcastInt16x32(x int16) Int16x32 {
var z Int16x8
- return z.SetElem(0, x).Broadcast512()
+ return z.SetElem(0, x).Broadcast1To32()
}
// BroadcastInt32x16 returns a vector with the input
@@ -208,7 +208,7 @@ func BroadcastInt16x32(x int16) Int16x32 {
// Emulated, CPU Feature: AVX512F
func BroadcastInt32x16(x int32) Int32x16 {
var z Int32x4
- return z.SetElem(0, x).Broadcast512()
+ return z.SetElem(0, x).Broadcast1To16()
}
// BroadcastInt64x8 returns a vector with the input
@@ -217,7 +217,7 @@ func BroadcastInt32x16(x int32) Int32x16 {
// Emulated, CPU Feature: AVX512F
func BroadcastInt64x8(x int64) Int64x8 {
var z Int64x2
- return z.SetElem(0, x).Broadcast512()
+ return z.SetElem(0, x).Broadcast1To8()
}
// BroadcastUint8x64 returns a vector with the input
@@ -226,7 +226,7 @@ func BroadcastInt64x8(x int64) Int64x8 {
// Emulated, CPU Feature: AVX512BW
func BroadcastUint8x64(x uint8) Uint8x64 {
var z Uint8x16
- return z.SetElem(0, x).Broadcast512()
+ return z.SetElem(0, x).Broadcast1To64()
}
// BroadcastUint16x32 returns a vector with the input
@@ -235,7 +235,7 @@ func BroadcastUint8x64(x uint8) Uint8x64 {
// Emulated, CPU Feature: AVX512BW
func BroadcastUint16x32(x uint16) Uint16x32 {
var z Uint16x8
- return z.SetElem(0, x).Broadcast512()
+ return z.SetElem(0, x).Broadcast1To32()
}
// BroadcastUint32x16 returns a vector with the input
@@ -244,7 +244,7 @@ func BroadcastUint16x32(x uint16) Uint16x32 {
// Emulated, CPU Feature: AVX512F
func BroadcastUint32x16(x uint32) Uint32x16 {
var z Uint32x4
- return z.SetElem(0, x).Broadcast512()
+ return z.SetElem(0, x).Broadcast1To16()
}
// BroadcastUint64x8 returns a vector with the input
@@ -253,7 +253,7 @@ func BroadcastUint32x16(x uint32) Uint32x16 {
// Emulated, CPU Feature: AVX512F
func BroadcastUint64x8(x uint64) Uint64x8 {
var z Uint64x2
- return z.SetElem(0, x).Broadcast512()
+ return z.SetElem(0, x).Broadcast1To8()
}
// BroadcastFloat32x16 returns a vector with the input
@@ -262,7 +262,7 @@ func BroadcastUint64x8(x uint64) Uint64x8 {
// Emulated, CPU Feature: AVX512F
func BroadcastFloat32x16(x float32) Float32x16 {
var z Float32x4
- return z.SetElem(0, x).Broadcast512()
+ return z.SetElem(0, x).Broadcast1To16()
}
// BroadcastFloat64x8 returns a vector with the input
@@ -271,7 +271,7 @@ func BroadcastFloat32x16(x float32) Float32x16 {
// Emulated, CPU Feature: AVX512F
func BroadcastFloat64x8(x float64) Float64x8 {
var z Float64x2
- return z.SetElem(0, x).Broadcast512()
+ return z.SetElem(0, x).Broadcast1To8()
}
// ToMask converts from Int8x16 to Mask8x16, mask element is set to true when the corresponding vector element is non-zero.