From d1e7f49e3d1eb039d9d2aed3ba923459bd42aa7c Mon Sep 17 00:00:00 2001 From: jjpinto Date: Tue, 6 Jan 2026 22:47:07 +0000 Subject: internal/trace: fix recorder.Write return value for header-only buffers Fix issue #77083 Change-Id: I9189d1e3a6efea8478224164e820f50c818abcd5 GitHub-Last-Rev: bb24cbda95f0b5b10aeae9a5ee8cbe215ba6d4eb GitHub-Pull-Request: golang/go#77092 Reviewed-on: https://go-review.googlesource.com/c/go/+/734300 Reviewed-by: Michael Pratt Reviewed-by: Michael Knyszek Commit-Queue: Michael Knyszek LUCI-TryBot-Result: Go LUCI --- src/runtime/trace/recorder.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/trace/recorder.go b/src/runtime/trace/recorder.go index 4f2d3aa92a..a18d764141 100644 --- a/src/runtime/trace/recorder.go +++ b/src/runtime/trace/recorder.go @@ -39,7 +39,7 @@ func (w *recorder) Write(b []byte) (n int, err error) { w.headerReceived = true } if len(b) == n { - return 0, nil + return n, nil } ba, nb, err := readBatch(b[n:]) // Every write from the runtime is guaranteed to be a complete batch. if err != nil { -- cgit v1.3 From 874d8b98eba8129559b97d2fdfa02ddeb88b95f9 Mon Sep 17 00:00:00 2001 From: Michael Matloob Date: Tue, 6 Jan 2026 17:18:48 -0500 Subject: cmd/go/internal/work: decrement concurrentProcesses when action finishes This fixes a bug where we only incremented concurrentProcesses but never decremented it, causing us to run out of tokens and give all compiles -c=1 after a point. 
Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64_c2s16-perf_vs_parent,gotip-linux-amd64_c3h88-perf_vs_parent,gotip-linux-arm64_c4ah72-perf_vs_parent,gotip-linux-arm64_c4as16-perf_vs_parent Change-Id: I41f4c1edb77004cbc1772d6d672045946a6a6964 Reviewed-on: https://go-review.googlesource.com/c/go/+/734260 Reviewed-by: David Chase Reviewed-by: Cherry Mui Reviewed-by: Michael Matloob TryBot-Bypass: Michael Matloob --- src/cmd/go/internal/work/exec.go | 5 +++++ src/cmd/go/internal/work/gc.go | 3 +++ 2 files changed, 8 insertions(+) diff --git a/src/cmd/go/internal/work/exec.go b/src/cmd/go/internal/work/exec.go index 654e9e9374..f2d1b1040b 100644 --- a/src/cmd/go/internal/work/exec.go +++ b/src/cmd/go/internal/work/exec.go @@ -248,6 +248,11 @@ func (b *Builder) Do(ctx context.Context, root *Action) { wg.Wait() + if tokens != totalTokens || concurrentProcesses != 0 { + base.Fatalf("internal error: tokens not restored at end of build: tokens: %d, totalTokens: %d, concurrentProcesses: %d", + tokens, totalTokens, concurrentProcesses) + } + // Write action graph again, this time with timing information. writeActionGraph() } diff --git a/src/cmd/go/internal/work/gc.go b/src/cmd/go/internal/work/gc.go index 9a5e6c924c..fc74715f22 100644 --- a/src/cmd/go/internal/work/gc.go +++ b/src/cmd/go/internal/work/gc.go @@ -227,6 +227,7 @@ func compilerConcurrency() (int, func()) { return c, func() { tokensMu.Lock() defer tokensMu.Unlock() + concurrentProcesses-- tokens += c } } @@ -235,6 +236,7 @@ var maxCompilerConcurrency = runtime.GOMAXPROCS(0) // max value we will use for var ( tokensMu sync.Mutex + totalTokens int // total number of tokens: this is used for checking that we get them all back in the end tokens int // number of available tokens concurrentProcesses int // number of currently running compiles ) @@ -246,6 +248,7 @@ func initCompilerConcurrencyPool() { // than what it was when we capped the concurrency to 4. 
oldConcurrencyCap := min(4, maxCompilerConcurrency) tokens = oldConcurrencyCap * cfg.BuildP + totalTokens = tokens } // trimpath returns the -trimpath argument to use -- cgit v1.3 From 28147b528312055b535c6a69d0d4492bd502e1b0 Mon Sep 17 00:00:00 2001 From: Michael Matloob Date: Mon, 5 Jan 2026 12:23:14 -0500 Subject: cmd/go: guarantee a minimum of min(4,GOMAXPROCS) to compile -c To allow this, we also increase the size of the pool to allow the minimum number for each action, with an extra 2*GOMAXPROCS number of tokens to boost -c when there are fewer concurrently running actions. That means the pool will now have the size 6*GOMAXPROCS instead of the previous 4*GOMAXPROCS. The goal is to maintain the boosting behavior added by the pool, while guarding from starving compiles when there are too few tokens left, so that the value of -c is always at least min(4,GOMAXPROCS), which is what it was set to before Go 1.26. Cq-Include-Trybots: luci.golang.try:gotip-linux-arm64_c4as16-perf_vs_parent,gotip-linux-arm64_c4ah72-perf_vs_parent,gotip-linux-amd64_c3h88-perf_vs_parent,gotip-linux-amd64_c2s16-perf_vs_parent Change-Id: I113a38584514a6c025d3d1bc727ff8d86a6a6964 Reviewed-on: https://go-review.googlesource.com/c/go/+/734040 Commit-Queue: Michael Matloob Reviewed-by: Cherry Mui TryBot-Bypass: Michael Matloob Reviewed-by: Michael Matloob --- src/cmd/go/internal/work/gc.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/cmd/go/internal/work/gc.go b/src/cmd/go/internal/work/gc.go index fc74715f22..6300a9135b 100644 --- a/src/cmd/go/internal/work/gc.go +++ b/src/cmd/go/internal/work/gc.go @@ -217,11 +217,11 @@ func compilerConcurrency() (int, func()) { concurrentProcesses++ // Set aside tokens so that we don't run out if we were running cfg.BuildP concurrent compiles. // We'll set aside one token for each of the action goroutines that aren't currently running a compile. 
- setAside := cfg.BuildP - concurrentProcesses + setAside := (cfg.BuildP - concurrentProcesses) * minTokens availableTokens := tokens - setAside - // Grab half the remaining tokens: but with a floor of at least 1 token, and + // Grab half the remaining tokens: but with a floor of at least minTokens token, and // a ceiling of the max backend concurrency. - c := max(min(availableTokens/2, maxCompilerConcurrency), 1) + c := max(min(availableTokens/2, maxCompilerConcurrency), minTokens) tokens -= c // Successfully grabbed the tokens. return c, func() { @@ -239,15 +239,18 @@ var ( totalTokens int // total number of tokens: this is used for checking that we get them all back in the end tokens int // number of available tokens concurrentProcesses int // number of currently running compiles + minTokens int // minimum number of tokens to give out ) // initCompilerConcurrencyPool sets the number of tokens in the pool. It needs // to be run after init, so that it can use the value of cfg.BuildP. func initCompilerConcurrencyPool() { - // Size the pool so that the worst case total number of compiles is not more - // than what it was when we capped the concurrency to 4. - oldConcurrencyCap := min(4, maxCompilerConcurrency) - tokens = oldConcurrencyCap * cfg.BuildP + // Size the pool to allow 2*maxCompilerConcurrency extra tokens to + // be distributed amongst the compile actions in addition to the minimum + // of min(4,GOMAXPROCS) tokens for each of the potentially cfg.BuildP + // concurrently running compile actions. + minTokens = min(4, maxCompilerConcurrency) + tokens = 2*maxCompilerConcurrency + minTokens*cfg.BuildP totalTokens = tokens } -- cgit v1.3 From 5facb3b24b1c388176572eb95239f94d6ed4017d Mon Sep 17 00:00:00 2001 From: Mark Freeman Date: Wed, 7 Jan 2026 16:40:53 -0500 Subject: internal/types: add test for cycles in value context Exposition is also added to outline a difference between syntax which can / cannot produce values of incomplete types. 
For us to enforce non-nilness of type RHS and remove the pending type mechanism, I suspect we would need to add completeness guards to the syntax which *can*. Enforcing non-nilness of type RHS currently breaks the below test cases, but I suspect that is simply an implementation artifact. In other words, they just call Underlying at a bad time. - T0 - T3 - T6 / T7 - T10 - T12 If we also remove pendingType, all of these test cases break; again, we would need guards in the appropriate syntax logic. Change-Id: Ibe22042232e542de1d38b923dd1d5cc50dce08cb Reviewed-on: https://go-review.googlesource.com/c/go/+/734600 TryBot-Bypass: Mark Freeman Reviewed-by: Robert Griesemer Auto-Submit: Mark Freeman --- src/internal/types/testdata/check/cycles6.go | 71 ++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 src/internal/types/testdata/check/cycles6.go diff --git a/src/internal/types/testdata/check/cycles6.go b/src/internal/types/testdata/check/cycles6.go new file mode 100644 index 0000000000..e5635ed456 --- /dev/null +++ b/src/internal/types/testdata/check/cycles6.go @@ -0,0 +1,71 @@ +// Copyright 2026 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package p + +import "unsafe" + +// Below are the pieces of syntax corresponding to functions which can produce a +// type T without first having a value of type T. Notice that each causes a +// value of type T to be passed to unsafe.Sizeof while T is incomplete. 
+ +// literal on type +type T0 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(T0{})]int +// literal on value (not applicable) +// literal on pointer (not applicable) + +// call on type +type T1 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(T1(42))]int +// call on value +func f2() T2 +type T2 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(f2())]int +// call on pointer (not applicable) + +// assert on type +var i3 interface{} +type T3 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(i3.(T3))]int +// assert on value (not applicable) +// assert on pointer (not applicable) + +// receive on type (not applicable) +// receive on value +func f4() <-chan T4 +type T4 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(<-f4())]int +// receive on pointer (not applicable) + +// star on type (not applicable) +// star on value (not applicable) +// star on pointer +func f5() *T5 +type T5 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(*f5())]int + +// Below is additional syntax which interacts with incomplete types. Notice that +// each of the below falls into 1 of 3 cases: +// 1. It cannot produce a value of (incomplete) type T. +// 2. It can, but only because it already has a value of type T. +// 3. It can, but only because it performs an implicit dereference. 
+ +// select on type (case 1) +// select on value (case 2) +type T6 /* ERROR "invalid recursive type" */ struct { + f T7 +} +type T7 [unsafe.Sizeof(T6{}.f)]int +// select on pointer (case 3) +type T8 /* ERROR "invalid recursive type" */ struct { + f T9 +} +type T9 [unsafe.Sizeof(new(T8).f)]int + +// slice on type (not applicable) +// slice on value (case 2) +type T10 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(T10{}[:])]int +// slice on pointer (case 3) +type T11 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(new(T11)[:])]int + +// index on type (case 1) +// index on value (case 2) +type T12 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(T12{}[42])]int +// index on pointer (case 3) +type T13 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(new(T13)[42])]int -- cgit v1.3 From 8ac4477d83672af8c3d39399685731ee6b81ce2f Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Thu, 8 Jan 2026 11:57:28 -0500 Subject: simd/archsimd: rename Broadcast methods Currently the Broadcast128/256/512 methods broadcast the lowest element of the input vector to a vector of the corresponding width. There are also variations of broadcast operations that broadcast the whole (128- or 256-bit) vector to a larger vector, which we don't yet support. Our current naming is unclear which version it is, though. Rename the current ones to Broadcast1ToN, to be clear that they broadcast one element. The vector version probably will be named BroadcastAllToN (not included in this CL). 
Change-Id: I47a21e367f948ec0b578d63706a40d20f5a9f46d Reviewed-on: https://go-review.googlesource.com/c/go/+/734840 LUCI-TryBot-Result: Go LUCI Reviewed-by: Junyang Shao --- src/cmd/compile/internal/amd64/simdssa.go | 88 ++++---- src/cmd/compile/internal/ssa/_gen/simdAMD64.rules | 82 +++---- .../compile/internal/ssa/_gen/simdgenericOps.go | 60 +++--- src/cmd/compile/internal/ssa/opGen.go | 120 +++++------ src/cmd/compile/internal/ssa/rewriteAMD64.go | 112 +++++----- src/cmd/compile/internal/ssagen/simdintrinsics.go | 60 +++--- .../_gen/simdgen/ops/Moves/categories.yaml | 33 ++- src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml | 79 ++++--- src/simd/archsimd/_gen/tmplgen/main.go | 2 +- src/simd/archsimd/ops_amd64.go | 240 +++++++++++---------- src/simd/archsimd/other_gen_amd64.go | 60 +++--- 11 files changed, 490 insertions(+), 446 deletions(-) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index c4d0fd69c6..a028cbe86d 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -25,23 +25,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPABSQ128, ssa.OpAMD64VPABSQ256, ssa.OpAMD64VPABSQ512, - ssa.OpAMD64VBROADCASTSS128, ssa.OpAMD64VPBROADCASTQ128, - ssa.OpAMD64VPBROADCASTB128, - ssa.OpAMD64VPBROADCASTW128, + ssa.OpAMD64VBROADCASTSS128, + ssa.OpAMD64VBROADCASTSD256, ssa.OpAMD64VPBROADCASTD128, + ssa.OpAMD64VPBROADCASTQ256, ssa.OpAMD64VBROADCASTSS256, - ssa.OpAMD64VBROADCASTSD256, - ssa.OpAMD64VPBROADCASTB256, - ssa.OpAMD64VPBROADCASTW256, + ssa.OpAMD64VBROADCASTSD512, + ssa.OpAMD64VPBROADCASTW128, ssa.OpAMD64VPBROADCASTD256, - ssa.OpAMD64VPBROADCASTQ256, + ssa.OpAMD64VPBROADCASTQ512, ssa.OpAMD64VBROADCASTSS512, - ssa.OpAMD64VBROADCASTSD512, - ssa.OpAMD64VPBROADCASTB512, - ssa.OpAMD64VPBROADCASTW512, + ssa.OpAMD64VPBROADCASTB128, + ssa.OpAMD64VPBROADCASTW256, ssa.OpAMD64VPBROADCASTD512, - ssa.OpAMD64VPBROADCASTQ512, + ssa.OpAMD64VPBROADCASTB256, + 
ssa.OpAMD64VPBROADCASTW512, + ssa.OpAMD64VPBROADCASTB512, ssa.OpAMD64VCVTPD2PSX128, ssa.OpAMD64VCVTPD2PSY128, ssa.OpAMD64VCVTPD2PS256, @@ -832,23 +832,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPABSQMasked128, ssa.OpAMD64VPABSQMasked256, ssa.OpAMD64VPABSQMasked512, - ssa.OpAMD64VBROADCASTSSMasked128, ssa.OpAMD64VPBROADCASTQMasked128, - ssa.OpAMD64VPBROADCASTBMasked128, - ssa.OpAMD64VPBROADCASTWMasked128, + ssa.OpAMD64VBROADCASTSSMasked128, + ssa.OpAMD64VBROADCASTSDMasked256, ssa.OpAMD64VPBROADCASTDMasked128, + ssa.OpAMD64VPBROADCASTQMasked256, ssa.OpAMD64VBROADCASTSSMasked256, - ssa.OpAMD64VBROADCASTSDMasked256, - ssa.OpAMD64VPBROADCASTBMasked256, - ssa.OpAMD64VPBROADCASTWMasked256, + ssa.OpAMD64VBROADCASTSDMasked512, + ssa.OpAMD64VPBROADCASTWMasked128, ssa.OpAMD64VPBROADCASTDMasked256, - ssa.OpAMD64VPBROADCASTQMasked256, + ssa.OpAMD64VPBROADCASTQMasked512, ssa.OpAMD64VBROADCASTSSMasked512, - ssa.OpAMD64VBROADCASTSDMasked512, - ssa.OpAMD64VPBROADCASTBMasked512, - ssa.OpAMD64VPBROADCASTWMasked512, + ssa.OpAMD64VPBROADCASTBMasked128, + ssa.OpAMD64VPBROADCASTWMasked256, ssa.OpAMD64VPBROADCASTDMasked512, - ssa.OpAMD64VPBROADCASTQMasked512, + ssa.OpAMD64VPBROADCASTBMasked256, + ssa.OpAMD64VPBROADCASTWMasked512, + ssa.OpAMD64VPBROADCASTBMasked512, ssa.OpAMD64VCOMPRESSPSMasked128, ssa.OpAMD64VCOMPRESSPSMasked256, ssa.OpAMD64VCOMPRESSPSMasked512, @@ -2460,23 +2460,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPABSQMasked128Merging, ssa.OpAMD64VPABSQMasked256Merging, ssa.OpAMD64VPABSQMasked512Merging, - ssa.OpAMD64VBROADCASTSSMasked128Merging, ssa.OpAMD64VPBROADCASTQMasked128Merging, - ssa.OpAMD64VPBROADCASTBMasked128Merging, - ssa.OpAMD64VPBROADCASTWMasked128Merging, + ssa.OpAMD64VBROADCASTSSMasked128Merging, + ssa.OpAMD64VBROADCASTSDMasked256Merging, ssa.OpAMD64VPBROADCASTDMasked128Merging, + ssa.OpAMD64VPBROADCASTQMasked256Merging, ssa.OpAMD64VBROADCASTSSMasked256Merging, - 
ssa.OpAMD64VBROADCASTSDMasked256Merging, - ssa.OpAMD64VPBROADCASTBMasked256Merging, - ssa.OpAMD64VPBROADCASTWMasked256Merging, + ssa.OpAMD64VBROADCASTSDMasked512Merging, + ssa.OpAMD64VPBROADCASTWMasked128Merging, ssa.OpAMD64VPBROADCASTDMasked256Merging, - ssa.OpAMD64VPBROADCASTQMasked256Merging, + ssa.OpAMD64VPBROADCASTQMasked512Merging, ssa.OpAMD64VBROADCASTSSMasked512Merging, - ssa.OpAMD64VBROADCASTSDMasked512Merging, - ssa.OpAMD64VPBROADCASTBMasked512Merging, - ssa.OpAMD64VPBROADCASTWMasked512Merging, + ssa.OpAMD64VPBROADCASTBMasked128Merging, + ssa.OpAMD64VPBROADCASTWMasked256Merging, ssa.OpAMD64VPBROADCASTDMasked512Merging, - ssa.OpAMD64VPBROADCASTQMasked512Merging, + ssa.OpAMD64VPBROADCASTBMasked256Merging, + ssa.OpAMD64VPBROADCASTWMasked512Merging, + ssa.OpAMD64VPBROADCASTBMasked512Merging, ssa.OpAMD64VRNDSCALEPSMasked128Merging, ssa.OpAMD64VRNDSCALEPSMasked256Merging, ssa.OpAMD64VRNDSCALEPSMasked512Merging, @@ -2817,23 +2817,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPAVGWMasked128, ssa.OpAMD64VPAVGWMasked256, ssa.OpAMD64VPAVGWMasked512, - ssa.OpAMD64VBROADCASTSSMasked128, ssa.OpAMD64VPBROADCASTQMasked128, - ssa.OpAMD64VPBROADCASTBMasked128, - ssa.OpAMD64VPBROADCASTWMasked128, + ssa.OpAMD64VBROADCASTSSMasked128, + ssa.OpAMD64VBROADCASTSDMasked256, ssa.OpAMD64VPBROADCASTDMasked128, + ssa.OpAMD64VPBROADCASTQMasked256, ssa.OpAMD64VBROADCASTSSMasked256, - ssa.OpAMD64VBROADCASTSDMasked256, - ssa.OpAMD64VPBROADCASTBMasked256, - ssa.OpAMD64VPBROADCASTWMasked256, + ssa.OpAMD64VBROADCASTSDMasked512, + ssa.OpAMD64VPBROADCASTWMasked128, ssa.OpAMD64VPBROADCASTDMasked256, - ssa.OpAMD64VPBROADCASTQMasked256, + ssa.OpAMD64VPBROADCASTQMasked512, ssa.OpAMD64VBROADCASTSSMasked512, - ssa.OpAMD64VBROADCASTSDMasked512, - ssa.OpAMD64VPBROADCASTBMasked512, - ssa.OpAMD64VPBROADCASTWMasked512, + ssa.OpAMD64VPBROADCASTBMasked128, + ssa.OpAMD64VPBROADCASTWMasked256, ssa.OpAMD64VPBROADCASTDMasked512, - ssa.OpAMD64VPBROADCASTQMasked512, + 
ssa.OpAMD64VPBROADCASTBMasked256, + ssa.OpAMD64VPBROADCASTWMasked512, + ssa.OpAMD64VPBROADCASTBMasked512, ssa.OpAMD64VRNDSCALEPSMasked128, ssa.OpAMD64VRNDSCALEPSMasked128load, ssa.OpAMD64VRNDSCALEPSMasked256, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 5c83f39a1f..799461610d 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -140,36 +140,36 @@ (AverageUint16x8 ...) => (VPAVGW128 ...) (AverageUint16x16 ...) => (VPAVGW256 ...) (AverageUint16x32 ...) => (VPAVGW512 ...) -(Broadcast128Float32x4 ...) => (VBROADCASTSS128 ...) -(Broadcast128Float64x2 ...) => (VPBROADCASTQ128 ...) -(Broadcast128Int8x16 ...) => (VPBROADCASTB128 ...) -(Broadcast128Int16x8 ...) => (VPBROADCASTW128 ...) -(Broadcast128Int32x4 ...) => (VPBROADCASTD128 ...) -(Broadcast128Int64x2 ...) => (VPBROADCASTQ128 ...) -(Broadcast128Uint8x16 ...) => (VPBROADCASTB128 ...) -(Broadcast128Uint16x8 ...) => (VPBROADCASTW128 ...) -(Broadcast128Uint32x4 ...) => (VPBROADCASTD128 ...) -(Broadcast128Uint64x2 ...) => (VPBROADCASTQ128 ...) -(Broadcast256Float32x4 ...) => (VBROADCASTSS256 ...) -(Broadcast256Float64x2 ...) => (VBROADCASTSD256 ...) -(Broadcast256Int8x16 ...) => (VPBROADCASTB256 ...) -(Broadcast256Int16x8 ...) => (VPBROADCASTW256 ...) -(Broadcast256Int32x4 ...) => (VPBROADCASTD256 ...) -(Broadcast256Int64x2 ...) => (VPBROADCASTQ256 ...) -(Broadcast256Uint8x16 ...) => (VPBROADCASTB256 ...) -(Broadcast256Uint16x8 ...) => (VPBROADCASTW256 ...) -(Broadcast256Uint32x4 ...) => (VPBROADCASTD256 ...) -(Broadcast256Uint64x2 ...) => (VPBROADCASTQ256 ...) -(Broadcast512Float32x4 ...) => (VBROADCASTSS512 ...) -(Broadcast512Float64x2 ...) => (VBROADCASTSD512 ...) -(Broadcast512Int8x16 ...) => (VPBROADCASTB512 ...) -(Broadcast512Int16x8 ...) => (VPBROADCASTW512 ...) -(Broadcast512Int32x4 ...) => (VPBROADCASTD512 ...) -(Broadcast512Int64x2 ...) => (VPBROADCASTQ512 ...) 
-(Broadcast512Uint8x16 ...) => (VPBROADCASTB512 ...) -(Broadcast512Uint16x8 ...) => (VPBROADCASTW512 ...) -(Broadcast512Uint32x4 ...) => (VPBROADCASTD512 ...) -(Broadcast512Uint64x2 ...) => (VPBROADCASTQ512 ...) +(Broadcast1To2Float64x2 ...) => (VPBROADCASTQ128 ...) +(Broadcast1To2Int64x2 ...) => (VPBROADCASTQ128 ...) +(Broadcast1To2Uint64x2 ...) => (VPBROADCASTQ128 ...) +(Broadcast1To4Float32x4 ...) => (VBROADCASTSS128 ...) +(Broadcast1To4Float64x2 ...) => (VBROADCASTSD256 ...) +(Broadcast1To4Int32x4 ...) => (VPBROADCASTD128 ...) +(Broadcast1To4Int64x2 ...) => (VPBROADCASTQ256 ...) +(Broadcast1To4Uint32x4 ...) => (VPBROADCASTD128 ...) +(Broadcast1To4Uint64x2 ...) => (VPBROADCASTQ256 ...) +(Broadcast1To8Float32x4 ...) => (VBROADCASTSS256 ...) +(Broadcast1To8Float64x2 ...) => (VBROADCASTSD512 ...) +(Broadcast1To8Int16x8 ...) => (VPBROADCASTW128 ...) +(Broadcast1To8Int32x4 ...) => (VPBROADCASTD256 ...) +(Broadcast1To8Int64x2 ...) => (VPBROADCASTQ512 ...) +(Broadcast1To8Uint16x8 ...) => (VPBROADCASTW128 ...) +(Broadcast1To8Uint32x4 ...) => (VPBROADCASTD256 ...) +(Broadcast1To8Uint64x2 ...) => (VPBROADCASTQ512 ...) +(Broadcast1To16Float32x4 ...) => (VBROADCASTSS512 ...) +(Broadcast1To16Int8x16 ...) => (VPBROADCASTB128 ...) +(Broadcast1To16Int16x8 ...) => (VPBROADCASTW256 ...) +(Broadcast1To16Int32x4 ...) => (VPBROADCASTD512 ...) +(Broadcast1To16Uint8x16 ...) => (VPBROADCASTB128 ...) +(Broadcast1To16Uint16x8 ...) => (VPBROADCASTW256 ...) +(Broadcast1To16Uint32x4 ...) => (VPBROADCASTD512 ...) +(Broadcast1To32Int8x16 ...) => (VPBROADCASTB256 ...) +(Broadcast1To32Int16x8 ...) => (VPBROADCASTW512 ...) +(Broadcast1To32Uint8x16 ...) => (VPBROADCASTB256 ...) +(Broadcast1To32Uint16x8 ...) => (VPBROADCASTW512 ...) +(Broadcast1To64Int8x16 ...) => (VPBROADCASTB512 ...) +(Broadcast1To64Uint8x16 ...) => (VPBROADCASTB512 ...) 
(CeilFloat32x4 x) => (VROUNDPS128 [2] x) (CeilFloat32x8 x) => (VROUNDPS256 [2] x) (CeilFloat64x2 x) => (VROUNDPD128 [2] x) @@ -1424,23 +1424,23 @@ (VMOVDQU16Masked128 (VPAVGW128 x y) mask) => (VPAVGWMasked128 x y mask) (VMOVDQU16Masked256 (VPAVGW256 x y) mask) => (VPAVGWMasked256 x y mask) (VMOVDQU16Masked512 (VPAVGW512 x y) mask) => (VPAVGWMasked512 x y mask) -(VMOVDQU32Masked128 (VBROADCASTSS128 x) mask) => (VBROADCASTSSMasked128 x mask) (VMOVDQU64Masked128 (VPBROADCASTQ128 x) mask) => (VPBROADCASTQMasked128 x mask) -(VMOVDQU8Masked128 (VPBROADCASTB128 x) mask) => (VPBROADCASTBMasked128 x mask) -(VMOVDQU16Masked128 (VPBROADCASTW128 x) mask) => (VPBROADCASTWMasked128 x mask) +(VMOVDQU32Masked128 (VBROADCASTSS128 x) mask) => (VBROADCASTSSMasked128 x mask) +(VMOVDQU64Masked256 (VBROADCASTSD256 x) mask) => (VBROADCASTSDMasked256 x mask) (VMOVDQU32Masked128 (VPBROADCASTD128 x) mask) => (VPBROADCASTDMasked128 x mask) +(VMOVDQU64Masked256 (VPBROADCASTQ256 x) mask) => (VPBROADCASTQMasked256 x mask) (VMOVDQU32Masked256 (VBROADCASTSS256 x) mask) => (VBROADCASTSSMasked256 x mask) -(VMOVDQU64Masked256 (VBROADCASTSD256 x) mask) => (VBROADCASTSDMasked256 x mask) -(VMOVDQU8Masked256 (VPBROADCASTB256 x) mask) => (VPBROADCASTBMasked256 x mask) -(VMOVDQU16Masked256 (VPBROADCASTW256 x) mask) => (VPBROADCASTWMasked256 x mask) +(VMOVDQU64Masked512 (VBROADCASTSD512 x) mask) => (VBROADCASTSDMasked512 x mask) +(VMOVDQU16Masked128 (VPBROADCASTW128 x) mask) => (VPBROADCASTWMasked128 x mask) (VMOVDQU32Masked256 (VPBROADCASTD256 x) mask) => (VPBROADCASTDMasked256 x mask) -(VMOVDQU64Masked256 (VPBROADCASTQ256 x) mask) => (VPBROADCASTQMasked256 x mask) +(VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask) => (VPBROADCASTQMasked512 x mask) (VMOVDQU32Masked512 (VBROADCASTSS512 x) mask) => (VBROADCASTSSMasked512 x mask) -(VMOVDQU64Masked512 (VBROADCASTSD512 x) mask) => (VBROADCASTSDMasked512 x mask) -(VMOVDQU8Masked512 (VPBROADCASTB512 x) mask) => (VPBROADCASTBMasked512 x mask) -(VMOVDQU16Masked512 
(VPBROADCASTW512 x) mask) => (VPBROADCASTWMasked512 x mask) +(VMOVDQU8Masked128 (VPBROADCASTB128 x) mask) => (VPBROADCASTBMasked128 x mask) +(VMOVDQU16Masked256 (VPBROADCASTW256 x) mask) => (VPBROADCASTWMasked256 x mask) (VMOVDQU32Masked512 (VPBROADCASTD512 x) mask) => (VPBROADCASTDMasked512 x mask) -(VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask) => (VPBROADCASTQMasked512 x mask) +(VMOVDQU8Masked256 (VPBROADCASTB256 x) mask) => (VPBROADCASTBMasked256 x mask) +(VMOVDQU16Masked512 (VPBROADCASTW512 x) mask) => (VPBROADCASTWMasked512 x mask) +(VMOVDQU8Masked512 (VPBROADCASTB512 x) mask) => (VPBROADCASTBMasked512 x mask) (VMOVDQU32Masked128 (VRNDSCALEPS128 [a] x) mask) => (VRNDSCALEPSMasked128 [a] x mask) (VMOVDQU32Masked256 (VRNDSCALEPS256 [a] x) mask) => (VRNDSCALEPSMasked256 [a] x mask) (VMOVDQU32Masked512 (VRNDSCALEPS512 [a] x) mask) => (VRNDSCALEPSMasked512 [a] x mask) diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 889ab0d84f..ff863a389f 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -143,36 +143,36 @@ func simdGenericOps() []opData { {name: "AverageUint16x8", argLength: 2, commutative: true}, {name: "AverageUint16x16", argLength: 2, commutative: true}, {name: "AverageUint16x32", argLength: 2, commutative: true}, - {name: "Broadcast128Float32x4", argLength: 1, commutative: false}, - {name: "Broadcast128Float64x2", argLength: 1, commutative: false}, - {name: "Broadcast128Int8x16", argLength: 1, commutative: false}, - {name: "Broadcast128Int16x8", argLength: 1, commutative: false}, - {name: "Broadcast128Int32x4", argLength: 1, commutative: false}, - {name: "Broadcast128Int64x2", argLength: 1, commutative: false}, - {name: "Broadcast128Uint8x16", argLength: 1, commutative: false}, - {name: "Broadcast128Uint16x8", argLength: 1, commutative: false}, - {name: "Broadcast128Uint32x4", argLength: 1, commutative: false}, 
- {name: "Broadcast128Uint64x2", argLength: 1, commutative: false}, - {name: "Broadcast256Float32x4", argLength: 1, commutative: false}, - {name: "Broadcast256Float64x2", argLength: 1, commutative: false}, - {name: "Broadcast256Int8x16", argLength: 1, commutative: false}, - {name: "Broadcast256Int16x8", argLength: 1, commutative: false}, - {name: "Broadcast256Int32x4", argLength: 1, commutative: false}, - {name: "Broadcast256Int64x2", argLength: 1, commutative: false}, - {name: "Broadcast256Uint8x16", argLength: 1, commutative: false}, - {name: "Broadcast256Uint16x8", argLength: 1, commutative: false}, - {name: "Broadcast256Uint32x4", argLength: 1, commutative: false}, - {name: "Broadcast256Uint64x2", argLength: 1, commutative: false}, - {name: "Broadcast512Float32x4", argLength: 1, commutative: false}, - {name: "Broadcast512Float64x2", argLength: 1, commutative: false}, - {name: "Broadcast512Int8x16", argLength: 1, commutative: false}, - {name: "Broadcast512Int16x8", argLength: 1, commutative: false}, - {name: "Broadcast512Int32x4", argLength: 1, commutative: false}, - {name: "Broadcast512Int64x2", argLength: 1, commutative: false}, - {name: "Broadcast512Uint8x16", argLength: 1, commutative: false}, - {name: "Broadcast512Uint16x8", argLength: 1, commutative: false}, - {name: "Broadcast512Uint32x4", argLength: 1, commutative: false}, - {name: "Broadcast512Uint64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To2Float64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To2Int64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To2Uint64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To4Float32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To4Float64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To4Int32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To4Int64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To4Uint32x4", argLength: 1, commutative: false}, + {name: 
"Broadcast1To4Uint64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To8Float32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To8Float64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To8Int16x8", argLength: 1, commutative: false}, + {name: "Broadcast1To8Int32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To8Int64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To8Uint16x8", argLength: 1, commutative: false}, + {name: "Broadcast1To8Uint32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To8Uint64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To16Float32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To16Int8x16", argLength: 1, commutative: false}, + {name: "Broadcast1To16Int16x8", argLength: 1, commutative: false}, + {name: "Broadcast1To16Int32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To16Uint8x16", argLength: 1, commutative: false}, + {name: "Broadcast1To16Uint16x8", argLength: 1, commutative: false}, + {name: "Broadcast1To16Uint32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To32Int8x16", argLength: 1, commutative: false}, + {name: "Broadcast1To32Int16x8", argLength: 1, commutative: false}, + {name: "Broadcast1To32Uint8x16", argLength: 1, commutative: false}, + {name: "Broadcast1To32Uint16x8", argLength: 1, commutative: false}, + {name: "Broadcast1To64Int8x16", argLength: 1, commutative: false}, + {name: "Broadcast1To64Uint8x16", argLength: 1, commutative: false}, {name: "CeilFloat32x4", argLength: 1, commutative: false}, {name: "CeilFloat32x8", argLength: 1, commutative: false}, {name: "CeilFloat64x2", argLength: 1, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 7b70dc2686..9e5fdb1fc1 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -6309,36 +6309,36 @@ const ( OpAverageUint16x8 OpAverageUint16x16 OpAverageUint16x32 - 
OpBroadcast128Float32x4 - OpBroadcast128Float64x2 - OpBroadcast128Int8x16 - OpBroadcast128Int16x8 - OpBroadcast128Int32x4 - OpBroadcast128Int64x2 - OpBroadcast128Uint8x16 - OpBroadcast128Uint16x8 - OpBroadcast128Uint32x4 - OpBroadcast128Uint64x2 - OpBroadcast256Float32x4 - OpBroadcast256Float64x2 - OpBroadcast256Int8x16 - OpBroadcast256Int16x8 - OpBroadcast256Int32x4 - OpBroadcast256Int64x2 - OpBroadcast256Uint8x16 - OpBroadcast256Uint16x8 - OpBroadcast256Uint32x4 - OpBroadcast256Uint64x2 - OpBroadcast512Float32x4 - OpBroadcast512Float64x2 - OpBroadcast512Int8x16 - OpBroadcast512Int16x8 - OpBroadcast512Int32x4 - OpBroadcast512Int64x2 - OpBroadcast512Uint8x16 - OpBroadcast512Uint16x8 - OpBroadcast512Uint32x4 - OpBroadcast512Uint64x2 + OpBroadcast1To2Float64x2 + OpBroadcast1To2Int64x2 + OpBroadcast1To2Uint64x2 + OpBroadcast1To4Float32x4 + OpBroadcast1To4Float64x2 + OpBroadcast1To4Int32x4 + OpBroadcast1To4Int64x2 + OpBroadcast1To4Uint32x4 + OpBroadcast1To4Uint64x2 + OpBroadcast1To8Float32x4 + OpBroadcast1To8Float64x2 + OpBroadcast1To8Int16x8 + OpBroadcast1To8Int32x4 + OpBroadcast1To8Int64x2 + OpBroadcast1To8Uint16x8 + OpBroadcast1To8Uint32x4 + OpBroadcast1To8Uint64x2 + OpBroadcast1To16Float32x4 + OpBroadcast1To16Int8x16 + OpBroadcast1To16Int16x8 + OpBroadcast1To16Int32x4 + OpBroadcast1To16Uint8x16 + OpBroadcast1To16Uint16x8 + OpBroadcast1To16Uint32x4 + OpBroadcast1To32Int8x16 + OpBroadcast1To32Int16x8 + OpBroadcast1To32Uint8x16 + OpBroadcast1To32Uint16x8 + OpBroadcast1To64Int8x16 + OpBroadcast1To64Uint8x16 OpCeilFloat32x4 OpCeilFloat32x8 OpCeilFloat64x2 @@ -89875,152 +89875,152 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Broadcast128Float32x4", + name: "Broadcast1To2Float64x2", argLen: 1, generic: true, }, { - name: "Broadcast128Float64x2", + name: "Broadcast1To2Int64x2", argLen: 1, generic: true, }, { - name: "Broadcast128Int8x16", + name: "Broadcast1To2Uint64x2", argLen: 1, generic: true, }, { - name: "Broadcast128Int16x8", + name: 
"Broadcast1To4Float32x4", argLen: 1, generic: true, }, { - name: "Broadcast128Int32x4", + name: "Broadcast1To4Float64x2", argLen: 1, generic: true, }, { - name: "Broadcast128Int64x2", + name: "Broadcast1To4Int32x4", argLen: 1, generic: true, }, { - name: "Broadcast128Uint8x16", + name: "Broadcast1To4Int64x2", argLen: 1, generic: true, }, { - name: "Broadcast128Uint16x8", + name: "Broadcast1To4Uint32x4", argLen: 1, generic: true, }, { - name: "Broadcast128Uint32x4", + name: "Broadcast1To4Uint64x2", argLen: 1, generic: true, }, { - name: "Broadcast128Uint64x2", + name: "Broadcast1To8Float32x4", argLen: 1, generic: true, }, { - name: "Broadcast256Float32x4", + name: "Broadcast1To8Float64x2", argLen: 1, generic: true, }, { - name: "Broadcast256Float64x2", + name: "Broadcast1To8Int16x8", argLen: 1, generic: true, }, { - name: "Broadcast256Int8x16", + name: "Broadcast1To8Int32x4", argLen: 1, generic: true, }, { - name: "Broadcast256Int16x8", + name: "Broadcast1To8Int64x2", argLen: 1, generic: true, }, { - name: "Broadcast256Int32x4", + name: "Broadcast1To8Uint16x8", argLen: 1, generic: true, }, { - name: "Broadcast256Int64x2", + name: "Broadcast1To8Uint32x4", argLen: 1, generic: true, }, { - name: "Broadcast256Uint8x16", + name: "Broadcast1To8Uint64x2", argLen: 1, generic: true, }, { - name: "Broadcast256Uint16x8", + name: "Broadcast1To16Float32x4", argLen: 1, generic: true, }, { - name: "Broadcast256Uint32x4", + name: "Broadcast1To16Int8x16", argLen: 1, generic: true, }, { - name: "Broadcast256Uint64x2", + name: "Broadcast1To16Int16x8", argLen: 1, generic: true, }, { - name: "Broadcast512Float32x4", + name: "Broadcast1To16Int32x4", argLen: 1, generic: true, }, { - name: "Broadcast512Float64x2", + name: "Broadcast1To16Uint8x16", argLen: 1, generic: true, }, { - name: "Broadcast512Int8x16", + name: "Broadcast1To16Uint16x8", argLen: 1, generic: true, }, { - name: "Broadcast512Int16x8", + name: "Broadcast1To16Uint32x4", argLen: 1, generic: true, }, { - name: 
"Broadcast512Int32x4", + name: "Broadcast1To32Int8x16", argLen: 1, generic: true, }, { - name: "Broadcast512Int64x2", + name: "Broadcast1To32Int16x8", argLen: 1, generic: true, }, { - name: "Broadcast512Uint8x16", + name: "Broadcast1To32Uint8x16", argLen: 1, generic: true, }, { - name: "Broadcast512Uint16x8", + name: "Broadcast1To32Uint16x8", argLen: 1, generic: true, }, { - name: "Broadcast512Uint32x4", + name: "Broadcast1To64Int8x16", argLen: 1, generic: true, }, { - name: "Broadcast512Uint64x2", + name: "Broadcast1To64Uint8x16", argLen: 1, generic: true, }, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index e84bf19c83..fe0005bb05 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -2479,96 +2479,96 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpBitLen64(v) case OpBitLen8: return rewriteValueAMD64_OpBitLen8(v) - case OpBroadcast128Float32x4: - v.Op = OpAMD64VBROADCASTSS128 + case OpBroadcast1To16Float32x4: + v.Op = OpAMD64VBROADCASTSS512 return true - case OpBroadcast128Float64x2: - v.Op = OpAMD64VPBROADCASTQ128 + case OpBroadcast1To16Int16x8: + v.Op = OpAMD64VPBROADCASTW256 return true - case OpBroadcast128Int16x8: - v.Op = OpAMD64VPBROADCASTW128 + case OpBroadcast1To16Int32x4: + v.Op = OpAMD64VPBROADCASTD512 return true - case OpBroadcast128Int32x4: - v.Op = OpAMD64VPBROADCASTD128 + case OpBroadcast1To16Int8x16: + v.Op = OpAMD64VPBROADCASTB128 return true - case OpBroadcast128Int64x2: - v.Op = OpAMD64VPBROADCASTQ128 + case OpBroadcast1To16Uint16x8: + v.Op = OpAMD64VPBROADCASTW256 return true - case OpBroadcast128Int8x16: + case OpBroadcast1To16Uint32x4: + v.Op = OpAMD64VPBROADCASTD512 + return true + case OpBroadcast1To16Uint8x16: v.Op = OpAMD64VPBROADCASTB128 return true - case OpBroadcast128Uint16x8: - v.Op = OpAMD64VPBROADCASTW128 + case OpBroadcast1To2Float64x2: + v.Op = OpAMD64VPBROADCASTQ128 return true - case 
OpBroadcast128Uint32x4: - v.Op = OpAMD64VPBROADCASTD128 + case OpBroadcast1To2Int64x2: + v.Op = OpAMD64VPBROADCASTQ128 return true - case OpBroadcast128Uint64x2: + case OpBroadcast1To2Uint64x2: v.Op = OpAMD64VPBROADCASTQ128 return true - case OpBroadcast128Uint8x16: - v.Op = OpAMD64VPBROADCASTB128 + case OpBroadcast1To32Int16x8: + v.Op = OpAMD64VPBROADCASTW512 return true - case OpBroadcast256Float32x4: - v.Op = OpAMD64VBROADCASTSS256 + case OpBroadcast1To32Int8x16: + v.Op = OpAMD64VPBROADCASTB256 return true - case OpBroadcast256Float64x2: - v.Op = OpAMD64VBROADCASTSD256 + case OpBroadcast1To32Uint16x8: + v.Op = OpAMD64VPBROADCASTW512 return true - case OpBroadcast256Int16x8: - v.Op = OpAMD64VPBROADCASTW256 + case OpBroadcast1To32Uint8x16: + v.Op = OpAMD64VPBROADCASTB256 return true - case OpBroadcast256Int32x4: - v.Op = OpAMD64VPBROADCASTD256 + case OpBroadcast1To4Float32x4: + v.Op = OpAMD64VBROADCASTSS128 return true - case OpBroadcast256Int64x2: - v.Op = OpAMD64VPBROADCASTQ256 + case OpBroadcast1To4Float64x2: + v.Op = OpAMD64VBROADCASTSD256 return true - case OpBroadcast256Int8x16: - v.Op = OpAMD64VPBROADCASTB256 + case OpBroadcast1To4Int32x4: + v.Op = OpAMD64VPBROADCASTD128 return true - case OpBroadcast256Uint16x8: - v.Op = OpAMD64VPBROADCASTW256 + case OpBroadcast1To4Int64x2: + v.Op = OpAMD64VPBROADCASTQ256 return true - case OpBroadcast256Uint32x4: - v.Op = OpAMD64VPBROADCASTD256 + case OpBroadcast1To4Uint32x4: + v.Op = OpAMD64VPBROADCASTD128 return true - case OpBroadcast256Uint64x2: + case OpBroadcast1To4Uint64x2: v.Op = OpAMD64VPBROADCASTQ256 return true - case OpBroadcast256Uint8x16: - v.Op = OpAMD64VPBROADCASTB256 + case OpBroadcast1To64Int8x16: + v.Op = OpAMD64VPBROADCASTB512 return true - case OpBroadcast512Float32x4: - v.Op = OpAMD64VBROADCASTSS512 + case OpBroadcast1To64Uint8x16: + v.Op = OpAMD64VPBROADCASTB512 + return true + case OpBroadcast1To8Float32x4: + v.Op = OpAMD64VBROADCASTSS256 return true - case OpBroadcast512Float64x2: + case 
OpBroadcast1To8Float64x2: v.Op = OpAMD64VBROADCASTSD512 return true - case OpBroadcast512Int16x8: - v.Op = OpAMD64VPBROADCASTW512 + case OpBroadcast1To8Int16x8: + v.Op = OpAMD64VPBROADCASTW128 return true - case OpBroadcast512Int32x4: - v.Op = OpAMD64VPBROADCASTD512 + case OpBroadcast1To8Int32x4: + v.Op = OpAMD64VPBROADCASTD256 return true - case OpBroadcast512Int64x2: + case OpBroadcast1To8Int64x2: v.Op = OpAMD64VPBROADCASTQ512 return true - case OpBroadcast512Int8x16: - v.Op = OpAMD64VPBROADCASTB512 - return true - case OpBroadcast512Uint16x8: - v.Op = OpAMD64VPBROADCASTW512 + case OpBroadcast1To8Uint16x8: + v.Op = OpAMD64VPBROADCASTW128 return true - case OpBroadcast512Uint32x4: - v.Op = OpAMD64VPBROADCASTD512 + case OpBroadcast1To8Uint32x4: + v.Op = OpAMD64VPBROADCASTD256 return true - case OpBroadcast512Uint64x2: + case OpBroadcast1To8Uint64x2: v.Op = OpAMD64VPBROADCASTQ512 return true - case OpBroadcast512Uint8x16: - v.Op = OpAMD64VPBROADCASTB512 - return true case OpBswap16: return rewriteValueAMD64_OpBswap16(v) case OpBswap32: diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 4ad0c6032c..e50561845b 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -152,36 +152,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint16x8.Average", opLen2(ssa.OpAverageUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.Average", opLen2(ssa.OpAverageUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x32.Average", opLen2(ssa.OpAverageUint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.Broadcast128", opLen1(ssa.OpBroadcast128Float32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x2.Broadcast128", opLen1(ssa.OpBroadcast128Float64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x16.Broadcast128", opLen1(ssa.OpBroadcast128Int8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x8.Broadcast128", opLen1(ssa.OpBroadcast128Int16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x4.Broadcast128", opLen1(ssa.OpBroadcast128Int32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x2.Broadcast128", opLen1(ssa.OpBroadcast128Int64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x16.Broadcast128", opLen1(ssa.OpBroadcast128Uint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x8.Broadcast128", opLen1(ssa.OpBroadcast128Uint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x4.Broadcast128", opLen1(ssa.OpBroadcast128Uint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint64x2.Broadcast128", opLen1(ssa.OpBroadcast128Uint64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x4.Broadcast256", opLen1(ssa.OpBroadcast256Float32x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x2.Broadcast256", opLen1(ssa.OpBroadcast256Float64x2, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x16.Broadcast256", opLen1(ssa.OpBroadcast256Int8x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x8.Broadcast256", opLen1(ssa.OpBroadcast256Int16x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x4.Broadcast256", opLen1(ssa.OpBroadcast256Int32x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x2.Broadcast256", 
opLen1(ssa.OpBroadcast256Int64x2, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x16.Broadcast256", opLen1(ssa.OpBroadcast256Uint8x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x8.Broadcast256", opLen1(ssa.OpBroadcast256Uint16x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x4.Broadcast256", opLen1(ssa.OpBroadcast256Uint32x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint64x2.Broadcast256", opLen1(ssa.OpBroadcast256Uint64x2, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.Broadcast512", opLen1(ssa.OpBroadcast512Float32x4, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.Broadcast512", opLen1(ssa.OpBroadcast512Float64x2, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.Broadcast512", opLen1(ssa.OpBroadcast512Int8x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.Broadcast512", opLen1(ssa.OpBroadcast512Int16x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.Broadcast512", opLen1(ssa.OpBroadcast512Int32x4, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x2.Broadcast512", opLen1(ssa.OpBroadcast512Int64x2, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.Broadcast512", opLen1(ssa.OpBroadcast512Uint8x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.Broadcast512", opLen1(ssa.OpBroadcast512Uint16x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint32x4.Broadcast512", opLen1(ssa.OpBroadcast512Uint32x4, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint64x2.Broadcast512", opLen1(ssa.OpBroadcast512Uint64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Float64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Int64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Uint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x4.Broadcast1To4", 
opLen1(ssa.OpBroadcast1To4Float32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Float64x2, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int64x2, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint64x2, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Float32x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Float64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int32x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint32x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Float32x4, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x8.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int16x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int32x4, types.TypeVec512), sys.AMD64) + addF(simdPackage, 
"Uint8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x8.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint16x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint32x4, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Int8x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Int16x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Uint8x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x8.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Uint16x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.Broadcast1To64", opLen1(ssa.OpBroadcast1To64Int8x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.Broadcast1To64", opLen1(ssa.OpBroadcast1To64Uint8x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Ceil", opLen1(ssa.OpCeilFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64) diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml index 38bc9374cc..3cba01ef95 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml @@ -69,21 +69,36 @@ documentation: !string |- // NAME performs an expansion on a vector x whose elements are packed to lower parts. // The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. 
-- go: Broadcast128 +- go: Broadcast1To2 commutative: false documentation: !string |- - // NAME copies element zero of its (128-bit) input to all elements of - // the 128-bit output vector. -- go: Broadcast256 + // NAME copies the lowest element of its input to all 2 elements of + // the output vector. +- go: Broadcast1To4 commutative: false documentation: !string |- - // NAME copies element zero of its (128-bit) input to all elements of - // the 256-bit output vector. -- go: Broadcast512 + // NAME copies the lowest element of its input to all 4 elements of + // the output vector. +- go: Broadcast1To8 commutative: false documentation: !string |- - // NAME copies element zero of its (128-bit) input to all elements of - // the 512-bit output vector. + // NAME copies the lowest element of its input to all 8 elements of + // the output vector. +- go: Broadcast1To16 + commutative: false + documentation: !string |- + // NAME copies the lowest element of its input to all 16 elements of + // the output vector. +- go: Broadcast1To32 + commutative: false + documentation: !string |- + // NAME copies the lowest element of its input to all 32 elements of + // the output vector. +- go: Broadcast1To64 + commutative: false + documentation: !string |- + // NAME copies the lowest element of its input to all 64 elements of + // the output vector. - go: PermuteOrZeroGrouped commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. 
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml index e1fd184ed7..02daa2ea1e 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml @@ -376,21 +376,21 @@ out: - *any -- go: Broadcast128 - asm: VPBROADCAST[BWDQ] +- go: Broadcast1To2 + asm: VPBROADCASTQ in: - class: vreg bits: 128 - elemBits: $e + elemBits: 64 base: $b out: - class: vreg bits: 128 - elemBits: $e + elemBits: 64 base: $b # weirdly, this one case on AVX2 is memory-operand-only -- go: Broadcast128 +- go: Broadcast1To2 asm: VPBROADCASTQ in: - class: vreg @@ -405,71 +405,94 @@ base: int OverwriteBase: float -- go: Broadcast256 +- go: Broadcast1To4 asm: VPBROADCAST[BWDQ] in: - class: vreg bits: 128 - elemBits: $e base: $b out: - class: vreg - bits: 256 - elemBits: $e + lanes: 4 base: $b -- go: Broadcast512 +- go: Broadcast1To8 asm: VPBROADCAST[BWDQ] in: - class: vreg bits: 128 - elemBits: $e base: $b out: - class: vreg - bits: 512 - elemBits: $e + lanes: 8 base: $b -- go: Broadcast128 - asm: VBROADCASTS[SD] +- go: Broadcast1To16 + asm: VPBROADCAST[BWDQ] in: - class: vreg bits: 128 - elemBits: $e base: $b out: - class: vreg - bits: 128 - elemBits: $e + lanes: 16 base: $b -- go: Broadcast256 - asm: VBROADCASTS[SD] +- go: Broadcast1To32 + asm: VPBROADCAST[BWDQ] in: - class: vreg bits: 128 - elemBits: $e base: $b out: - class: vreg - bits: 256 - elemBits: $e + lanes: 32 base: $b -- go: Broadcast512 - asm: VBROADCASTS[SD] +- go: Broadcast1To64 + asm: VPBROADCASTB in: - class: vreg bits: 128 - elemBits: $e base: $b out: - class: vreg - bits: 512 - elemBits: $e + lanes: 64 base: $b +- go: Broadcast1To4 + asm: VBROADCASTS[SD] + in: + - class: vreg + bits: 128 + base: float + out: + - class: vreg + lanes: 4 + base: float + +- go: Broadcast1To8 + asm: VBROADCASTS[SD] + in: + - class: vreg + bits: 128 + base: float + out: + - class: vreg + lanes: 8 + base: float + +- go: Broadcast1To16 + asm: 
VBROADCASTS[SD] + in: + - class: vreg + bits: 128 + base: float + out: + - class: vreg + lanes: 16 + base: float + # VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX) - go: PermuteOrZero asm: VPSHUFB diff --git a/src/simd/archsimd/_gen/tmplgen/main.go b/src/simd/archsimd/_gen/tmplgen/main.go index 8db185e1e0..45338b765d 100644 --- a/src/simd/archsimd/_gen/tmplgen/main.go +++ b/src/simd/archsimd/_gen/tmplgen/main.go @@ -873,7 +873,7 @@ var broadcastTemplate = templateOf("Broadcast functions", ` // Emulated, CPU Feature: {{.CPUfeatureBC}} func Broadcast{{.VType}}(x {{.Etype}}) {{.VType}} { var z {{.As128BitVec }} - return z.SetElem(0, x).Broadcast{{.Vwidth}}() + return z.SetElem(0, x).Broadcast1To{{.Count}}() } `) diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go index eba340c793..bb162c4ff9 100644 --- a/src/simd/archsimd/ops_amd64.go +++ b/src/simd/archsimd/ops_amd64.go @@ -805,191 +805,197 @@ func (x Uint16x16) Average(y Uint16x16) Uint16x16 // Asm: VPAVGW, CPU Feature: AVX512 func (x Uint16x32) Average(y Uint16x32) Uint16x32 -/* Broadcast128 */ +/* Broadcast1To2 */ -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To2 copies the lowest element of its input to all 2 elements of +// the output vector. // -// Asm: VBROADCASTSS, CPU Feature: AVX2 -func (x Float32x4) Broadcast128() Float32x4 +// Asm: VPBROADCASTQ, CPU Feature: AVX2 +func (x Float64x2) Broadcast1To2() Float64x2 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To2 copies the lowest element of its input to all 2 elements of +// the output vector. 
// // Asm: VPBROADCASTQ, CPU Feature: AVX2 -func (x Float64x2) Broadcast128() Float64x2 +func (x Int64x2) Broadcast1To2() Int64x2 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To2 copies the lowest element of its input to all 2 elements of +// the output vector. // -// Asm: VPBROADCASTB, CPU Feature: AVX2 -func (x Int8x16) Broadcast128() Int8x16 +// Asm: VPBROADCASTQ, CPU Feature: AVX2 +func (x Uint64x2) Broadcast1To2() Uint64x2 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. -// -// Asm: VPBROADCASTW, CPU Feature: AVX2 -func (x Int16x8) Broadcast128() Int16x8 +/* Broadcast1To4 */ -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To4 copies the lowest element of its input to all 4 elements of +// the output vector. // -// Asm: VPBROADCASTD, CPU Feature: AVX2 -func (x Int32x4) Broadcast128() Int32x4 +// Asm: VBROADCASTSS, CPU Feature: AVX2 +func (x Float32x4) Broadcast1To4() Float32x4 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To4 copies the lowest element of its input to all 4 elements of +// the output vector. // -// Asm: VPBROADCASTQ, CPU Feature: AVX2 -func (x Int64x2) Broadcast128() Int64x2 +// Asm: VBROADCASTSD, CPU Feature: AVX2 +func (x Float64x2) Broadcast1To4() Float64x4 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To4 copies the lowest element of its input to all 4 elements of +// the output vector. // -// Asm: VPBROADCASTB, CPU Feature: AVX2 -func (x Uint8x16) Broadcast128() Uint8x16 +// Asm: VPBROADCASTD, CPU Feature: AVX2 +func (x Int32x4) Broadcast1To4() Int32x4 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. 
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of +// the output vector. // -// Asm: VPBROADCASTW, CPU Feature: AVX2 -func (x Uint16x8) Broadcast128() Uint16x8 +// Asm: VPBROADCASTQ, CPU Feature: AVX2 +func (x Int64x2) Broadcast1To4() Int64x4 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To4 copies the lowest element of its input to all 4 elements of +// the output vector. // // Asm: VPBROADCASTD, CPU Feature: AVX2 -func (x Uint32x4) Broadcast128() Uint32x4 +func (x Uint32x4) Broadcast1To4() Uint32x4 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To4 copies the lowest element of its input to all 4 elements of +// the output vector. // // Asm: VPBROADCASTQ, CPU Feature: AVX2 -func (x Uint64x2) Broadcast128() Uint64x2 +func (x Uint64x2) Broadcast1To4() Uint64x4 -/* Broadcast256 */ +/* Broadcast1To8 */ -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. // // Asm: VBROADCASTSS, CPU Feature: AVX2 -func (x Float32x4) Broadcast256() Float32x8 - -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. -// -// Asm: VBROADCASTSD, CPU Feature: AVX2 -func (x Float64x2) Broadcast256() Float64x4 +func (x Float32x4) Broadcast1To8() Float32x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. 
// -// Asm: VPBROADCASTB, CPU Feature: AVX2 -func (x Int8x16) Broadcast256() Int8x32 +// Asm: VBROADCASTSD, CPU Feature: AVX512 +func (x Float64x2) Broadcast1To8() Float64x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. // // Asm: VPBROADCASTW, CPU Feature: AVX2 -func (x Int16x8) Broadcast256() Int16x16 +func (x Int16x8) Broadcast1To8() Int16x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. // // Asm: VPBROADCASTD, CPU Feature: AVX2 -func (x Int32x4) Broadcast256() Int32x8 +func (x Int32x4) Broadcast1To8() Int32x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. // -// Asm: VPBROADCASTQ, CPU Feature: AVX2 -func (x Int64x2) Broadcast256() Int64x4 - -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. -// -// Asm: VPBROADCASTB, CPU Feature: AVX2 -func (x Uint8x16) Broadcast256() Uint8x32 +// Asm: VPBROADCASTQ, CPU Feature: AVX512 +func (x Int64x2) Broadcast1To8() Int64x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. // // Asm: VPBROADCASTW, CPU Feature: AVX2 -func (x Uint16x8) Broadcast256() Uint16x16 +func (x Uint16x8) Broadcast1To8() Uint16x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. 
// // Asm: VPBROADCASTD, CPU Feature: AVX2 -func (x Uint32x4) Broadcast256() Uint32x8 +func (x Uint32x4) Broadcast1To8() Uint32x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. // -// Asm: VPBROADCASTQ, CPU Feature: AVX2 -func (x Uint64x2) Broadcast256() Uint64x4 +// Asm: VPBROADCASTQ, CPU Feature: AVX512 +func (x Uint64x2) Broadcast1To8() Uint64x8 -/* Broadcast512 */ +/* Broadcast1To16 */ -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. // // Asm: VBROADCASTSS, CPU Feature: AVX512 -func (x Float32x4) Broadcast512() Float32x16 +func (x Float32x4) Broadcast1To16() Float32x16 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. // -// Asm: VBROADCASTSD, CPU Feature: AVX512 -func (x Float64x2) Broadcast512() Float64x8 +// Asm: VPBROADCASTB, CPU Feature: AVX2 +func (x Int8x16) Broadcast1To16() Int8x16 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. // -// Asm: VPBROADCASTB, CPU Feature: AVX512 -func (x Int8x16) Broadcast512() Int8x64 +// Asm: VPBROADCASTW, CPU Feature: AVX2 +func (x Int16x8) Broadcast1To16() Int16x16 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. 
// -// Asm: VPBROADCASTW, CPU Feature: AVX512 -func (x Int16x8) Broadcast512() Int16x32 +// Asm: VPBROADCASTD, CPU Feature: AVX512 +func (x Int32x4) Broadcast1To16() Int32x16 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. +// +// Asm: VPBROADCASTB, CPU Feature: AVX2 +func (x Uint8x16) Broadcast1To16() Uint8x16 + +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. +// +// Asm: VPBROADCASTW, CPU Feature: AVX2 +func (x Uint16x8) Broadcast1To16() Uint16x16 + +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. // // Asm: VPBROADCASTD, CPU Feature: AVX512 -func (x Int32x4) Broadcast512() Int32x16 +func (x Uint32x4) Broadcast1To16() Uint32x16 + +/* Broadcast1To32 */ -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To32 copies the lowest element of its input to all 32 elements of +// the output vector. // -// Asm: VPBROADCASTQ, CPU Feature: AVX512 -func (x Int64x2) Broadcast512() Int64x8 +// Asm: VPBROADCASTB, CPU Feature: AVX2 +func (x Int8x16) Broadcast1To32() Int8x32 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To32 copies the lowest element of its input to all 32 elements of +// the output vector. // -// Asm: VPBROADCASTB, CPU Feature: AVX512 -func (x Uint8x16) Broadcast512() Uint8x64 +// Asm: VPBROADCASTW, CPU Feature: AVX512 +func (x Int16x8) Broadcast1To32() Int16x32 + +// Broadcast1To32 copies the lowest element of its input to all 32 elements of +// the output vector. 
+// +// Asm: VPBROADCASTB, CPU Feature: AVX2 +func (x Uint8x16) Broadcast1To32() Uint8x32 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To32 copies the lowest element of its input to all 32 elements of +// the output vector. // // Asm: VPBROADCASTW, CPU Feature: AVX512 -func (x Uint16x8) Broadcast512() Uint16x32 +func (x Uint16x8) Broadcast1To32() Uint16x32 + +/* Broadcast1To64 */ -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To64 copies the lowest element of its input to all 64 elements of +// the output vector. // -// Asm: VPBROADCASTD, CPU Feature: AVX512 -func (x Uint32x4) Broadcast512() Uint32x16 +// Asm: VPBROADCASTB, CPU Feature: AVX512 +func (x Int8x16) Broadcast1To64() Int8x64 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To64 copies the lowest element of its input to all 64 elements of +// the output vector. 
// -// Asm: VPBROADCASTQ, CPU Feature: AVX512 -func (x Uint64x2) Broadcast512() Uint64x8 +// Asm: VPBROADCASTB, CPU Feature: AVX512 +func (x Uint8x16) Broadcast1To64() Uint8x64 /* Ceil */ diff --git a/src/simd/archsimd/other_gen_amd64.go b/src/simd/archsimd/other_gen_amd64.go index 647001acce..c250dc2436 100644 --- a/src/simd/archsimd/other_gen_amd64.go +++ b/src/simd/archsimd/other_gen_amd64.go @@ -10,7 +10,7 @@ package archsimd // Emulated, CPU Feature: AVX2 func BroadcastInt8x16(x int8) Int8x16 { var z Int8x16 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastInt16x8 returns a vector with the input @@ -19,7 +19,7 @@ func BroadcastInt8x16(x int8) Int8x16 { // Emulated, CPU Feature: AVX2 func BroadcastInt16x8(x int16) Int16x8 { var z Int16x8 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To8() } // BroadcastInt32x4 returns a vector with the input @@ -28,7 +28,7 @@ func BroadcastInt16x8(x int16) Int16x8 { // Emulated, CPU Feature: AVX2 func BroadcastInt32x4(x int32) Int32x4 { var z Int32x4 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To4() } // BroadcastInt64x2 returns a vector with the input @@ -37,7 +37,7 @@ func BroadcastInt32x4(x int32) Int32x4 { // Emulated, CPU Feature: AVX2 func BroadcastInt64x2(x int64) Int64x2 { var z Int64x2 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To2() } // BroadcastUint8x16 returns a vector with the input @@ -46,7 +46,7 @@ func BroadcastInt64x2(x int64) Int64x2 { // Emulated, CPU Feature: AVX2 func BroadcastUint8x16(x uint8) Uint8x16 { var z Uint8x16 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastUint16x8 returns a vector with the input @@ -55,7 +55,7 @@ func BroadcastUint8x16(x uint8) Uint8x16 { // Emulated, CPU Feature: AVX2 func BroadcastUint16x8(x uint16) Uint16x8 { var z Uint16x8 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, 
x).Broadcast1To8() } // BroadcastUint32x4 returns a vector with the input @@ -64,7 +64,7 @@ func BroadcastUint16x8(x uint16) Uint16x8 { // Emulated, CPU Feature: AVX2 func BroadcastUint32x4(x uint32) Uint32x4 { var z Uint32x4 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To4() } // BroadcastUint64x2 returns a vector with the input @@ -73,7 +73,7 @@ func BroadcastUint32x4(x uint32) Uint32x4 { // Emulated, CPU Feature: AVX2 func BroadcastUint64x2(x uint64) Uint64x2 { var z Uint64x2 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To2() } // BroadcastFloat32x4 returns a vector with the input @@ -82,7 +82,7 @@ func BroadcastUint64x2(x uint64) Uint64x2 { // Emulated, CPU Feature: AVX2 func BroadcastFloat32x4(x float32) Float32x4 { var z Float32x4 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To4() } // BroadcastFloat64x2 returns a vector with the input @@ -91,7 +91,7 @@ func BroadcastFloat32x4(x float32) Float32x4 { // Emulated, CPU Feature: AVX2 func BroadcastFloat64x2(x float64) Float64x2 { var z Float64x2 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To2() } // BroadcastInt8x32 returns a vector with the input @@ -100,7 +100,7 @@ func BroadcastFloat64x2(x float64) Float64x2 { // Emulated, CPU Feature: AVX2 func BroadcastInt8x32(x int8) Int8x32 { var z Int8x16 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To32() } // BroadcastInt16x16 returns a vector with the input @@ -109,7 +109,7 @@ func BroadcastInt8x32(x int8) Int8x32 { // Emulated, CPU Feature: AVX2 func BroadcastInt16x16(x int16) Int16x16 { var z Int16x8 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastInt32x8 returns a vector with the input @@ -118,7 +118,7 @@ func BroadcastInt16x16(x int16) Int16x16 { // Emulated, CPU Feature: AVX2 func BroadcastInt32x8(x int32) Int32x8 { var z Int32x4 - return z.SetElem(0, x).Broadcast256() 
+ return z.SetElem(0, x).Broadcast1To8() } // BroadcastInt64x4 returns a vector with the input @@ -127,7 +127,7 @@ func BroadcastInt32x8(x int32) Int32x8 { // Emulated, CPU Feature: AVX2 func BroadcastInt64x4(x int64) Int64x4 { var z Int64x2 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To4() } // BroadcastUint8x32 returns a vector with the input @@ -136,7 +136,7 @@ func BroadcastInt64x4(x int64) Int64x4 { // Emulated, CPU Feature: AVX2 func BroadcastUint8x32(x uint8) Uint8x32 { var z Uint8x16 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To32() } // BroadcastUint16x16 returns a vector with the input @@ -145,7 +145,7 @@ func BroadcastUint8x32(x uint8) Uint8x32 { // Emulated, CPU Feature: AVX2 func BroadcastUint16x16(x uint16) Uint16x16 { var z Uint16x8 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastUint32x8 returns a vector with the input @@ -154,7 +154,7 @@ func BroadcastUint16x16(x uint16) Uint16x16 { // Emulated, CPU Feature: AVX2 func BroadcastUint32x8(x uint32) Uint32x8 { var z Uint32x4 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To8() } // BroadcastUint64x4 returns a vector with the input @@ -163,7 +163,7 @@ func BroadcastUint32x8(x uint32) Uint32x8 { // Emulated, CPU Feature: AVX2 func BroadcastUint64x4(x uint64) Uint64x4 { var z Uint64x2 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To4() } // BroadcastFloat32x8 returns a vector with the input @@ -172,7 +172,7 @@ func BroadcastUint64x4(x uint64) Uint64x4 { // Emulated, CPU Feature: AVX2 func BroadcastFloat32x8(x float32) Float32x8 { var z Float32x4 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To8() } // BroadcastFloat64x4 returns a vector with the input @@ -181,7 +181,7 @@ func BroadcastFloat32x8(x float32) Float32x8 { // Emulated, CPU Feature: AVX2 func BroadcastFloat64x4(x float64) Float64x4 { var z Float64x2 - 
return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To4() } // BroadcastInt8x64 returns a vector with the input @@ -190,7 +190,7 @@ func BroadcastFloat64x4(x float64) Float64x4 { // Emulated, CPU Feature: AVX512BW func BroadcastInt8x64(x int8) Int8x64 { var z Int8x16 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To64() } // BroadcastInt16x32 returns a vector with the input @@ -199,7 +199,7 @@ func BroadcastInt8x64(x int8) Int8x64 { // Emulated, CPU Feature: AVX512BW func BroadcastInt16x32(x int16) Int16x32 { var z Int16x8 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To32() } // BroadcastInt32x16 returns a vector with the input @@ -208,7 +208,7 @@ func BroadcastInt16x32(x int16) Int16x32 { // Emulated, CPU Feature: AVX512F func BroadcastInt32x16(x int32) Int32x16 { var z Int32x4 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastInt64x8 returns a vector with the input @@ -217,7 +217,7 @@ func BroadcastInt32x16(x int32) Int32x16 { // Emulated, CPU Feature: AVX512F func BroadcastInt64x8(x int64) Int64x8 { var z Int64x2 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To8() } // BroadcastUint8x64 returns a vector with the input @@ -226,7 +226,7 @@ func BroadcastInt64x8(x int64) Int64x8 { // Emulated, CPU Feature: AVX512BW func BroadcastUint8x64(x uint8) Uint8x64 { var z Uint8x16 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To64() } // BroadcastUint16x32 returns a vector with the input @@ -235,7 +235,7 @@ func BroadcastUint8x64(x uint8) Uint8x64 { // Emulated, CPU Feature: AVX512BW func BroadcastUint16x32(x uint16) Uint16x32 { var z Uint16x8 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To32() } // BroadcastUint32x16 returns a vector with the input @@ -244,7 +244,7 @@ func BroadcastUint16x32(x uint16) Uint16x32 { // Emulated, CPU Feature: AVX512F func 
BroadcastUint32x16(x uint32) Uint32x16 { var z Uint32x4 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastUint64x8 returns a vector with the input @@ -253,7 +253,7 @@ func BroadcastUint32x16(x uint32) Uint32x16 { // Emulated, CPU Feature: AVX512F func BroadcastUint64x8(x uint64) Uint64x8 { var z Uint64x2 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To8() } // BroadcastFloat32x16 returns a vector with the input @@ -262,7 +262,7 @@ func BroadcastUint64x8(x uint64) Uint64x8 { // Emulated, CPU Feature: AVX512F func BroadcastFloat32x16(x float32) Float32x16 { var z Float32x4 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastFloat64x8 returns a vector with the input @@ -271,7 +271,7 @@ func BroadcastFloat32x16(x float32) Float32x16 { // Emulated, CPU Feature: AVX512F func BroadcastFloat64x8(x float64) Float64x8 { var z Float64x2 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To8() } // ToMask converts from Int8x16 to Mask8x16, mask element is set to true when the corresponding vector element is non-zero. 
-- cgit v1.3 From 4b89bcb8b7141c7e4ef1a7dbb4c3f17f589d89c0 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 19 Dec 2025 23:14:36 +0100 Subject: lib/fips140: freeze v1.26.0 FIPS 140-3 module Fixes #76770 Change-Id: Ia617f01ea9be0d1759147b6cca0403c56a6a6964 Reviewed-on: https://go-review.googlesource.com/c/go/+/731840 Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Auto-Submit: Filippo Valsorda Reviewed-by: Junyang Shao --- lib/fips140/fips140.sum | 2 +- lib/fips140/v1.1.0-rc1.zip | Bin 678896 -> 0 bytes lib/fips140/v1.26.0.zip | Bin 0 -> 676132 bytes 3 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 lib/fips140/v1.1.0-rc1.zip create mode 100644 lib/fips140/v1.26.0.zip diff --git a/lib/fips140/fips140.sum b/lib/fips140/fips140.sum index c4d185da73..050957af60 100644 --- a/lib/fips140/fips140.sum +++ b/lib/fips140/fips140.sum @@ -10,4 +10,4 @@ # go test cmd/go/internal/fips140 -update # v1.0.0-c2097c7c.zip daf3614e0406f67ae6323c902db3f953a1effb199142362a039e7526dfb9368b -v1.1.0-rc1.zip ea94f8c3885294c9efe1bd8f9b6e86daeb25b6aff2aeb20707cd9a5101f6f54e +v1.26.0.zip 9b28f847fdf1db4a36cb2b2f8ec09443c039383f085630a03ecfaddf6db7ea23 diff --git a/lib/fips140/v1.1.0-rc1.zip b/lib/fips140/v1.1.0-rc1.zip deleted file mode 100644 index d4264bdb2e..0000000000 Binary files a/lib/fips140/v1.1.0-rc1.zip and /dev/null differ diff --git a/lib/fips140/v1.26.0.zip b/lib/fips140/v1.26.0.zip new file mode 100644 index 0000000000..f53ade8036 Binary files /dev/null and b/lib/fips140/v1.26.0.zip differ -- cgit v1.3 From 6b2505c79cb3838c6e27cf47ac09980fe51c83c2 Mon Sep 17 00:00:00 2001 From: Neal Patel Date: Tue, 6 Jan 2026 16:09:19 -0500 Subject: cmd/go: remove user-content from doc strings in cgo ASTs. Thank you to RyotaK (https://ryotak.net) of GMO Flatt Security Inc. for reporting this issue. 
Updates golang/go#76697 Fixes CVE-2025-61732 Change-Id: I1121502f1bf1e91309eb4bd41cc3a09c39366d36 Reviewed-on: https://go-review.googlesource.com/c/go/+/734220 Reviewed-by: Agustin Hernandez Reviewed-by: David Chase Reviewed-by: Robert Griesemer LUCI-TryBot-Result: Go LUCI --- src/cmd/cgo/ast.go | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/cmd/cgo/ast.go b/src/cmd/cgo/ast.go index 2da6ca5a30..df0552f525 100644 --- a/src/cmd/cgo/ast.go +++ b/src/cmd/cgo/ast.go @@ -301,17 +301,12 @@ func (f *File) saveExport(x any, context astContext) { error_(c.Pos(), "export comment has wrong name %q, want %q", name, n.Name.Name) } - doc := "" - for _, c1 := range n.Doc.List { - if c1 != c { - doc += c1.Text + "\n" - } - } - f.ExpFunc = append(f.ExpFunc, &ExpFunc{ Func: n, ExpName: name, - Doc: doc, + // Caution: the Doc field is deliberately left unset + // so that no user-supplied comment text ends up + // in the binary. See https://go.dev/issue/76697. }) break } -- cgit v1.3