diff options
| author | Cherry Mui <cherryyz@google.com> | 2025-12-31 02:42:30 -0500 |
|---|---|---|
| committer | Cherry Mui <cherryyz@google.com> | 2026-01-02 12:13:40 -0800 |
| commit | 13440fb51831bfde5804430596d2045a64fd2209 (patch) | |
| tree | 84196ec227a3ed45eee119c10a434169216e9516 /src/cmd | |
| parent | c3550b3352ae283110c443576e1e62cdf8efaa72 (diff) | |
| download | go-13440fb51831bfde5804430596d2045a64fd2209.tar.xz | |
simd/archsimd: make IsNaN unary
Currently, the IsNan API is defined as x.IsNan(y), which returns
a mask indicating, for each element, whether either x or y is NaN.
Albeit closer to the machine instruction, this is a weird API, as
IsNaN is a unary operation. This CL changes it to be unary: x.IsNaN().
It compiles to VCMPPS $3, x, x (or VCMPPD). For the two-operand
version, we can optimize x.IsNaN().Or(y.IsNaN()) to VCMPPS $3, x,
y (not done in this CL).
While here, change the name to IsNaN (uppercase both Ns), which
matches math.IsNaN.
Tests in the next CL.
Change-Id: Ib6e7afc2635e6c3c606db5ea16420ee673a6c6d6
Reviewed-on: https://go-review.googlesource.com/c/go/+/733660
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Diffstat (limited to 'src/cmd')
| -rw-r--r-- | src/cmd/compile/internal/ssa/_gen/AMD64.rules | 7 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/_gen/genericOps.go | 8 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/_gen/simdAMD64.rules | 6 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/_gen/simdgenericOps.go | 6 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/opGen.go | 78 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssa/rewriteAMD64.go | 84 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssagen/intrinsics.go | 6 | ||||
| -rw-r--r-- | src/cmd/compile/internal/ssagen/simdintrinsics.go | 6 |
8 files changed, 93 insertions, 108 deletions
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules index 38ca44f7eb..9c54186854 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules @@ -1730,6 +1730,13 @@ // Misc (IsZeroVec x) => (SETEQ (VPTEST x x)) +(IsNaNFloat32x4 x) => (VCMPPS128 [3] x x) +(IsNaNFloat32x8 x) => (VCMPPS256 [3] x x) +(IsNaNFloat32x16 x) => (VPMOVMToVec32x16 (VCMPPS512 [3] x x)) +(IsNaNFloat64x2 x) => (VCMPPD128 [3] x x) +(IsNaNFloat64x4 x) => (VCMPPD256 [3] x x) +(IsNaNFloat64x8 x) => (VPMOVMToVec64x8 (VCMPPD512 [3] x x)) + // SIMD vector K-masked loads and stores (LoadMasked64 <t> ptr mask mem) && t.Size() == 64 => (VPMASK64load512 ptr (VPMOVVec64x8ToM <types.TypeMask> mask) mem) diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go index 8637133e5f..85bde1aab2 100644 --- a/src/cmd/compile/internal/ssa/_gen/genericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go @@ -715,6 +715,14 @@ var genericOps = []opData{ // Returns true if arg0 is all zero. {name: "IsZeroVec", argLength: 1}, + + // Returns a mask indicating whether arg0's elements are NaN. + {name: "IsNaNFloat32x4", argLength: 1}, + {name: "IsNaNFloat32x8", argLength: 1}, + {name: "IsNaNFloat32x16", argLength: 1}, + {name: "IsNaNFloat64x2", argLength: 1}, + {name: "IsNaNFloat64x4", argLength: 1}, + {name: "IsNaNFloat64x8", argLength: 1}, } // kind controls successors implicit exit diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 8a5b70da30..5c83f39a1f 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -559,12 +559,6 @@ (InterleaveLoGroupedUint32x16 ...) => (VPUNPCKLDQ512 ...) (InterleaveLoGroupedUint64x4 ...) => (VPUNPCKLQDQ256 ...) (InterleaveLoGroupedUint64x8 ...) => (VPUNPCKLQDQ512 ...) 
-(IsNanFloat32x4 x y) => (VCMPPS128 [3] x y) -(IsNanFloat32x8 x y) => (VCMPPS256 [3] x y) -(IsNanFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [3] x y)) -(IsNanFloat64x2 x y) => (VCMPPD128 [3] x y) -(IsNanFloat64x4 x y) => (VCMPPD256 [3] x y) -(IsNanFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [3] x y)) (LeadingZerosInt32x4 ...) => (VPLZCNTD128 ...) (LeadingZerosInt32x8 ...) => (VPLZCNTD256 ...) (LeadingZerosInt32x16 ...) => (VPLZCNTD512 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index af1007cd54..889ab0d84f 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -519,12 +519,6 @@ func simdGenericOps() []opData { {name: "InterleaveLoUint16x8", argLength: 2, commutative: false}, {name: "InterleaveLoUint32x4", argLength: 2, commutative: false}, {name: "InterleaveLoUint64x2", argLength: 2, commutative: false}, - {name: "IsNanFloat32x4", argLength: 2, commutative: true}, - {name: "IsNanFloat32x8", argLength: 2, commutative: true}, - {name: "IsNanFloat32x16", argLength: 2, commutative: true}, - {name: "IsNanFloat64x2", argLength: 2, commutative: true}, - {name: "IsNanFloat64x4", argLength: 2, commutative: true}, - {name: "IsNanFloat64x8", argLength: 2, commutative: true}, {name: "LeadingZerosInt32x4", argLength: 1, commutative: false}, {name: "LeadingZerosInt32x8", argLength: 1, commutative: false}, {name: "LeadingZerosInt32x16", argLength: 1, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index d2ba15f740..abaf7911d4 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -6158,6 +6158,12 @@ const ( OpCvtMask64x4to8 OpCvtMask64x8to8 OpIsZeroVec + OpIsNaNFloat32x4 + OpIsNaNFloat32x8 + OpIsNaNFloat32x16 + OpIsNaNFloat64x2 + OpIsNaNFloat64x4 + OpIsNaNFloat64x8 OpAESDecryptLastRoundUint8x16 
OpAESDecryptLastRoundUint8x32 OpAESDecryptLastRoundUint8x64 @@ -6673,12 +6679,6 @@ const ( OpInterleaveLoUint16x8 OpInterleaveLoUint32x4 OpInterleaveLoUint64x2 - OpIsNanFloat32x4 - OpIsNanFloat32x8 - OpIsNanFloat32x16 - OpIsNanFloat64x2 - OpIsNanFloat64x4 - OpIsNanFloat64x8 OpLeadingZerosInt32x4 OpLeadingZerosInt32x8 OpLeadingZerosInt32x16 @@ -88994,6 +88994,36 @@ var opcodeTable = [...]opInfo{ generic: true, }, { + name: "IsNaNFloat32x4", + argLen: 1, + generic: true, + }, + { + name: "IsNaNFloat32x8", + argLen: 1, + generic: true, + }, + { + name: "IsNaNFloat32x16", + argLen: 1, + generic: true, + }, + { + name: "IsNaNFloat64x2", + argLen: 1, + generic: true, + }, + { + name: "IsNaNFloat64x4", + argLen: 1, + generic: true, + }, + { + name: "IsNaNFloat64x8", + argLen: 1, + generic: true, + }, + { name: "AESDecryptLastRoundUint8x16", argLen: 2, generic: true, @@ -91671,42 +91701,6 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "IsNanFloat32x4", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "IsNanFloat32x8", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "IsNanFloat32x16", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "IsNanFloat64x2", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "IsNanFloat64x4", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "IsNanFloat64x8", - argLen: 2, - commutative: true, - generic: true, - }, - { name: "LeadingZerosInt32x4", argLen: 1, generic: true, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 0c04410074..0b2bb74ce4 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -3773,18 +3773,18 @@ func rewriteValueAMD64(v *Value) bool { return true case OpIsInBounds: return rewriteValueAMD64_OpIsInBounds(v) - case OpIsNanFloat32x16: - return rewriteValueAMD64_OpIsNanFloat32x16(v) - case OpIsNanFloat32x4: - 
return rewriteValueAMD64_OpIsNanFloat32x4(v) - case OpIsNanFloat32x8: - return rewriteValueAMD64_OpIsNanFloat32x8(v) - case OpIsNanFloat64x2: - return rewriteValueAMD64_OpIsNanFloat64x2(v) - case OpIsNanFloat64x4: - return rewriteValueAMD64_OpIsNanFloat64x4(v) - case OpIsNanFloat64x8: - return rewriteValueAMD64_OpIsNanFloat64x8(v) + case OpIsNaNFloat32x16: + return rewriteValueAMD64_OpIsNaNFloat32x16(v) + case OpIsNaNFloat32x4: + return rewriteValueAMD64_OpIsNaNFloat32x4(v) + case OpIsNaNFloat32x8: + return rewriteValueAMD64_OpIsNaNFloat32x8(v) + case OpIsNaNFloat64x2: + return rewriteValueAMD64_OpIsNaNFloat64x2(v) + case OpIsNaNFloat64x4: + return rewriteValueAMD64_OpIsNaNFloat64x4(v) + case OpIsNaNFloat64x8: + return rewriteValueAMD64_OpIsNaNFloat64x8(v) case OpIsNonNil: return rewriteValueAMD64_OpIsNonNil(v) case OpIsSliceInBounds: @@ -70957,94 +70957,82 @@ func rewriteValueAMD64_OpIsInBounds(v *Value) bool { return true } } -func rewriteValueAMD64_OpIsNanFloat32x16(v *Value) bool { - v_1 := v.Args[1] +func rewriteValueAMD64_OpIsNaNFloat32x16(v *Value) bool { v_0 := v.Args[0] b := v.Block typ := &b.Func.Config.Types - // match: (IsNanFloat32x16 x y) - // result: (VPMOVMToVec32x16 (VCMPPS512 [3] x y)) + // match: (IsNaNFloat32x16 x) + // result: (VPMOVMToVec32x16 (VCMPPS512 [3] x x)) for { x := v_0 - y := v_1 v.reset(OpAMD64VPMOVMToVec32x16) v0 := b.NewValue0(v.Pos, OpAMD64VCMPPS512, typ.Mask) v0.AuxInt = uint8ToAuxInt(3) - v0.AddArg2(x, y) + v0.AddArg2(x, x) v.AddArg(v0) return true } } -func rewriteValueAMD64_OpIsNanFloat32x4(v *Value) bool { - v_1 := v.Args[1] +func rewriteValueAMD64_OpIsNaNFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: (IsNanFloat32x4 x y) - // result: (VCMPPS128 [3] x y) + // match: (IsNaNFloat32x4 x) + // result: (VCMPPS128 [3] x x) for { x := v_0 - y := v_1 v.reset(OpAMD64VCMPPS128) v.AuxInt = uint8ToAuxInt(3) - v.AddArg2(x, y) + v.AddArg2(x, x) return true } } -func rewriteValueAMD64_OpIsNanFloat32x8(v *Value) bool { - v_1 := 
v.Args[1] +func rewriteValueAMD64_OpIsNaNFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (IsNanFloat32x8 x y) - // result: (VCMPPS256 [3] x y) + // match: (IsNaNFloat32x8 x) + // result: (VCMPPS256 [3] x x) for { x := v_0 - y := v_1 v.reset(OpAMD64VCMPPS256) v.AuxInt = uint8ToAuxInt(3) - v.AddArg2(x, y) + v.AddArg2(x, x) return true } } -func rewriteValueAMD64_OpIsNanFloat64x2(v *Value) bool { - v_1 := v.Args[1] +func rewriteValueAMD64_OpIsNaNFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (IsNanFloat64x2 x y) - // result: (VCMPPD128 [3] x y) + // match: (IsNaNFloat64x2 x) + // result: (VCMPPD128 [3] x x) for { x := v_0 - y := v_1 v.reset(OpAMD64VCMPPD128) v.AuxInt = uint8ToAuxInt(3) - v.AddArg2(x, y) + v.AddArg2(x, x) return true } } -func rewriteValueAMD64_OpIsNanFloat64x4(v *Value) bool { - v_1 := v.Args[1] +func rewriteValueAMD64_OpIsNaNFloat64x4(v *Value) bool { v_0 := v.Args[0] - // match: (IsNanFloat64x4 x y) - // result: (VCMPPD256 [3] x y) + // match: (IsNaNFloat64x4 x) + // result: (VCMPPD256 [3] x x) for { x := v_0 - y := v_1 v.reset(OpAMD64VCMPPD256) v.AuxInt = uint8ToAuxInt(3) - v.AddArg2(x, y) + v.AddArg2(x, x) return true } } -func rewriteValueAMD64_OpIsNanFloat64x8(v *Value) bool { - v_1 := v.Args[1] +func rewriteValueAMD64_OpIsNaNFloat64x8(v *Value) bool { v_0 := v.Args[0] b := v.Block typ := &b.Func.Config.Types - // match: (IsNanFloat64x8 x y) - // result: (VPMOVMToVec64x8 (VCMPPD512 [3] x y)) + // match: (IsNaNFloat64x8 x) + // result: (VPMOVMToVec64x8 (VCMPPD512 [3] x x)) for { x := v_0 - y := v_1 v.reset(OpAMD64VPMOVMToVec64x8) v0 := b.NewValue0(v.Pos, OpAMD64VCMPPD512, typ.Mask) v0.AuxInt = uint8ToAuxInt(3) - v0.AddArg2(x, y) + v0.AddArg2(x, x) v.AddArg(v0) return true } diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index 4425c5617b..e2eebd783d 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1667,6 
+1667,12 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { addF(simdPackage, "Uint16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) addF(simdPackage, "Uint32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) addF(simdPackage, "Uint64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Float32x4.IsNaN", opLen1(ssa.OpIsNaNFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.IsNaN", opLen1(ssa.OpIsNaNFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.IsNaN", opLen1(ssa.OpIsNaNFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.IsNaN", opLen1(ssa.OpIsNaNFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.IsNaN", opLen1(ssa.OpIsNaNFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.IsNaN", opLen1(ssa.OpIsNaNFloat64x8, types.TypeVec512), sys.AMD64) // sfp4 is intrinsic-if-constant, but otherwise it's complicated enough to just implement in Go. sfp4 := func(method string, hwop ssa.Op, vectype *types.Type) { diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 5a95761228..4ad0c6032c 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -571,12 +571,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint32x16.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x4.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.IsNan", opLen2(ssa.OpIsNanFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.IsNan", opLen2(ssa.OpIsNanFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.IsNan", opLen2(ssa.OpIsNanFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.IsNan", opLen2(ssa.OpIsNanFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.IsNan", opLen2(ssa.OpIsNanFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.IsNan", opLen2(ssa.OpIsNanFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x16.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x16, types.TypeVec512), sys.AMD64) |
