From e30fbe3757d09a22988835835c41233df7c6cd00 Mon Sep 17 00:00:00 2001 From: Junchen Li Date: Fri, 10 Jul 2020 11:39:23 +0800 Subject: cmd/compile: optimize unsigned comparisons to 0 There are some architecture-independent rules in #21439, since an unsigned integer >= 0 is always true and < 0 is always false. This CL adds these optimizations to generic rules. Updates #21439 Change-Id: Iec7e3040b761ecb1e60908f764815fdd9bc62495 Reviewed-on: https://go-review.googlesource.com/c/go/+/246617 Reviewed-by: Keith Randall Run-TryBot: Keith Randall TryBot-Result: Gobot Gobot --- test/codegen/comparisons.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'test/codegen') diff --git a/test/codegen/comparisons.go b/test/codegen/comparisons.go index 90808573c2..f3c15538a8 100644 --- a/test/codegen/comparisons.go +++ b/test/codegen/comparisons.go @@ -407,3 +407,20 @@ func CmpToZero_ex5(e, f int32, u uint32) int { } return 0 } +func UintLtZero(a uint8, b uint16, c uint32, d uint64) int { + // amd64: -`(TESTB|TESTW|TESTL|TESTQ|JCC|JCS)` + // arm64: -`(CMPW|CMP|BHS|BLO)` + if a < 0 || b < 0 || c < 0 || d < 0 { + return 1 + } + return 0 +} + +func UintGeqZero(a uint8, b uint16, c uint32, d uint64) int { + // amd64: -`(TESTB|TESTW|TESTL|TESTQ|JCS|JCC)` + // arm64: -`(CMPW|CMP|BLO|BHS)` + if a >= 0 || b >= 0 || c >= 0 || d >= 0 { + return 1 + } + return 0 +} -- cgit v1.3 From ef9c8a38ad177fa7f48dfaad5d0e27f39a03529d Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Tue, 7 Jul 2020 09:32:12 -0700 Subject: cmd/compile: don't rewrite (CMP (AND x y) 0) to TEST if AND has other uses If the AND has other uses, we end up saving an argument to the AND in another register, so we can use it for the TEST. No point in doing that. Change-Id: I73444a6aeddd6f55e2328ce04d77c3e6cf4a83e0 Reviewed-on: https://go-review.googlesource.com/c/go/+/241280 Run-TryBot: Keith Randall Reviewed-by: Josh Bleecher Snyder TryBot-Result: Gobot Gobot --- src/cmd/compile/internal/ssa/gen/AMD64.rules | 16 ++-- src/cmd/compile/internal/ssa/rewriteAMD64.go | 128 ++++++++++++++++++++------- test/codegen/logic.go | 24 +++++ 3 files changed, 128 insertions(+), 40 deletions(-) create mode 100644 test/codegen/logic.go (limited to 'test/codegen') diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules index 9967c7b030..5111ef79d3 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules @@ -1463,14 +1463,14 @@ (MULQconst [c] (NEGQ x)) && c != -(1<<31) -> (MULQconst [-c] x) // checking AND against 0. -(CMPQconst (ANDQ x y) [0]) -> (TESTQ x y) -(CMPLconst (ANDL x y) [0]) -> (TESTL x y) -(CMPWconst (ANDL x y) [0]) -> (TESTW x y) -(CMPBconst (ANDL x y) [0]) -> (TESTB x y) -(CMPQconst (ANDQconst [c] x) [0]) -> (TESTQconst [c] x) -(CMPLconst (ANDLconst [c] x) [0]) -> (TESTLconst [c] x) -(CMPWconst (ANDLconst [c] x) [0]) -> (TESTWconst [int64(int16(c))] x) -(CMPBconst (ANDLconst [c] x) [0]) -> (TESTBconst [int64(int8(c))] x) +(CMPQconst a:(ANDQ x y) [0]) && a.Uses == 1 -> (TESTQ x y) +(CMPLconst a:(ANDL x y) [0]) && a.Uses == 1 -> (TESTL x y) +(CMPWconst a:(ANDL x y) [0]) && a.Uses == 1 -> (TESTW x y) +(CMPBconst a:(ANDL x y) [0]) && a.Uses == 1 -> (TESTB x y) +(CMPQconst a:(ANDQconst [c] x) [0]) && a.Uses == 1 -> (TESTQconst [c] x) +(CMPLconst a:(ANDLconst [c] x) [0]) && a.Uses == 1 -> (TESTLconst [c] x) +(CMPWconst a:(ANDLconst [c] x) [0]) && a.Uses == 1 -> (TESTWconst [int64(int16(c))] x) +(CMPBconst a:(ANDLconst [c] x) [0]) && a.Uses == 1 -> (TESTBconst [int64(int8(c))] x) // Convert TESTx to TESTxconst if possible. (TESTQ (MOVQconst [c]) x) && is32Bit(c) -> (TESTQconst [c] x) diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 20eab05e9c..cda9df56f4 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -6924,26 +6924,42 @@ func rewriteValueAMD64_OpAMD64CMPBconst(v *Value) bool { v.reset(OpAMD64FlagLT_ULT) return true } - // match: (CMPBconst (ANDL x y) [0]) + // match: (CMPBconst a:(ANDL x y) [0]) + // cond: a.Uses == 1 // result: (TESTB x y) for { - if v.AuxInt != 0 || v_0.Op != OpAMD64ANDL { + if v.AuxInt != 0 { + break + } + a := v_0 + if a.Op != OpAMD64ANDL { + break + } + y := a.Args[1] + x := a.Args[0] + if !(a.Uses == 1) { break } - y := v_0.Args[1] - x := v_0.Args[0] v.reset(OpAMD64TESTB) v.AddArg2(x, y) return true } - // match: (CMPBconst (ANDLconst [c] x) [0]) + // match: (CMPBconst a:(ANDLconst [c] x) [0]) + // cond: a.Uses == 1 // result: (TESTBconst [int64(int8(c))] x) for { - if v.AuxInt != 0 || v_0.Op != OpAMD64ANDLconst { + if v.AuxInt != 0 { + break + } + a := v_0 + if a.Op != OpAMD64ANDLconst { + break + } + c := a.AuxInt + x := a.Args[0] + if !(a.Uses == 1) { break } - c := v_0.AuxInt - x := v_0.Args[0] v.reset(OpAMD64TESTBconst) v.AuxInt = int64(int8(c)) v.AddArg(x) @@ -7309,26 +7325,42 @@ func rewriteValueAMD64_OpAMD64CMPLconst(v *Value) bool { v.reset(OpAMD64FlagLT_ULT) return true } - // match: (CMPLconst (ANDL x y) [0]) + // match: (CMPLconst a:(ANDL x y) [0]) + // cond: a.Uses == 1 // result: (TESTL x y) for { - if v.AuxInt != 0 || v_0.Op != OpAMD64ANDL { + if v.AuxInt != 0 { + break + } + a := v_0 + if a.Op != OpAMD64ANDL { + break + } + y := a.Args[1] + x := a.Args[0] + if !(a.Uses == 1) { break } - y := v_0.Args[1] - x := v_0.Args[0] v.reset(OpAMD64TESTL) v.AddArg2(x, y) return true } - // match: (CMPLconst (ANDLconst [c] x) [0]) + // match: (CMPLconst a:(ANDLconst [c] x) [0]) + // cond: a.Uses == 1 // result: (TESTLconst [c] x) for { - if v.AuxInt != 0 || v_0.Op != OpAMD64ANDLconst { + if v.AuxInt != 0 { + break + } + a := v_0 + if a.Op != OpAMD64ANDLconst { + break + } + c := a.AuxInt + x := a.Args[0] + if !(a.Uses == 1) { break } - c := v_0.AuxInt - x := v_0.Args[0] v.reset(OpAMD64TESTLconst) v.AuxInt = c v.AddArg(x) @@ -7874,26 +7906,42 @@ func rewriteValueAMD64_OpAMD64CMPQconst(v *Value) bool { v.reset(OpAMD64FlagLT_ULT) return true } - // match: (CMPQconst (ANDQ x y) [0]) + // match: (CMPQconst a:(ANDQ x y) [0]) + // cond: a.Uses == 1 // result: (TESTQ x y) for { - if v.AuxInt != 0 || v_0.Op != OpAMD64ANDQ { + if v.AuxInt != 0 { + break + } + a := v_0 + if a.Op != OpAMD64ANDQ { + break + } + y := a.Args[1] + x := a.Args[0] + if !(a.Uses == 1) { break } - y := v_0.Args[1] - x := v_0.Args[0] v.reset(OpAMD64TESTQ) v.AddArg2(x, y) return true } - // match: (CMPQconst (ANDQconst [c] x) [0]) + // match: (CMPQconst a:(ANDQconst [c] x) [0]) + // cond: a.Uses == 1 // result: (TESTQconst [c] x) for { - if v.AuxInt != 0 || v_0.Op != OpAMD64ANDQconst { + if v.AuxInt != 0 { + break + } + a := v_0 + if a.Op != OpAMD64ANDQconst { + break + } + c := a.AuxInt + x := a.Args[0] + if !(a.Uses == 1) { break } - c := v_0.AuxInt - x := v_0.Args[0] v.reset(OpAMD64TESTQconst) v.AuxInt = c v.AddArg(x) @@ -8244,26 +8292,42 @@ func rewriteValueAMD64_OpAMD64CMPWconst(v *Value) bool { v.reset(OpAMD64FlagLT_ULT) return true } - // match: (CMPWconst (ANDL x y) [0]) + // match: (CMPWconst a:(ANDL x y) [0]) + // cond: a.Uses == 1 // result: (TESTW x y) for { - if v.AuxInt != 0 || v_0.Op != OpAMD64ANDL { + if v.AuxInt != 0 { + break + } + a := v_0 + if a.Op != OpAMD64ANDL { + break + } + y := a.Args[1] + x := a.Args[0] + if !(a.Uses == 1) { break } - y := v_0.Args[1] - x := v_0.Args[0] v.reset(OpAMD64TESTW) v.AddArg2(x, y) return true } - // match: (CMPWconst (ANDLconst [c] x) [0]) + // match: (CMPWconst a:(ANDLconst [c] x) [0]) + // cond: a.Uses == 1 // result: (TESTWconst [int64(int16(c))] x) for { - if v.AuxInt != 0 || v_0.Op != OpAMD64ANDLconst { + if v.AuxInt != 0 { + break + } + a := v_0 + if a.Op != OpAMD64ANDLconst { + break + } + c := a.AuxInt + x := a.Args[0] + if !(a.Uses == 1) { break } - c := v_0.AuxInt - x := v_0.Args[0] v.reset(OpAMD64TESTWconst) v.AuxInt = int64(int16(c)) v.AddArg(x) diff --git a/test/codegen/logic.go b/test/codegen/logic.go new file mode 100644 index 0000000000..9afdfd760f --- /dev/null +++ b/test/codegen/logic.go @@ -0,0 +1,24 @@ +// asmcheck + +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package codegen + +var gx, gy int + +// Test to make sure that (CMPQ (ANDQ x y) [0]) does not get rewritten to +// (TESTQ x y) if the ANDQ has other uses. If that rewrite happens, then one +// of the args of the ANDQ needs to be saved so it can be used as the arg to TESTQ. +func andWithUse(x, y int) int { + // Load x,y into registers, so those MOVQ will not appear at the z := x&y line. + gx, gy = x, y + // amd64:-"MOVQ" + z := x & y + if z == 0 { + return 77 + } + // use z by returning it + return z +} -- cgit v1.3 From 06337823ef4d9c57b1b01f9b97a348a47277a9df Mon Sep 17 00:00:00 2001 From: Junchen Li Date: Fri, 10 Jul 2020 11:39:23 +0800 Subject: cmd/compile: optimize unsigned comparisons to 0/1 on arm64 For an unsigned integer, it's useful to convert its order test with 0/1 to its equality test with 0. We can save a comparison instruction that followed by a conditional branch on arm64 since it supports compare-with-zero-and-branch instructions. For example, if x > 0 { ... } else { ... } the original version: CMP $0, R0 BLS 9 the optimized version: CBZ R0, 8 Updates #21439 Change-Id: Id1de6f865f6aa72c5d45b29f7894818857288425 Reviewed-on: https://go-review.googlesource.com/c/go/+/246857 Reviewed-by: Keith Randall --- src/cmd/compile/internal/ssa/gen/ARM64.rules | 10 ++ src/cmd/compile/internal/ssa/rewriteARM64.go | 204 +++++++++++++++++++++++++++ test/codegen/comparisons.go | 32 +++++ 3 files changed, 246 insertions(+) (limited to 'test/codegen') diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules index 442d769fdd..27959d01fc 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules @@ -279,6 +279,16 @@ (Less32F x y) => (LessThanF (FCMPS x y)) (Less64F x y) => (LessThanF (FCMPD x y)) +// For an unsigned integer x, the following rules are useful when combining branch +// 0 < x => x != 0 +// x <= 0 => x == 0 +// x < 1 => x == 0 +// 1 <= x => x != 0 +(Less(8U|16U|32U|64U) zero:(MOVDconst [0]) x) => (Neq(8|16|32|64) zero x) +(Leq(8U|16U|32U|64U) x zero:(MOVDconst [0])) => (Eq(8|16|32|64) x zero) +(Less(8U|16U|32U|64U) x (MOVDconst [1])) => (Eq(8|16|32|64) x (MOVDconst [0])) +(Leq(8U|16U|32U|64U) (MOVDconst [1]) x) => (Neq(8|16|32|64) (MOVDconst [0]) x) + (Less8U x y) => (LessThanU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) (Less16U x y) => (LessThanU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) (Less32U x y) => (LessThanU (CMPW x y)) diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go index 8e48b33628..023d9908c2 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM64.go +++ b/src/cmd/compile/internal/ssa/rewriteARM64.go @@ -21976,6 +21976,31 @@ func rewriteValueARM64_OpLeq16U(v *Value) bool { v_0 := v.Args[0] b := v.Block typ := &b.Func.Config.Types + // match: (Leq16U x zero:(MOVDconst [0])) + // result: (Eq16 x zero) + for { + x := v_0 + zero := v_1 + if zero.Op != OpARM64MOVDconst || auxIntToInt64(zero.AuxInt) != 0 { + break + } + v.reset(OpEq16) + v.AddArg2(x, zero) + return true + } + // match: (Leq16U (MOVDconst [1]) x) + // result: (Neq16 (MOVDconst [0]) x) + for { + if v_0.Op != OpARM64MOVDconst || auxIntToInt64(v_0.AuxInt) != 1 { + break + } + x := v_1 + v.reset(OpNeq16) + v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) + v0.AuxInt = int64ToAuxInt(0) + v.AddArg2(v0, x) + return true + } // match: (Leq16U x y) // result: (LessEqualU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) for { @@ -22028,6 +22053,32 @@ func rewriteValueARM64_OpLeq32U(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block + typ := &b.Func.Config.Types + // match: (Leq32U x zero:(MOVDconst [0])) + // result: (Eq32 x zero) + for { + x := v_0 + zero := v_1 + if zero.Op != OpARM64MOVDconst || auxIntToInt64(zero.AuxInt) != 0 { + break + } + v.reset(OpEq32) + v.AddArg2(x, zero) + return true + } + // match: (Leq32U (MOVDconst [1]) x) + // result: (Neq32 (MOVDconst [0]) x) + for { + if v_0.Op != OpARM64MOVDconst || auxIntToInt64(v_0.AuxInt) != 1 { + break + } + x := v_1 + v.reset(OpNeq32) + v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) + v0.AuxInt = int64ToAuxInt(0) + v.AddArg2(v0, x) + return true + } // match: (Leq32U x y) // result: (LessEqualU (CMPW x y)) for { @@ -22076,6 +22127,32 @@ func rewriteValueARM64_OpLeq64U(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block + typ := &b.Func.Config.Types + // match: (Leq64U x zero:(MOVDconst [0])) + // result: (Eq64 x zero) + for { + x := v_0 + zero := v_1 + if zero.Op != OpARM64MOVDconst || auxIntToInt64(zero.AuxInt) != 0 { + break + } + v.reset(OpEq64) + v.AddArg2(x, zero) + return true + } + // match: (Leq64U (MOVDconst [1]) x) + // result: (Neq64 (MOVDconst [0]) x) + for { + if v_0.Op != OpARM64MOVDconst || auxIntToInt64(v_0.AuxInt) != 1 { + break + } + x := v_1 + v.reset(OpNeq64) + v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) + v0.AuxInt = int64ToAuxInt(0) + v.AddArg2(v0, x) + return true + } // match: (Leq64U x y) // result: (LessEqualU (CMP x y)) for { @@ -22114,6 +22191,31 @@ func rewriteValueARM64_OpLeq8U(v *Value) bool { v_0 := v.Args[0] b := v.Block typ := &b.Func.Config.Types + // match: (Leq8U x zero:(MOVDconst [0])) + // result: (Eq8 x zero) + for { + x := v_0 + zero := v_1 + if zero.Op != OpARM64MOVDconst || auxIntToInt64(zero.AuxInt) != 0 { + break + } + v.reset(OpEq8) + v.AddArg2(x, zero) + return true + } + // match: (Leq8U (MOVDconst [1]) x) + // result: (Neq8 (MOVDconst [0]) x) + for { + if v_0.Op != OpARM64MOVDconst || auxIntToInt64(v_0.AuxInt) != 1 { + break + } + x := v_1 + v.reset(OpNeq8) + v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) + v0.AuxInt = int64ToAuxInt(0) + v.AddArg2(v0, x) + return true + } // match: (Leq8U x y) // result: (LessEqualU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) for { @@ -22156,6 +22258,31 @@ func rewriteValueARM64_OpLess16U(v *Value) bool { v_0 := v.Args[0] b := v.Block typ := &b.Func.Config.Types + // match: (Less16U zero:(MOVDconst [0]) x) + // result: (Neq16 zero x) + for { + zero := v_0 + if zero.Op != OpARM64MOVDconst || auxIntToInt64(zero.AuxInt) != 0 { + break + } + x := v_1 + v.reset(OpNeq16) + v.AddArg2(zero, x) + return true + } + // match: (Less16U x (MOVDconst [1])) + // result: (Eq16 x (MOVDconst [0])) + for { + x := v_0 + if v_1.Op != OpARM64MOVDconst || auxIntToInt64(v_1.AuxInt) != 1 { + break + } + v.reset(OpEq16) + v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) + v0.AuxInt = int64ToAuxInt(0) + v.AddArg2(x, v0) + return true + } // match: (Less16U x y) // result: (LessThanU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) for { @@ -22208,6 +22335,32 @@ func rewriteValueARM64_OpLess32U(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block + typ := &b.Func.Config.Types + // match: (Less32U zero:(MOVDconst [0]) x) + // result: (Neq32 zero x) + for { + zero := v_0 + if zero.Op != OpARM64MOVDconst || auxIntToInt64(zero.AuxInt) != 0 { + break + } + x := v_1 + v.reset(OpNeq32) + v.AddArg2(zero, x) + return true + } + // match: (Less32U x (MOVDconst [1])) + // result: (Eq32 x (MOVDconst [0])) + for { + x := v_0 + if v_1.Op != OpARM64MOVDconst || auxIntToInt64(v_1.AuxInt) != 1 { + break + } + v.reset(OpEq32) + v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) + v0.AuxInt = int64ToAuxInt(0) + v.AddArg2(x, v0) + return true + } // match: (Less32U x y) // result: (LessThanU (CMPW x y)) for { @@ -22256,6 +22409,32 @@ func rewriteValueARM64_OpLess64U(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block + typ := &b.Func.Config.Types + // match: (Less64U zero:(MOVDconst [0]) x) + // result: (Neq64 zero x) + for { + zero := v_0 + if zero.Op != OpARM64MOVDconst || auxIntToInt64(zero.AuxInt) != 0 { + break + } + x := v_1 + v.reset(OpNeq64) + v.AddArg2(zero, x) + return true + } + // match: (Less64U x (MOVDconst [1])) + // result: (Eq64 x (MOVDconst [0])) + for { + x := v_0 + if v_1.Op != OpARM64MOVDconst || auxIntToInt64(v_1.AuxInt) != 1 { + break + } + v.reset(OpEq64) + v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) + v0.AuxInt = int64ToAuxInt(0) + v.AddArg2(x, v0) + return true + } // match: (Less64U x y) // result: (LessThanU (CMP x y)) for { @@ -22294,6 +22473,31 @@ func rewriteValueARM64_OpLess8U(v *Value) bool { v_0 := v.Args[0] b := v.Block typ := &b.Func.Config.Types + // match: (Less8U zero:(MOVDconst [0]) x) + // result: (Neq8 zero x) + for { + zero := v_0 + if zero.Op != OpARM64MOVDconst || auxIntToInt64(zero.AuxInt) != 0 { + break + } + x := v_1 + v.reset(OpNeq8) + v.AddArg2(zero, x) + return true + } + // match: (Less8U x (MOVDconst [1])) + // result: (Eq8 x (MOVDconst [0])) + for { + x := v_0 + if v_1.Op != OpARM64MOVDconst || auxIntToInt64(v_1.AuxInt) != 1 { + break + } + v.reset(OpEq8) + v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) + v0.AuxInt = int64ToAuxInt(0) + v.AddArg2(x, v0) + return true + } // match: (Less8U x y) // result: (LessThanU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) for { diff --git a/test/codegen/comparisons.go b/test/codegen/comparisons.go index f3c15538a8..3c2dcb7eba 100644 --- a/test/codegen/comparisons.go +++ b/test/codegen/comparisons.go @@ -424,3 +424,35 @@ func UintGeqZero(a uint8, b uint16, c uint32, d uint64) int { } return 0 } + +func UintGtZero(a uint8, b uint16, c uint32, d uint64) int { + // arm64: `CBZW`, `CBNZW`, `CBNZ`, -`(CMPW|CMP|BLS|BHI)` + if a > 0 || b > 0 || c > 0 || d > 0 { + return 1 + } + return 0 +} + +func UintLeqZero(a uint8, b uint16, c uint32, d uint64) int { + // arm64: `CBNZW`, `CBZW`, `CBZ`, -`(CMPW|CMP|BHI|BLS)` + if a <= 0 || b <= 0 || c <= 0 || d <= 0 { + return 1 + } + return 0 +} + +func UintLtOne(a uint8, b uint16, c uint32, d uint64) int { + // arm64: `CBNZW`, `CBZW`, `CBZW`, `CBZ`, -`(CMPW|CMP|BHS|BLO)` + if a < 1 || b < 1 || c < 1 || d < 1 { + return 1 + } + return 0 +} + +func UintGeqOne(a uint8, b uint16, c uint32, d uint64) int { + // arm64: `CBZW`, `CBNZW`, `CBNZ`, -`(CMPW|CMP|BLO|BHS)` + if a >= 1 || b >= 1 || c >= 1 || d >= 1 { + return 1 + } + return 0 +} -- cgit v1.3 From 6e876f19857a8fbd259571080f7f91bc03276559 Mon Sep 17 00:00:00 2001 From: Michael Munday Date: Thu, 4 Jun 2020 10:55:01 -0700 Subject: cmd/compile: clean up and optimize s390x multiplication rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the existing optimizations aren't triggered because they are handled by the generic rules so this CL removes them. Also some constraints were copied without much thought from the amd64 rules and they don't make sense on s390x, so we remove those constraints. Finally, add a 'multiply by the sum of two powers of two' optimization. This makes sense on s390x as shifts are low latency and can also sometimes be optimized further (especially if we add support for RISBG instructions). name old time/op new time/op delta IntMulByConst/3-8 1.70ns ±11% 1.10ns ± 5% -35.26% (p=0.000 n=10+10) IntMulByConst/5-8 1.64ns ± 7% 1.10ns ± 4% -32.94% (p=0.000 n=10+9) IntMulByConst/12-8 1.65ns ± 6% 1.20ns ± 4% -27.16% (p=0.000 n=10+9) IntMulByConst/120-8 1.66ns ± 4% 1.22ns ±13% -26.43% (p=0.000 n=10+10) IntMulByConst/-120-8 1.65ns ± 7% 1.19ns ± 4% -28.06% (p=0.000 n=9+10) IntMulByConst/65537-8 0.86ns ± 9% 1.12ns ±12% +30.41% (p=0.000 n=10+10) IntMulByConst/65538-8 1.65ns ± 5% 1.23ns ± 5% -25.11% (p=0.000 n=10+10) Change-Id: Ib196e6bff1e97febfd266134d0a2b2a62897989f Reviewed-on: https://go-review.googlesource.com/c/go/+/248937 Run-TryBot: Michael Munday TryBot-Result: Gobot Gobot Reviewed-by: Keith Randall --- src/cmd/compile/internal/ssa/gen/S390X.rules | 51 +++-- src/cmd/compile/internal/ssa/rewriteS390X.go | 268 ++++++++++++++----------- src/cmd/compile/internal/test/mulconst_test.go | 242 ++++++++++++++++++++++ test/codegen/arithmetic.go | 6 + 4 files changed, 441 insertions(+), 126 deletions(-) create mode 100644 src/cmd/compile/internal/test/mulconst_test.go (limited to 'test/codegen') diff --git a/src/cmd/compile/internal/ssa/gen/S390X.rules b/src/cmd/compile/internal/ssa/gen/S390X.rules index d3234c1a00..5e4c436ca1 100644 --- a/src/cmd/compile/internal/ssa/gen/S390X.rules +++ b/src/cmd/compile/internal/ssa/gen/S390X.rules @@ -716,20 +716,40 @@ (ANDWconst [0xFF] x) => (MOVBZreg x) (ANDWconst [0xFFFF] x) => (MOVHZreg x) -// strength reduction -(MULLDconst [-1] x) => (NEG x) -(MULLDconst [0] _) => (MOVDconst [0]) -(MULLDconst [1] x) => x -(MULLDconst [c] x) && isPowerOfTwo(c) -> (SLDconst [log2(c)] x) -(MULLDconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUB (SLDconst [log2(c+1)] x) x) -(MULLDconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (ADD (SLDconst [log2(c-1)] x) x) - -(MULLWconst [-1] x) => (NEGW x) -(MULLWconst [0] _) => (MOVDconst [0]) -(MULLWconst [1] x) => x -(MULLWconst [c] x) && isPowerOfTwo(c) -> (SLWconst [log2(c)] x) -(MULLWconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUBW (SLWconst [log2(c+1)] x) x) -(MULLWconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (ADDW (SLWconst [log2(c-1)] x) x) +// Strength reduce multiplication to the sum (or difference) of two powers of two. +// +// Examples: +// 5x -> 4x + 1x +// 10x -> 8x + 2x +// 120x -> 128x - 8x +// -120x -> 8x - 128x +// +// We know that the rightmost bit of any positive value, once isolated, must either +// be a power of 2 (because it is a single bit) or 0 (if the original value is 0). +// In all of these rules we use a rightmost bit calculation to determine one operand +// for the addition or subtraction. We then just need to calculate if the other +// operand is a valid power of 2 before we can match the rule. +// +// Notes: +// - the generic rules have already matched single powers of two so we ignore them here +// - isPowerOfTwo32 asserts that its argument is greater than 0 +// - c&(c-1) = clear rightmost bit +// - c&^(c-1) = isolate rightmost bit + +// c = 2ˣ + 2ʸ => c - 2ˣ = 2ʸ +(MULL(D|W)const x [c]) && isPowerOfTwo32(c&(c-1)) + => ((ADD|ADDW) (SL(D|W)const x [int8(log32(c&(c-1)))]) + (SL(D|W)const x [int8(log32(c&^(c-1)))])) + +// c = 2ʸ - 2ˣ => c + 2ˣ = 2ʸ +(MULL(D|W)const x [c]) && isPowerOfTwo32(c+(c&^(c-1))) + => ((SUB|SUBW) (SL(D|W)const x [int8(log32(c+(c&^(c-1))))]) + (SL(D|W)const x [int8(log32(c&^(c-1)))])) + +// c = 2ˣ - 2ʸ => -c + 2ˣ = 2ʸ +(MULL(D|W)const x [c]) && isPowerOfTwo32(-c+(-c&^(-c-1))) + => ((SUB|SUBW) (SL(D|W)const x [int8(log32(-c&^(-c-1)))]) + (SL(D|W)const x [int8(log32(-c+(-c&^(-c-1))))])) // Fold ADD into MOVDaddr. Odd offsets from SB shouldn't be folded (LARL can't handle them). (ADDconst [c] (MOVDaddr [d] {s} x:(SB))) && ((c+d)&1 == 0) && is32Bit(c+d) -> (MOVDaddr [c+d] {s} x) @@ -1133,6 +1153,9 @@ (XORconst [0] x) => x (XORWconst [c] x) && int32(c)==0 => x +// Shifts by zero (may be inserted during multiplication strength reduction). +((SLD|SLW|SRD|SRW|SRAD|SRAW)const x [0]) => x + // Convert constant subtracts to constant adds. (SUBconst [c] x) && c != -(1<<31) => (ADDconst [-c] x) (SUBWconst [c] x) -> (ADDWconst [int64(int32(-c))] x) diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go index dc9b143562..536f8db320 100644 --- a/src/cmd/compile/internal/ssa/rewriteS390X.go +++ b/src/cmd/compile/internal/ssa/rewriteS390X.go @@ -732,8 +732,12 @@ func rewriteValueS390X(v *Value) bool { return rewriteValueS390X_OpS390XRLLG(v) case OpS390XSLD: return rewriteValueS390X_OpS390XSLD(v) + case OpS390XSLDconst: + return rewriteValueS390X_OpS390XSLDconst(v) case OpS390XSLW: return rewriteValueS390X_OpS390XSLW(v) + case OpS390XSLWconst: + return rewriteValueS390X_OpS390XSLWconst(v) case OpS390XSRAD: return rewriteValueS390X_OpS390XSRAD(v) case OpS390XSRADconst: @@ -748,6 +752,8 @@ func rewriteValueS390X(v *Value) bool { return rewriteValueS390X_OpS390XSRDconst(v) case OpS390XSRW: return rewriteValueS390X_OpS390XSRW(v) + case OpS390XSRWconst: + return rewriteValueS390X_OpS390XSRWconst(v) case OpS390XSTM2: return rewriteValueS390X_OpS390XSTM2(v) case OpS390XSTMG2: @@ -13853,81 +13859,64 @@ func rewriteValueS390X_OpS390XMULLD(v *Value) bool { func rewriteValueS390X_OpS390XMULLDconst(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (MULLDconst [-1] x) - // result: (NEG x) + // match: (MULLDconst x [c]) + // cond: isPowerOfTwo32(c&(c-1)) + // result: (ADD (SLDconst x [int8(log32(c&(c-1)))]) (SLDconst x [int8(log32(c&^(c-1)))])) for { - if auxIntToInt32(v.AuxInt) != -1 { - break - } - x := v_0 - v.reset(OpS390XNEG) - v.AddArg(x) - return true - } - // match: (MULLDconst [0] _) - // result: (MOVDconst [0]) - for { - if auxIntToInt32(v.AuxInt) != 0 { - break - } - v.reset(OpS390XMOVDconst) - v.AuxInt = int64ToAuxInt(0) - return true - } - // match: (MULLDconst [1] x) - // result: x - for { - if auxIntToInt32(v.AuxInt) != 1 { - break - } - x := v_0 - v.copyOf(x) - return true - } - // match: (MULLDconst [c] x) - // cond: isPowerOfTwo(c) - // result: (SLDconst [log2(c)] x) - for { - c := v.AuxInt + t := v.Type + c := auxIntToInt32(v.AuxInt) x := v_0 - if !(isPowerOfTwo(c)) { + if !(isPowerOfTwo32(c & (c - 1))) { break } - v.reset(OpS390XSLDconst) - v.AuxInt = log2(c) - v.AddArg(x) + v.reset(OpS390XADD) + v0 := b.NewValue0(v.Pos, OpS390XSLDconst, t) + v0.AuxInt = int8ToAuxInt(int8(log32(c & (c - 1)))) + v0.AddArg(x) + v1 := b.NewValue0(v.Pos, OpS390XSLDconst, t) + v1.AuxInt = int8ToAuxInt(int8(log32(c &^ (c - 1)))) + v1.AddArg(x) + v.AddArg2(v0, v1) return true } - // match: (MULLDconst [c] x) - // cond: isPowerOfTwo(c+1) && c >= 15 - // result: (SUB (SLDconst [log2(c+1)] x) x) + // match: (MULLDconst x [c]) + // cond: isPowerOfTwo32(c+(c&^(c-1))) + // result: (SUB (SLDconst x [int8(log32(c+(c&^(c-1))))]) (SLDconst x [int8(log32(c&^(c-1)))])) for { - c := v.AuxInt + t := v.Type + c := auxIntToInt32(v.AuxInt) x := v_0 - if !(isPowerOfTwo(c+1) && c >= 15) { + if !(isPowerOfTwo32(c + (c &^ (c - 1)))) { break } v.reset(OpS390XSUB) - v0 := b.NewValue0(v.Pos, OpS390XSLDconst, v.Type) - v0.AuxInt = log2(c + 1) + v0 := b.NewValue0(v.Pos, OpS390XSLDconst, t) + v0.AuxInt = int8ToAuxInt(int8(log32(c + (c &^ (c - 1))))) v0.AddArg(x) - v.AddArg2(v0, x) + v1 := b.NewValue0(v.Pos, OpS390XSLDconst, t) + v1.AuxInt = int8ToAuxInt(int8(log32(c &^ (c - 1)))) + v1.AddArg(x) + v.AddArg2(v0, v1) return true } - // match: (MULLDconst [c] x) - // cond: isPowerOfTwo(c-1) && c >= 17 - // result: (ADD (SLDconst [log2(c-1)] x) x) + // match: (MULLDconst x [c]) + // cond: isPowerOfTwo32(-c+(-c&^(-c-1))) + // result: (SUB (SLDconst x [int8(log32(-c&^(-c-1)))]) (SLDconst x [int8(log32(-c+(-c&^(-c-1))))])) for { - c := v.AuxInt + t := v.Type + c := auxIntToInt32(v.AuxInt) x := v_0 - if !(isPowerOfTwo(c-1) && c >= 17) { + if !(isPowerOfTwo32(-c + (-c &^ (-c - 1)))) { break } - v.reset(OpS390XADD) - v0 := b.NewValue0(v.Pos, OpS390XSLDconst, v.Type) - v0.AuxInt = log2(c - 1) + v.reset(OpS390XSUB) + v0 := b.NewValue0(v.Pos, OpS390XSLDconst, t) + v0.AuxInt = int8ToAuxInt(int8(log32(-c &^ (-c - 1)))) v0.AddArg(x) - v.AddArg2(v0, x) + v1 := b.NewValue0(v.Pos, OpS390XSLDconst, t) + v1.AuxInt = int8ToAuxInt(int8(log32(-c + (-c &^ (-c - 1))))) + v1.AddArg(x) + v.AddArg2(v0, v1) return true } // match: (MULLDconst [c] (MOVDconst [d])) @@ -14097,81 +14086,64 @@ func rewriteValueS390X_OpS390XMULLW(v *Value) bool { func rewriteValueS390X_OpS390XMULLWconst(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (MULLWconst [-1] x) - // result: (NEGW x) - for { - if auxIntToInt32(v.AuxInt) != -1 { - break - } - x := v_0 - v.reset(OpS390XNEGW) - v.AddArg(x) - return true - } - // match: (MULLWconst [0] _) - // result: (MOVDconst [0]) - for { - if auxIntToInt32(v.AuxInt) != 0 { - break - } - v.reset(OpS390XMOVDconst) - v.AuxInt = int64ToAuxInt(0) - return true - } - // match: (MULLWconst [1] x) - // result: x + // match: (MULLWconst x [c]) + // cond: isPowerOfTwo32(c&(c-1)) + // result: (ADDW (SLWconst x [int8(log32(c&(c-1)))]) (SLWconst x [int8(log32(c&^(c-1)))])) for { - if auxIntToInt32(v.AuxInt) != 1 { - break - } - x := v_0 - v.copyOf(x) - return true - } - // match: (MULLWconst [c] x) - // cond: isPowerOfTwo(c) - // result: (SLWconst [log2(c)] x) - for { - c := v.AuxInt + t := v.Type + c := auxIntToInt32(v.AuxInt) x := v_0 - if !(isPowerOfTwo(c)) { + if !(isPowerOfTwo32(c & (c - 1))) { break } - v.reset(OpS390XSLWconst) - v.AuxInt = log2(c) - v.AddArg(x) + v.reset(OpS390XADDW) + v0 := b.NewValue0(v.Pos, OpS390XSLWconst, t) + v0.AuxInt = int8ToAuxInt(int8(log32(c & (c - 1)))) + v0.AddArg(x) + v1 := b.NewValue0(v.Pos, OpS390XSLWconst, t) + v1.AuxInt = int8ToAuxInt(int8(log32(c &^ (c - 1)))) + v1.AddArg(x) + v.AddArg2(v0, v1) return true } - // match: (MULLWconst [c] x) - // cond: isPowerOfTwo(c+1) && c >= 15 - // result: (SUBW (SLWconst [log2(c+1)] x) x) + // match: (MULLWconst x [c]) + // cond: isPowerOfTwo32(c+(c&^(c-1))) + // result: (SUBW (SLWconst x [int8(log32(c+(c&^(c-1))))]) (SLWconst x [int8(log32(c&^(c-1)))])) for { - c := v.AuxInt + t := v.Type + c := auxIntToInt32(v.AuxInt) x := v_0 - if !(isPowerOfTwo(c+1) && c >= 15) { + if !(isPowerOfTwo32(c + (c &^ (c - 1)))) { break } v.reset(OpS390XSUBW) - v0 := b.NewValue0(v.Pos, OpS390XSLWconst, v.Type) - v0.AuxInt = log2(c + 1) + v0 := b.NewValue0(v.Pos, OpS390XSLWconst, t) + v0.AuxInt = int8ToAuxInt(int8(log32(c + (c &^ (c - 1))))) v0.AddArg(x) - v.AddArg2(v0, x) + v1 := b.NewValue0(v.Pos, OpS390XSLWconst, t) + v1.AuxInt = int8ToAuxInt(int8(log32(c &^ (c - 1)))) + v1.AddArg(x) + v.AddArg2(v0, v1) return true } - // match: (MULLWconst [c] x) - // cond: isPowerOfTwo(c-1) && c >= 17 - // result: (ADDW (SLWconst [log2(c-1)] x) x) + // match: (MULLWconst x [c]) + // cond: isPowerOfTwo32(-c+(-c&^(-c-1))) + // result: (SUBW (SLWconst x [int8(log32(-c&^(-c-1)))]) (SLWconst x [int8(log32(-c+(-c&^(-c-1))))])) for { - c := v.AuxInt + t := v.Type + c := auxIntToInt32(v.AuxInt) x := v_0 - if !(isPowerOfTwo(c-1) && c >= 17) { + if !(isPowerOfTwo32(-c + (-c &^ (-c - 1)))) { break } - v.reset(OpS390XADDW) - v0 := b.NewValue0(v.Pos, OpS390XSLWconst, v.Type) - v0.AuxInt = log2(c - 1) + v.reset(OpS390XSUBW) + v0 := b.NewValue0(v.Pos, OpS390XSLWconst, t) + v0.AuxInt = int8ToAuxInt(int8(log32(-c &^ (-c - 1)))) v0.AddArg(x) - v.AddArg2(v0, x) + v1 := b.NewValue0(v.Pos, OpS390XSLWconst, t) + v1.AuxInt = int8ToAuxInt(int8(log32(-c + (-c &^ (-c - 1))))) + v1.AddArg(x) + v.AddArg2(v0, v1) return true } // match: (MULLWconst [c] (MOVDconst [d])) @@ -16826,6 +16798,20 @@ func rewriteValueS390X_OpS390XSLD(v *Value) bool { } return false } +func rewriteValueS390X_OpS390XSLDconst(v *Value) bool { + v_0 := v.Args[0] + // match: (SLDconst x [0]) + // result: x + for { + if auxIntToInt8(v.AuxInt) != 0 { + break + } + x := v_0 + v.copyOf(x) + return true + } + return false +} func rewriteValueS390X_OpS390XSLW(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -16960,6 +16946,20 @@ func rewriteValueS390X_OpS390XSLW(v *Value) bool { } return false } +func rewriteValueS390X_OpS390XSLWconst(v *Value) bool { + v_0 := v.Args[0] + // match: (SLWconst x [0]) + // result: x + for { + if auxIntToInt8(v.AuxInt) != 0 { + break + } + x := v_0 + v.copyOf(x) + return true + } + return false +} func rewriteValueS390X_OpS390XSRAD(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -17096,6 +17096,16 @@ func rewriteValueS390X_OpS390XSRAD(v *Value) bool { } func rewriteValueS390X_OpS390XSRADconst(v *Value) bool { v_0 := v.Args[0] + // match: (SRADconst x [0]) + // result: x + for { + if auxIntToInt8(v.AuxInt) != 0 { + break + } + x := v_0 + v.copyOf(x) + return true + } // match: (SRADconst [c] (MOVDconst [d])) // result: (MOVDconst [d>>uint64(c)]) for { @@ -17246,6 +17256,16 @@ func rewriteValueS390X_OpS390XSRAW(v *Value) bool { } func rewriteValueS390X_OpS390XSRAWconst(v *Value) bool { v_0 := v.Args[0] + // match: (SRAWconst x [0]) + // result: x + for { + if auxIntToInt8(v.AuxInt) != 0 { + break + } + x := v_0 + v.copyOf(x) + return true + } // match: (SRAWconst [c] (MOVDconst [d])) // result: (MOVDconst [int64(int32(d))>>uint64(c)]) for { @@ -17416,6 +17436,16 @@ func rewriteValueS390X_OpS390XSRDconst(v *Value) bool { v.AddArg(v0) return true } + // match: (SRDconst x [0]) + // result: x + for { + if auxIntToInt8(v.AuxInt) != 0 { + break + } + x := v_0 + v.copyOf(x) + return true + } return false } func rewriteValueS390X_OpS390XSRW(v *Value) bool { @@ -17552,6 +17582,20 @@ func rewriteValueS390X_OpS390XSRW(v *Value) bool { } return false } +func rewriteValueS390X_OpS390XSRWconst(v *Value) bool { + v_0 := v.Args[0] + // match: (SRWconst x [0]) + // result: x + for { + if auxIntToInt8(v.AuxInt) != 0 { + break + } + x := v_0 + v.copyOf(x) + return true + } + return false +} func rewriteValueS390X_OpS390XSTM2(v *Value) bool { v_3 := v.Args[3] v_2 := v.Args[2] diff --git a/src/cmd/compile/internal/test/mulconst_test.go b/src/cmd/compile/internal/test/mulconst_test.go new file mode 100644 index 0000000000..314cab32de --- /dev/null +++ b/src/cmd/compile/internal/test/mulconst_test.go @@ -0,0 +1,242 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package test + +import "testing" + +// Benchmark multiplication of an integer by various constants. +// +// The comment above each sub-benchmark provides an example of how the +// target multiplication operation might be implemented using shift +// (multiplication by a power of 2), addition and subtraction +// operations. It is platform-dependent whether these transformations +// are actually applied. + +var ( + mulSinkI32 int32 + mulSinkI64 int64 + mulSinkU32 uint32 + mulSinkU64 uint64 +) + +func BenchmarkMulconstI32(b *testing.B) { + // 3x = 2x + x + b.Run("3", func(b *testing.B) { + x := int32(1) + for i := 0; i < b.N; i++ { + x *= 3 + } + mulSinkI32 = x + }) + // 5x = 4x + x + b.Run("5", func(b *testing.B) { + x := int32(1) + for i := 0; i < b.N; i++ { + x *= 5 + } + mulSinkI32 = x + }) + // 12x = 8x + 4x + b.Run("12", func(b *testing.B) { + x := int32(1) + for i := 0; i < b.N; i++ { + x *= 12 + } + mulSinkI32 = x + }) + // 120x = 128x - 8x + b.Run("120", func(b *testing.B) { + x := int32(1) + for i := 0; i < b.N; i++ { + x *= 120 + } + mulSinkI32 = x + }) + // -120x = 8x - 120x + b.Run("-120", func(b *testing.B) { + x := int32(1) + for i := 0; i < b.N; i++ { + x *= -120 + } + mulSinkI32 = x + }) + // 65537x = 65536x + x + b.Run("65537", func(b *testing.B) { + x := int32(1) + for i := 0; i < b.N; i++ { + x *= 65537 + } + mulSinkI32 = x + }) + // 65538x = 65536x + 2x + b.Run("65538", func(b *testing.B) { + x := int32(1) + for i := 0; i < b.N; i++ { + x *= 65538 + } + mulSinkI32 = x + }) +} + +func BenchmarkMulconstI64(b *testing.B) { + // 3x = 2x + x + b.Run("3", func(b *testing.B) { + x := int64(1) + for i := 0; i < b.N; i++ { + x *= 3 + } + mulSinkI64 = x + }) + // 5x = 4x + x + b.Run("5", func(b *testing.B) { + x := int64(1) + for i := 0; i < b.N; i++ { + x *= 5 + } + mulSinkI64 = x + }) + // 12x = 8x + 4x + b.Run("12", func(b *testing.B) { + x := int64(1) + for i := 0; i < b.N; i++ { + x *= 12 + } + mulSinkI64 = x + }) + // 120x = 128x - 8x + b.Run("120", func(b *testing.B) { + x := int64(1) + for i := 0; i < b.N; i++ { + x *= 120 + } + mulSinkI64 = x + }) + // -120x = 8x - 120x + b.Run("-120", func(b *testing.B) { + x := int64(1) + for i := 0; i < b.N; i++ { + x *= -120 + } + mulSinkI64 = x + }) + // 65537x = 65536x + x + b.Run("65537", func(b *testing.B) { + x := int64(1) + for i := 0; i < b.N; i++ { + x *= 65537 + } + mulSinkI64 = x + }) + // 65538x = 65536x + 2x + b.Run("65538", func(b *testing.B) { + x := int64(1) + for i := 0; i < b.N; i++ { + x *= 65538 + } + mulSinkI64 = x + }) +} + +func BenchmarkMulconstU32(b *testing.B) { + // 3x = 2x + x + b.Run("3", func(b *testing.B) { + x := uint32(1) + for i := 0; i < b.N; i++ { + x *= 3 + } + mulSinkU32 = x + }) + // 5x = 4x + x + b.Run("5", func(b *testing.B) { + x := uint32(1) + for i := 0; i < b.N; i++ { + x *= 5 + } + mulSinkU32 = x + }) + // 12x = 8x + 4x + b.Run("12", func(b *testing.B) { + x := uint32(1) + for i := 0; i < b.N; i++ { + x *= 12 + } + mulSinkU32 = x + }) + // 120x = 128x - 8x + b.Run("120", func(b *testing.B) { + x := uint32(1) + for i := 0; i < b.N; i++ { + x *= 120 + } + mulSinkU32 = x + }) + // 65537x = 65536x + x + b.Run("65537", func(b *testing.B) { + x := uint32(1) + for i := 0; i < b.N; i++ { + x *= 65537 + } + mulSinkU32 = x + }) + // 65538x = 65536x + 2x + b.Run("65538", func(b *testing.B) { + x := uint32(1) + for i := 0; i < b.N; i++ { + x *= 65538 + } + mulSinkU32 = x + }) +} + +func BenchmarkMulconstU64(b *testing.B) { + // 3x = 2x + x + b.Run("3", func(b *testing.B) { + x := uint64(1) + for i := 0; i < b.N; i++ { + x *= 3 + } + mulSinkU64 = x + }) + // 5x = 4x + x + b.Run("5", func(b *testing.B) { + x := uint64(1) + for i := 0; i < b.N; i++ { + x *= 5 + } + mulSinkU64 = x + }) + // 12x = 8x + 4x + b.Run("12", func(b *testing.B) { + x := uint64(1) + for i := 0; i < b.N; i++ { + x *= 12 + } + mulSinkU64 = x + }) + // 120x = 128x - 8x + b.Run("120", func(b *testing.B) { + x := uint64(1) + for i := 0; i < b.N; i++ { + x *= 120 + } + mulSinkU64 = x + }) + // 65537x = 65536x + x + b.Run("65537", func(b *testing.B) { + x := uint64(1) + for i := 0; i < b.N; i++ { + x *= 65537 + } + mulSinkU64 = x + }) + // 65538x = 65536x + 2x + b.Run("65538", func(b *testing.B) { + x := uint64(1) + for i := 0; i < b.N; i++ { + x *= 65538 + } + mulSinkU64 = x + }) +} diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go index 8f25974376..9f30ec8ce4 100644 --- a/test/codegen/arithmetic.go +++ b/test/codegen/arithmetic.go @@ -71,9 +71,15 @@ func Mul_96(n int) int { // 386:`SHLL\t[$]5`,`LEAL\t\(.*\)\(.*\*2\),`,-`IMULL` // arm64:`LSL\t[$]5`,`ADD\sR[0-9]+<<1,\sR[0-9]+`,-`MUL` // arm:`SLL\t[$]5`,`ADD\sR[0-9]+<<1,\sR[0-9]+`,-`MUL` + // s390x:`SLD\t[$]5`,`SLD\t[$]6`,-`MULLD` return n * 96 } +func Mul_n120(n int) int { + // s390x:`SLD\t[$]3`,`SLD\t[$]7`,-`MULLD` + return n * -120 +} + func MulMemSrc(a []uint32, b []float32) { // 386:`IMULL\s4\([A-Z]+\),\s[A-Z]+` a[0] *= a[1] -- cgit v1.3 From e7c7ce646f37b260fe5a5635bc52243d28125dd8 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Mon, 17 Aug 2020 16:14:48 -0500 Subject: cmd/compile: combine multiply/add into maddld on ppc64le/power9 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new lowering rule to match and replace such instances with the MADDLD instruction available on power9 where possible. Likewise, this plumbs in a new ppc64 ssa opcode to house the newly generated MADDLD instructions. When testing ed25519, this reduced binary size by 936B. Similarly, MADDLD combination occcurs in a few other less obvious cases such as division by constant. Testing of golang.org/x/crypto/ed25519 shows non-trivial speedup during keygeneration: name old time/op new time/op delta KeyGeneration 65.2µs ± 0% 63.1µs ± 0% -3.19% Signing 64.3µs ± 0% 64.4µs ± 0% +0.16% Verification 147µs ± 0% 147µs ± 0% +0.11% Similarly, this test binary has shrunk by 66488B. Change-Id: I077aeda7943119b41f07e4e62e44a648f16e4ad0 Reviewed-on: https://go-review.googlesource.com/c/go/+/248723 Run-TryBot: Lynn Boger TryBot-Result: Gobot Gobot Reviewed-by: Lynn Boger --- src/cmd/compile/internal/ppc64/ssa.go | 14 ++++++++++++++ src/cmd/compile/internal/ssa/gen/PPC64.rules | 3 +++ src/cmd/compile/internal/ssa/gen/PPC64Ops.go | 2 ++ src/cmd/compile/internal/ssa/opGen.go | 16 ++++++++++++++++ src/cmd/compile/internal/ssa/rewritePPC64.go | 21 +++++++++++++++++++++ test/codegen/arithmetic.go | 12 ++++++++---- 6 files changed, 64 insertions(+), 4 deletions(-) (limited to 'test/codegen') diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go index 0efdd710fb..4d2ad48135 100644 --- a/src/cmd/compile/internal/ppc64/ssa.go +++ b/src/cmd/compile/internal/ppc64/ssa.go @@ -601,6 +601,20 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.To.Type = obj.TYPE_REG p.To.Reg = v.Reg() + case ssa.OpPPC64MADDLD: + r := v.Reg() + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() + r3 := v.Args[2].Reg() + // r = r1*r2 ± r3 + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.Reg = r2 + p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: r3}) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + case ssa.OpPPC64FMADD, ssa.OpPPC64FMADDS, ssa.OpPPC64FMSUB, ssa.OpPPC64FMSUBS: r := v.Reg() r1 := v.Args[0].Reg() diff --git a/src/cmd/compile/internal/ssa/gen/PPC64.rules b/src/cmd/compile/internal/ssa/gen/PPC64.rules index fd28e10098..14942d50f9 100644 --- a/src/cmd/compile/internal/ssa/gen/PPC64.rules +++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules @@ -11,6 +11,9 @@ (Sub32F ...) => (FSUBS ...) (Sub64F ...) => (FSUB ...) +// Combine 64 bit integer multiply and adds +(ADD l:(MULLD x y) z) && objabi.GOPPC64 >= 9 && l.Uses == 1 && clobber(l) => (MADDLD x y z) + (Mod16 x y) => (Mod32 (SignExt16to32 x) (SignExt16to32 y)) (Mod16u x y) => (Mod32u (ZeroExt16to32 x) (ZeroExt16to32 y)) (Mod8 x y) => (Mod32 (SignExt8to32 x) (SignExt8to32 y)) diff --git a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go index 0261dc283b..825d0faf34 100644 --- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go +++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go @@ -137,6 +137,7 @@ func init() { gp01 = regInfo{inputs: nil, outputs: []regMask{gp}} gp11 = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}} gp21 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}} + gp31 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}} gp22 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp, gp}} gp32 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp, gp}} gp1cr = regInfo{inputs: []regMask{gp | sp | sb}} @@ -179,6 +180,7 @@ func init() { {name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", typ: "Int64", commutative: true}, // arg0*arg1 (signed 64-bit) {name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", typ: "Int32", commutative: true}, // arg0*arg1 (signed 32-bit) + {name: "MADDLD", argLength: 3, reg: gp31, asm: "MADDLD", typ: "Int64"}, // (arg0*arg1)+arg2 (signed 64-bit) {name: "MULHD", argLength: 2, reg: gp21, asm: "MULHD", commutative: true}, // (arg0 * arg1) >> 64, signed {name: "MULHW", argLength: 2, reg: gp21, asm: "MULHW", commutative: true}, // (arg0 * arg1) >> 32, signed diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index df2a27368b..4cd72799e8 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1832,6 +1832,7 @@ const ( OpPPC64FSUBS OpPPC64MULLD OpPPC64MULLW + OpPPC64MADDLD OpPPC64MULHD OpPPC64MULHW OpPPC64MULHDU @@ -24374,6 +24375,21 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "MADDLD", + argLen: 3, + asm: ppc64.AMADDLD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + {1, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + {2, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + }, + outputs: []outputInfo{ + {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29 + }, + }, + }, { name: "MULHD", argLen: 2, diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go index 37b75cc58a..7704b80dc6 100644 --- a/src/cmd/compile/internal/ssa/rewritePPC64.go +++ b/src/cmd/compile/internal/ssa/rewritePPC64.go @@ -3852,6 +3852,27 @@ func rewriteValuePPC64_OpPPC64ADD(v *Value) bool { v_0 := v.Args[0] b := v.Block typ := &b.Func.Config.Types + // match: (ADD l:(MULLD x y) z) + // cond: objabi.GOPPC64 >= 9 && l.Uses == 1 && clobber(l) + // result: (MADDLD x y z) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + l := v_0 + if l.Op != OpPPC64MULLD { + continue + } + y := l.Args[1] + x := l.Args[0] + z := v_1 + if !(objabi.GOPPC64 >= 9 && l.Uses == 1 && clobber(l)) { + continue + } + v.reset(OpPPC64MADDLD) + v.AddArg3(x, y, z) + return true + } + break + } // match: (ADD (SLDconst x [c]) (SRDconst x [d])) // cond: d == 64-c // result: (ROTLconst [c] x) diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go index 9f30ec8ce4..45fdb68903 100644 --- a/test/codegen/arithmetic.go +++ b/test/codegen/arithmetic.go @@ -253,16 +253,20 @@ func Divisible(n1 uint, n2 int) (bool, bool, bool, bool) { // 386:"IMUL3L\t[$]-1431655765","ADDL\t[$]715827882","ROLL\t[$]31",-"DIVQ" // arm64:"MUL","ADD\t[$]3074457345618258602","ROR",-"DIV" // arm:"MUL","ADD\t[$]715827882",-".*udiv" - // ppc64:"MULLD","ADD","ROTL\t[$]63" - // ppc64le:"MULLD","ADD","ROTL\t[$]63" + // ppc64/power8:"MULLD","ADD","ROTL\t[$]63" + // ppc64le/power8:"MULLD","ADD","ROTL\t[$]63" + // ppc64/power9:"MADDLD","ROTL\t[$]63" + // ppc64le/power9:"MADDLD","ROTL\t[$]63" evenS := n2%6 == 0 // amd64:"IMULQ","ADD",-"ROLQ",-"DIVQ" // 386:"IMUL3L\t[$]678152731","ADDL\t[$]113025455",-"ROLL",-"DIVQ" // arm64:"MUL","ADD\t[$]485440633518672410",-"ROR",-"DIV" // arm:"MUL","ADD\t[$]113025455",-".*udiv" - // ppc64:"MULLD","ADD",-"ROTL" - // ppc64le:"MULLD","ADD",-"ROTL" + // ppc64/power8:"MULLD","ADD",-"ROTL" + // ppc64/power9:"MADDLD",-"ROTL" + // ppc64le/power8:"MULLD","ADD",-"ROTL" + // ppc64le/power9:"MADDLD",-"ROTL" oddS := n2%19 == 0 return evenU, oddU, evenS, oddS -- cgit v1.3 From 01aad9ea939fed313d5c51778485349435302ead Mon Sep 17 00:00:00 2001 From: diaxu01 Date: Fri, 5 Jun 2020 03:53:53 +0000 Subject: cmd/compile: Optimize ARM64's code with EON This patch fuses pattern '(MVN (XOR x y))' into '(EON x y)'. Change-Id: I269c98ce198d51a4945ce8bd0e1024acbd1b7609 Reviewed-on: https://go-review.googlesource.com/c/go/+/239638 Run-TryBot: Cherry Zhang TryBot-Result: Gobot Gobot Reviewed-by: Cherry Zhang --- src/cmd/compile/internal/ssa/gen/ARM64.rules | 1 + src/cmd/compile/internal/ssa/rewriteARM64.go | 12 ++++++++++++ test/codegen/bits.go | 13 +++++++++++-- 3 files changed, 24 insertions(+), 2 deletions(-) (limited to 'test/codegen') diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules index 27959d01fc..80e8c7137b 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules @@ -1323,6 +1323,7 @@ (AND x (MVN y)) -> (BIC x y) (XOR x (MVN y)) -> (EON x y) (OR x (MVN y)) -> (ORN x y) +(MVN (XOR x y)) -> (EON x y) (CSEL {cc} x (MOVDconst [0]) flag) -> (CSEL0 {cc} x flag) (CSEL {cc} (MOVDconst [0]) y flag) -> (CSEL0 {arm64Negate(cc.(Op))} y flag) (SUB x (SUB y z)) -> (SUB (ADD x z) y) diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go index 023d9908c2..842eddbf4a 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM64.go +++ b/src/cmd/compile/internal/ssa/rewriteARM64.go @@ -14593,6 +14593,18 @@ func rewriteValueARM64_OpARM64MULW(v *Value) bool { } func rewriteValueARM64_OpARM64MVN(v *Value) bool { v_0 := v.Args[0] + // match: (MVN (XOR x y)) + // result: (EON x y) + for { + if v_0.Op != OpARM64XOR { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + v.reset(OpARM64EON) + v.AddArg2(x, y) + return true + } // match: (MVN (MOVDconst [c])) // result: (MOVDconst [^c]) for { diff --git a/test/codegen/bits.go b/test/codegen/bits.go index 0a5428b55a..398dd84e9e 100644 --- a/test/codegen/bits.go +++ b/test/codegen/bits.go @@ -310,9 +310,18 @@ func op_bic(x, y uint32) uint32 { return x &^ y } -func op_eon(x, y uint32) uint32 { +func op_eon(x, y, z uint32, a []uint32, n, m uint64) uint64 { + // arm64:`EON\t`,-`EOR`,-`MVN` + a[0] = x ^ (y ^ 0xffffffff) + + // arm64:`EON\t`,-`EOR`,-`MVN` + a[1] = ^(y ^ z) + // arm64:`EON\t`,-`XOR` - return x ^ ^y + a[2] = x ^ ^z + + // arm64:`EON\t`,-`EOR`,-`MVN` + return n ^ (m ^ 0xffffffffffffffff) } func op_orn(x, y uint32) uint32 { -- cgit v1.3