aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorLynn Boger <laboger@linux.vnet.ibm.com>2020-10-02 17:51:13 -0400
committerLynn Boger <laboger@linux.vnet.ibm.com>2020-10-06 19:40:46 +0000
commitbdab5df40f474c7768a945ef4fcf5aab634f7af5 (patch)
tree7a06719f0ae6723ac96ee476ab869513c627f206 /src
parent1fb149fd640f2e83f17206aa6eb530d664b0b5ed (diff)
downloadgo-bdab5df40f474c7768a945ef4fcf5aab634f7af5.tar.xz
cmd/compile,cmd/internal/obj/ppc64: use mulli where possible
This adds support to allow the use of mulli when one of the multiply operands is a constant that fits in 16 bits. This especially helps in the case where this instruction appears in a loop since the load of the constant is not being moved out of the loop. Some improvements seen in compress/flate on power9: Decode/Digits/Huffman/1e4 259µs ± 0% 261µs ± 0% +0.57% (p=1.000 n=1+1) Decode/Digits/Huffman/1e5 2.43ms ± 0% 2.45ms ± 0% +0.79% (p=1.000 n=1+1) Decode/Digits/Huffman/1e6 23.9ms ± 0% 24.2ms ± 0% +0.86% (p=1.000 n=1+1) Decode/Digits/Speed/1e4 278µs ± 0% 279µs ± 0% +0.34% (p=1.000 n=1+1) Decode/Digits/Speed/1e5 2.80ms ± 0% 2.81ms ± 0% +0.29% (p=1.000 n=1+1) Decode/Digits/Speed/1e6 28.0ms ± 0% 28.1ms ± 0% +0.28% (p=1.000 n=1+1) Decode/Digits/Default/1e4 278µs ± 0% 278µs ± 0% +0.28% (p=1.000 n=1+1) Decode/Digits/Default/1e5 2.68ms ± 0% 2.69ms ± 0% +0.19% (p=1.000 n=1+1) Decode/Digits/Default/1e6 26.6ms ± 0% 26.6ms ± 0% +0.21% (p=1.000 n=1+1) Decode/Digits/Compression/1e4 278µs ± 0% 278µs ± 0% +0.00% (p=1.000 n=1+1) Decode/Digits/Compression/1e5 2.68ms ± 0% 2.69ms ± 0% +0.21% (p=1.000 n=1+1) Decode/Digits/Compression/1e6 26.6ms ± 0% 26.6ms ± 0% +0.07% (p=1.000 n=1+1) Decode/Newton/Huffman/1e4 322µs ± 0% 312µs ± 0% -2.84% (p=1.000 n=1+1) Decode/Newton/Huffman/1e5 3.11ms ± 0% 2.91ms ± 0% -6.41% (p=1.000 n=1+1) Decode/Newton/Huffman/1e6 31.4ms ± 0% 29.3ms ± 0% -6.85% (p=1.000 n=1+1) Decode/Newton/Speed/1e4 282µs ± 0% 269µs ± 0% -4.69% (p=1.000 n=1+1) Decode/Newton/Speed/1e5 2.29ms ± 0% 2.20ms ± 0% -4.13% (p=1.000 n=1+1) Decode/Newton/Speed/1e6 22.7ms ± 0% 21.3ms ± 0% -6.06% (p=1.000 n=1+1) Decode/Newton/Default/1e4 254µs ± 0% 237µs ± 0% -6.60% (p=1.000 n=1+1) Decode/Newton/Default/1e5 1.86ms ± 0% 1.75ms ± 0% -5.99% (p=1.000 n=1+1) Decode/Newton/Default/1e6 18.1ms ± 0% 17.4ms ± 0% -4.10% (p=1.000 n=1+1) Decode/Newton/Compression/1e4 254µs ± 0% 244µs ± 0% -3.91% (p=1.000 n=1+1) Decode/Newton/Compression/1e5 1.85ms ± 0% 1.79ms ± 0% -3.10% (p=1.000 n=1+1) Decode/Newton/Compression/1e6 18.0ms ± 0% 17.3ms ± 0% -3.88% (p=1.000 n=1+1) Change-Id: I840320fab1c4bf64c76b001c2651ab79f23df4eb Reviewed-on: https://go-review.googlesource.com/c/go/+/259444 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Paul Murphy <murp@ibm.com> Reviewed-by: Carlos Eduardo Seo <carlos.seo@gmail.com> Trust: Lynn Boger <laboger@linux.vnet.ibm.com>
Diffstat (limited to 'src')
-rw-r--r--src/cmd/asm/internal/asm/testdata/ppc64enc.s4
-rw-r--r--src/cmd/compile/internal/gc/bench_test.go12
-rw-r--r--src/cmd/compile/internal/ppc64/ssa.go3
-rw-r--r--src/cmd/compile/internal/ssa/gen/PPC64.rules2
-rw-r--r--src/cmd/compile/internal/ssa/gen/PPC64Ops.go2
-rw-r--r--src/cmd/compile/internal/ssa/opGen.go30
-rw-r--r--src/cmd/compile/internal/ssa/rewritePPC64.go54
-rw-r--r--src/cmd/internal/obj/ppc64/asm9.go9
8 files changed, 111 insertions, 5 deletions
diff --git a/src/cmd/asm/internal/asm/testdata/ppc64enc.s b/src/cmd/asm/internal/asm/testdata/ppc64enc.s
index 869f8c2d4f..c6d7b59aad 100644
--- a/src/cmd/asm/internal/asm/testdata/ppc64enc.s
+++ b/src/cmd/asm/internal/asm/testdata/ppc64enc.s
@@ -204,12 +204,16 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
MULLW R3, R4 // 7c8419d6
MULLW R3, R4, R5 // 7ca419d6
+ MULLW $10, R3 // 1c63000a
+ MULLW $10000000, R3 // 641f009863ff96807c7f19d6
MULLWCC R3, R4, R5 // 7ca419d7
MULHW R3, R4, R5 // 7ca41896
MULHWU R3, R4, R5 // 7ca41816
MULLD R3, R4 // 7c8419d2
MULLD R4, R4, R5 // 7ca421d2
+ MULLD $20, R4 // 1c840014
+ MULLD $200000000, R4 // 641f0beb63ffc2007c9f21d2
MULLDCC R3, R4, R5 // 7ca419d3
MULHD R3, R4, R5 // 7ca41892
MULHDCC R3, R4, R5 // 7ca41893
diff --git a/src/cmd/compile/internal/gc/bench_test.go b/src/cmd/compile/internal/gc/bench_test.go
index a2887f2f7b..8c4288128f 100644
--- a/src/cmd/compile/internal/gc/bench_test.go
+++ b/src/cmd/compile/internal/gc/bench_test.go
@@ -7,6 +7,7 @@ package gc
import "testing"
var globl int64
+var globl32 int32
func BenchmarkLoadAdd(b *testing.B) {
x := make([]int64, 1024)
@@ -42,6 +43,17 @@ func BenchmarkModify(b *testing.B) {
}
}
+func BenchmarkMullImm(b *testing.B) {
+ x := make([]int32, 1024)
+ for i := 0; i < b.N; i++ {
+ var s int32
+ for i := range x {
+ s += x[i] * 100
+ }
+ globl32 = s
+ }
+}
+
func BenchmarkConstModify(b *testing.B) {
a := make([]int64, 1024)
for i := 0; i < b.N; i++ {
diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go
index d83b2df379..1ece4d999f 100644
--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
@@ -677,7 +677,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Reg = v.Args[0].Reg()
case ssa.OpPPC64ADDconst, ssa.OpPPC64ANDconst, ssa.OpPPC64ORconst, ssa.OpPPC64XORconst,
- ssa.OpPPC64SRADconst, ssa.OpPPC64SRAWconst, ssa.OpPPC64SRDconst, ssa.OpPPC64SRWconst, ssa.OpPPC64SLDconst, ssa.OpPPC64SLWconst, ssa.OpPPC64EXTSWSLconst:
+ ssa.OpPPC64SRADconst, ssa.OpPPC64SRAWconst, ssa.OpPPC64SRDconst, ssa.OpPPC64SRWconst,
+ ssa.OpPPC64SLDconst, ssa.OpPPC64SLWconst, ssa.OpPPC64EXTSWSLconst, ssa.OpPPC64MULLWconst, ssa.OpPPC64MULLDconst:
p := s.Prog(v.Op.Asm())
p.Reg = v.Args[0].Reg()
p.From.Type = obj.TYPE_CONST
diff --git a/src/cmd/compile/internal/ssa/gen/PPC64.rules b/src/cmd/compile/internal/ssa/gen/PPC64.rules
index 83ee4c499b..a05cfee654 100644
--- a/src/cmd/compile/internal/ssa/gen/PPC64.rules
+++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules
@@ -821,6 +821,8 @@
(ADDconst [c] (MOVDaddr [d] {sym} x)) && is32Bit(c+int64(d)) => (MOVDaddr [int32(c+int64(d))] {sym} x)
+(MULL(W|D) x (MOVDconst [c])) && is16Bit(c) => (MULL(W|D)const [int32(c)] x)
+
// Subtract from (with carry, but ignored) constant.
// Note, these clobber the carry bit.
(SUB (MOVDconst [c]) x) && is32Bit(c) => (SUBFCconst [c] x)
diff --git a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
index 28317928a8..5885660597 100644
--- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
@@ -181,6 +181,8 @@ func init() {
{name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", typ: "Int64", commutative: true}, // arg0*arg1 (signed 64-bit)
{name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", typ: "Int32", commutative: true}, // arg0*arg1 (signed 32-bit)
+ {name: "MULLDconst", argLength: 1, reg: gp11, asm: "MULLD", aux: "Int32", typ: "Int64"}, // arg0*auxInt (signed 64-bit)
+ {name: "MULLWconst", argLength: 1, reg: gp11, asm: "MULLW", aux: "Int32", typ: "Int64"}, // arg0*auxInt (signed 64-bit)
{name: "MADDLD", argLength: 3, reg: gp31, asm: "MADDLD", typ: "Int64"}, // (arg0*arg1)+arg2 (signed 64-bit)
{name: "MULHD", argLength: 2, reg: gp21, asm: "MULHD", commutative: true}, // (arg0 * arg1) >> 64, signed
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index d7d2b24a48..051550fb17 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1832,6 +1832,8 @@ const (
OpPPC64FSUBS
OpPPC64MULLD
OpPPC64MULLW
+ OpPPC64MULLDconst
+ OpPPC64MULLWconst
OpPPC64MADDLD
OpPPC64MULHD
OpPPC64MULHW
@@ -24378,6 +24380,34 @@ var opcodeTable = [...]opInfo{
},
},
{
+ name: "MULLDconst",
+ auxType: auxInt32,
+ argLen: 1,
+ asm: ppc64.AMULLD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+ },
+ outputs: []outputInfo{
+ {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+ },
+ },
+ },
+ {
+ name: "MULLWconst",
+ auxType: auxInt32,
+ argLen: 1,
+ asm: ppc64.AMULLW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+ },
+ outputs: []outputInfo{
+ {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+ },
+ },
+ },
+ {
name: "MADDLD",
argLen: 3,
asm: ppc64.AMADDLD,
diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go
index 9822637b05..1b8a5a78ca 100644
--- a/src/cmd/compile/internal/ssa/rewritePPC64.go
+++ b/src/cmd/compile/internal/ssa/rewritePPC64.go
@@ -568,6 +568,10 @@ func rewriteValuePPC64(v *Value) bool {
return rewriteValuePPC64_OpPPC64MOVWstorezero(v)
case OpPPC64MTVSRD:
return rewriteValuePPC64_OpPPC64MTVSRD(v)
+ case OpPPC64MULLD:
+ return rewriteValuePPC64_OpPPC64MULLD(v)
+ case OpPPC64MULLW:
+ return rewriteValuePPC64_OpPPC64MULLW(v)
case OpPPC64NEG:
return rewriteValuePPC64_OpPPC64NEG(v)
case OpPPC64NOR:
@@ -11003,6 +11007,56 @@ func rewriteValuePPC64_OpPPC64MTVSRD(v *Value) bool {
}
return false
}
+func rewriteValuePPC64_OpPPC64MULLD(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (MULLD x (MOVDconst [c]))
+ // cond: is16Bit(c)
+ // result: (MULLDconst [int32(c)] x)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if v_1.Op != OpPPC64MOVDconst {
+ continue
+ }
+ c := auxIntToInt64(v_1.AuxInt)
+ if !(is16Bit(c)) {
+ continue
+ }
+ v.reset(OpPPC64MULLDconst)
+ v.AuxInt = int32ToAuxInt(int32(c))
+ v.AddArg(x)
+ return true
+ }
+ break
+ }
+ return false
+}
+func rewriteValuePPC64_OpPPC64MULLW(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (MULLW x (MOVDconst [c]))
+ // cond: is16Bit(c)
+ // result: (MULLWconst [int32(c)] x)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if v_1.Op != OpPPC64MOVDconst {
+ continue
+ }
+ c := auxIntToInt64(v_1.AuxInt)
+ if !(is16Bit(c)) {
+ continue
+ }
+ v.reset(OpPPC64MULLWconst)
+ v.AuxInt = int32ToAuxInt(int32(c))
+ v.AddArg(x)
+ return true
+ }
+ break
+ }
+ return false
+}
func rewriteValuePPC64_OpPPC64NEG(v *Value) bool {
v_0 := v.Args[0]
// match: (NEG (ADDconst [c] x))
diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go
index 928e299f43..c2e8e9e9d0 100644
--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -1279,6 +1279,9 @@ func buildop(ctxt *obj.Link) {
case AREMD:
opset(AREMDU, r0)
+ case AMULLW:
+ opset(AMULLD, r0)
+
case ADIVW: /* op Rb[,Ra],Rd */
opset(AMULHW, r0)
@@ -1312,7 +1315,6 @@ func buildop(ctxt *obj.Link) {
opset(AMULHDCC, r0)
opset(AMULHDU, r0)
opset(AMULHDUCC, r0)
- opset(AMULLD, r0)
opset(AMULLDCC, r0)
opset(AMULLDVCC, r0)
opset(AMULLDV, r0)
@@ -1996,7 +1998,6 @@ func buildop(ctxt *obj.Link) {
AMOVB, /* macro: move byte with sign extension */
AMOVBU, /* macro: move byte with sign extension & update */
AMOVFL,
- AMULLW,
/* op $s[,r2],r3; op r1[,r2],r3; no cc/v */
ASUBC, /* op r1,$s,r3; op r1[,r2],r3 */
ASTSW,
@@ -4990,8 +4991,8 @@ func (c *ctxt9) opirr(a obj.As) uint32 {
case ADARN:
return OPVCC(31, 755, 0, 0) /* darn - v3.00 */
- case AMULLW:
- return OPVCC(7, 0, 0, 0)
+ case AMULLW, AMULLD:
+ return OPVCC(7, 0, 0, 0) /* mulli works with MULLW or MULLD */
case AOR:
return OPVCC(24, 0, 0, 0)