cmd/asm, cmd/internal/obj/ppc64: avoid unnecessary load zeros

When instructions add, and, or, xor, and movd have constant operands in some cases more instructions are generated than necessary by the assembler. This adds more opcode/operand combinations to the optab and improves the code generation for the cases where the size and sign of the constant allows the use of 1 instructions instead of 2. Example of previous code: oris r3, r0, 0 ori r3, r3, 65533 now: ori r3, r0, 65533 This does not significantly reduce the overall binary size because the improvement depends on the constant value. Some procedures show a 1-2% reduction in size. This improvement could also be significant in cases where the extra instructions occur in a critical loop. Testcase ppc64enc.s was added to cmd/asm/internal/asm/testdata with the variations affected by this change. Updates #23845 Change-Id: I7fdf2320c95815d99f2755ba77d0c6921cd7fad7 Reviewed-on: https://go-review.googlesource.com/95135 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com>
author: Lynn Boger <laboger@linux.vnet.ibm.com> 2018-02-19 11:10:31 -0500
committer: Lynn Boger <laboger@linux.vnet.ibm.com> 2018-03-08 14:17:34 +0000
commit: 5b14c7b324dc645dd7c90b22ddd86affac0637c2 (patch)
tree: c66617c712efaa1539d12ed1af79a7d89c558694 /src/cmd/internal/obj/ppc64
parent: 0add9a4dcfb4cbc0d99cd168752bd1bd641ce8e2 (diff)
download: go-5b14c7b324dc645dd7c90b22ddd86affac0637c2.tar.xz
1 files changed, 79 insertions, 25 deletions
diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go
index 6cbf91a11d..11c43ec2e5 100644
--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -71,6 +71,14 @@ type Optab struct {
 	param int16
 }
 
+// This optab contains a list of opcodes with the operand
+// combinations that are implemented. Not all opcodes are in this
+// table, but are added later in buildop by calling opset for those
+// opcodes which allow the same operand combinations as an opcode
+// already in the table.
+//
+// The type field in the Optabl identifies the case in asmout where
+// the instruction word is assembled.
 var optab = []Optab{
 	{obj.ATEXT, C_LEXT, C_NONE, C_NONE, C_TEXTSIZE, 0, 0, 0},
 	{obj.ATEXT, C_LEXT, C_NONE, C_LCON, C_TEXTSIZE, 0, 0, 0},
@@ -84,14 +92,18 @@ var optab = []Optab{
 	{AMOVWZ, C_REG, C_NONE, C_NONE, C_REG, 13, 4, 0},
 	{AADD, C_REG, C_REG, C_NONE, C_REG, 2, 4, 0},
 	{AADD, C_REG, C_NONE, C_NONE, C_REG, 2, 4, 0},
+	{AADD, C_SCON, C_REG, C_NONE, C_REG, 4, 4, 0},
+	{AADD, C_SCON, C_NONE, C_NONE, C_REG, 4, 4, 0},
 	{AADD, C_ADDCON, C_REG, C_NONE, C_REG, 4, 4, 0},
 	{AADD, C_ADDCON, C_NONE, C_NONE, C_REG, 4, 4, 0},
 	{AADD, C_UCON, C_REG, C_NONE, C_REG, 20, 4, 0},
 	{AADD, C_UCON, C_NONE, C_NONE, C_REG, 20, 4, 0},
-	{AADDIS, C_ADDCON, C_REG, C_NONE, C_REG, 20, 4, 0},
-	{AADDIS, C_ADDCON, C_NONE, C_NONE, C_REG, 20, 4, 0},
+	{AADD, C_ANDCON, C_REG, C_NONE, C_REG, 22, 8, 0},
+	{AADD, C_ANDCON, C_NONE, C_NONE, C_REG, 22, 8, 0},
 	{AADD, C_LCON, C_REG, C_NONE, C_REG, 22, 12, 0},
 	{AADD, C_LCON, C_NONE, C_NONE, C_REG, 22, 12, 0},
+	{AADDIS, C_ADDCON, C_REG, C_NONE, C_REG, 20, 4, 0},
+	{AADDIS, C_ADDCON, C_NONE, C_NONE, C_REG, 20, 4, 0},
 	{AADDC, C_REG, C_REG, C_NONE, C_REG, 2, 4, 0},
 	{AADDC, C_REG, C_NONE, C_NONE, C_REG, 2, 4, 0},
 	{AADDC, C_ADDCON, C_REG, C_NONE, C_REG, 4, 4, 0},
@@ -106,10 +118,12 @@ var optab = []Optab{
 	{AANDCC, C_ANDCON, C_REG, C_NONE, C_REG, 58, 4, 0},
 	{AANDCC, C_UCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
 	{AANDCC, C_UCON, C_REG, C_NONE, C_REG, 59, 4, 0},
-	{AANDISCC, C_ANDCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
-	{AANDISCC, C_ANDCON, C_REG, C_NONE, C_REG, 59, 4, 0},
+	{AANDCC, C_ADDCON, C_NONE, C_NONE, C_REG, 23, 8, 0},
+	{AANDCC, C_ADDCON, C_REG, C_NONE, C_REG, 23, 8, 0},
 	{AANDCC, C_LCON, C_NONE, C_NONE, C_REG, 23, 12, 0},
 	{AANDCC, C_LCON, C_REG, C_NONE, C_REG, 23, 12, 0},
+	{AANDISCC, C_ANDCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
+	{AANDISCC, C_ANDCON, C_REG, C_NONE, C_REG, 59, 4, 0},
 	{AMULLW, C_REG, C_REG, C_NONE, C_REG, 2, 4, 0},
 	{AMULLW, C_REG, C_NONE, C_NONE, C_REG, 2, 4, 0},
 	{AMULLW, C_ADDCON, C_REG, C_NONE, C_REG, 4, 4, 0},
@@ -128,10 +142,12 @@ var optab = []Optab{
 	{AOR, C_ANDCON, C_REG, C_NONE, C_REG, 58, 4, 0},
 	{AOR, C_UCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
 	{AOR, C_UCON, C_REG, C_NONE, C_REG, 59, 4, 0},
-	{AORIS, C_ANDCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
-	{AORIS, C_ANDCON, C_REG, C_NONE, C_REG, 59, 4, 0},
+	{AOR, C_ADDCON, C_NONE, C_NONE, C_REG, 23, 8, 0},
+	{AOR, C_ADDCON, C_REG, C_NONE, C_REG, 23, 8, 0},
 	{AOR, C_LCON, C_NONE, C_NONE, C_REG, 23, 12, 0},
 	{AOR, C_LCON, C_REG, C_NONE, C_REG, 23, 12, 0},
+	{AORIS, C_ANDCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
+	{AORIS, C_ANDCON, C_REG, C_NONE, C_REG, 59, 4, 0},
 	{ADIVW, C_REG, C_REG, C_NONE, C_REG, 2, 4, 0}, /* op r1[,r2],r3 */
 	{ADIVW, C_REG, C_NONE, C_NONE, C_REG, 2, 4, 0},
 	{ASUB, C_REG, C_REG, C_NONE, C_REG, 10, 4, 0}, /* op r2[,r1],r3 */
@@ -277,16 +293,19 @@ var optab = []Optab{
 	{AMOVD, C_LECON, C_NONE, C_NONE, C_REG, 26, 8, REGSB},
 	{AMOVD, C_LACON, C_NONE, C_NONE, C_REG, 26, 8, REGSP},
 	{AMOVD, C_ADDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
+	{AMOVD, C_ANDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
 	{AMOVW, C_SECON, C_NONE, C_NONE, C_REG, 3, 4, REGSB}, /* TO DO: check */
 	{AMOVW, C_SACON, C_NONE, C_NONE, C_REG, 3, 4, REGSP},
 	{AMOVW, C_LECON, C_NONE, C_NONE, C_REG, 26, 8, REGSB},
 	{AMOVW, C_LACON, C_NONE, C_NONE, C_REG, 26, 8, REGSP},
 	{AMOVW, C_ADDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
+	{AMOVW, C_ANDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
 	{AMOVWZ, C_SECON, C_NONE, C_NONE, C_REG, 3, 4, REGSB}, /* TO DO: check */
 	{AMOVWZ, C_SACON, C_NONE, C_NONE, C_REG, 3, 4, REGSP},
 	{AMOVWZ, C_LECON, C_NONE, C_NONE, C_REG, 26, 8, REGSB},
 	{AMOVWZ, C_LACON, C_NONE, C_NONE, C_REG, 26, 8, REGSP},
 	{AMOVWZ, C_ADDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
+	{AMOVWZ, C_ANDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
 
 	/* load unsigned/long constants (TO DO: check) */
 	{AMOVD, C_UCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
@@ -1048,13 +1067,25 @@ func (x ocmp) Swap(i, j int) {
 	x[i], x[j] = x[j], x[i]
 }
 
+// Used when sorting the optab. Sorting is
+// done in a way so that the best choice of
+// opcode/operand combination is considered first.
 func (x ocmp) Less(i, j int) bool {
 	p1 := &x[i]
 	p2 := &x[j]
 	n := int(p1.as) - int(p2.as)
+	// same opcode
+	if n != 0 {
+		return n < 0
+	}
+	// Consider those that generate fewer
+	// instructions first.
+	n = int(p1.size) - int(p2.size)
 	if n != 0 {
 		return n < 0
 	}
+	// operand order should match
+	// better choices first
 	n = int(p1.a1) - int(p2.a1)
 	if n != 0 {
 		return n < 0
@@ -1073,10 +1104,15 @@ func (x ocmp) Less(i, j int) bool {
 	}
 	return false
 }
+
+// Add an entry to the opcode table for
+// a new opcode b0 with the same operand combinations
+// as opcode a.
 func opset(a, b0 obj.As) {
 	oprange[a&obj.AMask] = oprange[b0]
 }
 
+// Build the opcode table
 func buildop(ctxt *obj.Link) {
 	if oprange[AANDN&obj.AMask] != nil {
 		// Already initialized; stop now.
@@ -2256,7 +2292,7 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		}
 		o1 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), uint32(r), uint32(p.From.Reg))
 
-	case 3: /* mov $soreg/addcon/ucon, r ==> addis/addi $i,reg',r */
+	case 3: /* mov $soreg/addcon/andcon/ucon, r ==> addis/oris/addi/ori $i,reg',r */
 		d := c.vregoff(&p.From)
 
 		v := int32(d)
@@ -2272,6 +2308,8 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			if d&0xffff != 0 {
 				log.Fatalf("invalid handling of %v", p)
 			}
+			// For UCON operands the value is right shifted 16, using ADDIS if the
+			// value should be signed, ORIS if unsigned.
 			v >>= 16
 			if r == REGZERO && isuint32(uint64(d)) {
 				o1 = LOP_IRR(OP_ORIS, uint32(p.To.Reg), REGZERO, uint32(v))
@@ -2279,8 +2317,16 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			}
 
 			a = OP_ADDIS
-		} else {
-			if int64(int16(d)) != d {
+		} else if int64(int16(d)) != d {
+			// Operand is 16 bit value with sign bit set
+			if o.a1 == C_ANDCON {
+				// Needs unsigned 16 bit so use ORI
+				if r == 0 || r == REGZERO {
+					o1 = LOP_IRR(uint32(OP_ORI), uint32(p.To.Reg), uint32(0), uint32(v))
+					break
+				}
+				// With ADDCON, needs signed 16 bit value, fall through to use ADDI
+			} else if o.a1 != C_ADDCON {
 				log.Fatalf("invalid handling of %v", p)
 			}
 		}
@@ -2632,8 +2678,6 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			o1, o2 = c.symbolAccess(p.From.Sym, d, p.To.Reg, OP_ADDI)
 		}
 
-	//if(dlm) reloc(&p->from, p->pc, 0);
-
 	case 20: /* add $ucon,,r | addis $addcon,r,r */
 		v := c.regoff(&p.From)
 
@@ -2650,43 +2694,53 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			o1 = AOP_IRR(c.opirr(AADDIS), uint32(p.To.Reg), uint32(r), uint32(v)>>16)
 		}
 
-	case 22: /* add $lcon,r1,r2 ==> cau+or+add */ /* could do add/sub more efficiently */
+	case 22: /* add $lcon/$andcon,r1,r2 ==> oris+ori+add/ori+add */
 		if p.To.Reg == REGTMP || p.Reg == REGTMP {
 			c.ctxt.Diag("can't synthesize large constant\n%v", p)
 		}
 		d := c.vregoff(&p.From)
-		o1 = loadu32(REGTMP, d)
-		o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
 		r := int(p.Reg)
 		if r == 0 {
 			r = int(p.To.Reg)
 		}
-		o3 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
 		if p.From.Sym != nil {
 			c.ctxt.Diag("%v is not supported", p)
 		}
+		// If operand is ANDCON, generate 2 instructions using
+		// ORI for unsigned value; with LCON 3 instructions.
+		if o.size == 8 {
+			o1 = LOP_IRR(OP_ORI, REGTMP, REGZERO, uint32(int32(d)))
+			o2 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
+		} else {
+			o1 = loadu32(REGTMP, d)
+			o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
+			o3 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
+		}
 
-	//if(dlm) reloc(&p->from, p->pc, 0);
-
-	case 23: /* and $lcon,r1,r2 ==> cau+or+and */ /* masks could be done using rlnm etc. */
+	case 23: /* and $lcon/$addcon,r1,r2 ==> oris+ori+and/addi+and */
 		if p.To.Reg == REGTMP || p.Reg == REGTMP {
 			c.ctxt.Diag("can't synthesize large constant\n%v", p)
 		}
 		d := c.vregoff(&p.From)
-		o1 = loadu32(REGTMP, d)
-		o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
 		r := int(p.Reg)
 		if r == 0 {
 			r = int(p.To.Reg)
 		}
-		o3 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
+
+		// With ADDCON operand, generate 2 instructions using ADDI for signed value,
+		// with LCON operand generate 3 instructions.
+		if o.size == 8 {
+			o1 = LOP_IRR(OP_ADDI, REGZERO, REGTMP, uint32(int32(d)))
+			o2 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
+		} else {
+			o1 = loadu32(REGTMP, d)
+			o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
+			o3 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
+		}
 		if p.From.Sym != nil {
 			c.ctxt.Diag("%v is not supported", p)
 		}
 
-		//if(dlm) reloc(&p->from, p->pc, 0);
-
-		/*24*/
 	case 25:
 		/* sld[.] $sh,rS,rA -> rldicr[.] $sh,rS,mask(0,63-sh),rA; srd[.] -> rldicl */
 		v := c.regoff(&p.From)
@@ -3090,7 +3144,7 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		case AXOR:
 			o1 = LOP_IRR(c.opirr(AXORIS), uint32(p.To.Reg), uint32(r), uint32(v)>>16)
 		case AANDCC:
-			o1 = LOP_IRR(c.opirr(AANDCC), uint32(p.To.Reg), uint32(r), uint32(v)>>16)
+			o1 = LOP_IRR(c.opirr(AANDISCC), uint32(p.To.Reg), uint32(r), uint32(v)>>16)
 		default:
 			o1 = LOP_IRR(c.opirr(p.As), uint32(p.To.Reg), uint32(r), uint32(v))
 		}
author	Lynn Boger <laboger@linux.vnet.ibm.com>	2018-02-19 11:10:31 -0500
committer	Lynn Boger <laboger@linux.vnet.ibm.com>	2018-03-08 14:17:34 +0000
commit	5b14c7b324dc645dd7c90b22ddd86affac0637c2 (patch)
tree	c66617c712efaa1539d12ed1af79a7d89c558694 /src/cmd/internal/obj/ppc64
parent	0add9a4dcfb4cbc0d99cd168752bd1bd641ce8e2 (diff)
download	go-5b14c7b324dc645dd7c90b22ddd86affac0637c2.tar.xz