diff options
Diffstat (limited to 'src/cmd/compile/internal/ssa/regalloc.go')
| -rw-r--r-- | src/cmd/compile/internal/ssa/regalloc.go | 843 |
1 files changed, 732 insertions, 111 deletions
diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go index 22b9d12c19..65c25dfc5a 100644 --- a/src/cmd/compile/internal/ssa/regalloc.go +++ b/src/cmd/compile/internal/ssa/regalloc.go @@ -91,6 +91,18 @@ // will have no use (so don't run deadcode after regalloc!). // TODO: maybe we should introduce these extra phis? +// Additional not-quite-SSA output occurs when spills are sunk out +// of loops to the targets of exit edges from the loop. Before sinking, +// there is one spill site (one StoreReg) targeting stack slot X, after +// sinking there may be multiple spill sites targeting stack slot X, +// with no phi functions at any join points reachable by the multiple +// spill sites. In addition, uses of the spill from copies of the original +// will not name the copy in their reference; instead they will name +// the original, though both will have the same spill location. The +// first sunk spill will be the original, but moved, to an exit block, +// thus ensuring that there is a definition somewhere corresponding to +// the original spill's uses. 
+ package ssa import ( @@ -100,7 +112,8 @@ import ( ) const ( - logSpills = iota + moveSpills = iota + logSpills regDebug stackDebug ) @@ -176,10 +189,9 @@ type valState struct { uses *use // list of uses in this block spill *Value // spilled copy of the Value spillUsed bool - needReg bool // cached value of !v.Type.IsMemory() && !v.Type.IsVoid() && !.v.Type.IsFlags() - rematerializeable bool // cached value of v.rematerializeable() - desired register // register we want value to be in, if any - avoid regMask // registers to avoid if we can + spillUsedShuffle bool // true if used in shuffling, after ordinary uses + needReg bool // cached value of !v.Type.IsMemory() && !v.Type.IsVoid() && !.v.Type.IsFlags() + rematerializeable bool // cached value of v.rematerializeable() } type regState struct { @@ -191,10 +203,11 @@ type regState struct { type regAllocState struct { f *Func - registers []Register - numRegs register - SPReg register - SBReg register + registers []Register + numRegs register + SPReg register + SBReg register + allocatable regMask // for each block, its primary predecessor. // A predecessor of b is primary if it is the closest @@ -206,6 +219,11 @@ type regAllocState struct { // which are live at the end of b, together with a count of how many instructions // forward to the next use. live [][]liveInfo + // desired register assignments at the end of each block. + // Note that this is a static map computed before allocation occurs. Dynamic + // register desires (from partially completed allocations) will trump + // this information. + desired []desiredState // current state of each (preregalloc) Value values []valState @@ -243,6 +261,15 @@ type regAllocState struct { loopnest *loopnest } +type spillToSink struct { + spill *Value // Spill instruction to move (a StoreReg) + dests int32 // Bitmask indicating exit blocks from loop in which spill/val is defined. 
1<<i set means val is live into loop.exitBlocks[i] +} + +func (sts *spillToSink) spilledValue() *Value { + return sts.spill.Args[0] +} + type endReg struct { r register v *Value // pre-regalloc value held in this register (TODO: can we use ID here?) @@ -310,6 +337,7 @@ func (s *regAllocState) assignReg(r register, v *Value, c *Value) { // If there is no unused register, a Value will be kicked out of // a register to make room. func (s *regAllocState) allocReg(v *Value, mask regMask) register { + mask &= s.allocatable mask &^= s.nospill if mask == 0 { s.f.Fatalf("no register available") @@ -317,20 +345,7 @@ func (s *regAllocState) allocReg(v *Value, mask regMask) register { // Pick an unused register if one is available. if mask&^s.used != 0 { - mask &^= s.used - - // Use desired register if we can. - d := s.values[v.ID].desired - if d != noRegister && mask>>d&1 != 0 { - mask = regMask(1) << d - } - - // Avoid avoidable registers if we can. - if mask&^s.values[v.ID].avoid != 0 { - mask &^= s.values[v.ID].avoid - } - - return pickReg(mask) + return pickReg(mask &^ s.used) } // Pick a value to spill. Spill the value with the @@ -340,10 +355,6 @@ func (s *regAllocState) allocReg(v *Value, mask regMask) register { // TODO: if a single value is in multiple registers, spill one of them // before spilling a value in just a single register. - // SP and SB are allocated specially. No regular value should - // be allocated to them. - mask &^= 1<<s.SPReg | 1<<s.SBReg - // Find a register to spill. We spill the register containing the value // whose next use is as far in the future as possible. 
// https://en.wikipedia.org/wiki/Page_replacement_algorithm#The_theoretically_optimal_page_replacement_algorithm @@ -389,14 +400,6 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, line return s.regs[r].c } - if v.Op != OpSP { - mask &^= 1 << s.SPReg // dont' spill SP - } - if v.Op != OpSB { - mask &^= 1 << s.SBReg // don't spill SB - } - mask &^= s.reserved() - // Allocate a register. r := s.allocReg(v, mask) @@ -417,7 +420,7 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, line // Load v from its spill location. case vi.spill != nil: if s.f.pass.debug > logSpills { - s.f.Config.Warnl(vi.spill.Line, "load spill") + s.f.Config.Warnl(vi.spill.Line, "load spill for %v from %v", v, vi.spill) } c = s.curBlock.NewValue1(line, OpLoadReg, v.Type, vi.spill) vi.spillUsed = true @@ -434,6 +437,7 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, line } func (s *regAllocState) init(f *Func) { + s.f = f s.registers = f.Config.registers s.numRegs = register(len(s.registers)) if s.numRegs > noRegister || s.numRegs > register(unsafe.Sizeof(regMask(0))*8) { @@ -448,7 +452,17 @@ func (s *regAllocState) init(f *Func) { } } - s.f = f + // Figure out which registers we're allowed to use. 
+ s.allocatable = regMask(1)<<s.numRegs - 1 + s.allocatable &^= 1 << s.SPReg + s.allocatable &^= 1 << s.SBReg + if obj.Framepointer_enabled != 0 { + s.allocatable &^= 1 << 5 // BP + } + if s.f.Config.ctxt.Flag_dynlink { + s.allocatable &^= 1 << 15 // R15 + } + s.regs = make([]regState, s.numRegs) s.values = make([]valState, f.NumValues()) s.orig = make([]*Value, f.NumValues()) @@ -457,7 +471,6 @@ func (s *regAllocState) init(f *Func) { if !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() { s.values[v.ID].needReg = true s.values[v.ID].rematerializeable = v.rematerializeable() - s.values[v.ID].desired = noRegister s.orig[v.ID] = v } } @@ -527,6 +540,18 @@ func (s *regAllocState) advanceUses(v *Value) { } } +// liveAfterCurrentInstruction reports whether v is live after +// the current instruction is completed. v must be used by the +// current instruction. +func (s *regAllocState) liveAfterCurrentInstruction(v *Value) bool { + u := s.values[v.ID].uses + d := u.dist + for u != nil && u.dist == d { + u = u.next + } + return u != nil && u.dist > d +} + // Sets the state of the registers to that encoded in regs. func (s *regAllocState) setState(regs []endReg) { s.freeRegs(s.used) @@ -541,9 +566,25 @@ func (s *regAllocState) compatRegs(t Type) regMask { if t.IsFloat() || t == TypeInt128 { m = 0xffff << 16 // X0-X15 } else { - m = 0xffef << 0 // AX-R15, except SP + m = 0xffff << 0 // AX-R15 + } + return m & s.allocatable +} + +// loopForBlock returns the loop containing block b, +// provided that the loop is "interesting" for purposes +// of improving register allocation (= is inner, and does +// not contain a call) +func (s *regAllocState) loopForBlock(b *Block) *loop { + loop := s.loopnest.b2l[b.ID] + + // Minor for-the-time-being optimization: nothing happens + // unless a loop is both inner and call-free, therefore + // don't bother with other loops. 
+ if loop != nil && (loop.containsCall || !loop.isInner) { + loop = nil } - return m &^ s.reserved() + return loop } func (s *regAllocState) regalloc(f *Func) { @@ -554,12 +595,46 @@ func (s *regAllocState) regalloc(f *Func) { var phiRegs []register var args []*Value + // statistics + var nSpills int // # of spills remaining + var nSpillsInner int // # of spills remaining in inner loops + var nSpillsSunk int // # of sunk spills remaining + var nSpillsChanged int // # of sunk spills lost because of register use change + var nSpillsSunkUnused int // # of spills not sunk because they were removed completely + var nSpillsNotSunkLateUse int // # of spills not sunk because of very late use (in shuffle) + + // Data structure used for computing desired registers. + var desired desiredState + + // Desired registers for inputs & outputs for each instruction in the block. + type dentry struct { + out [4]register // desired output registers + in [3][4]register // desired input registers (for inputs 0,1, and 2) + } + var dinfo []dentry + if f.Entry != f.Blocks[0] { f.Fatalf("entry block must be first") } + // Get loop nest so that spills in inner loops can be + // tracked. When the last block of a loop is processed, + // attempt to move spills out of the loop. + s.loopnest.findExits() + + // Spills are moved from one block's slice of values to another's. + // This confuses register allocation if it occurs before it is + // complete, so candidates are recorded, then rechecked and + // moved after all allocation (register and stack) is complete. + // Because movement is only within a stack slot's lifetime, it + // is safe to do this. + var toSink []spillToSink + // Will be used to figure out live inputs to exit blocks of inner loops. + entryCandidates := newSparseMap(f.NumValues()) + for _, b := range f.Blocks { s.curBlock = b + loop := s.loopForBlock(b) // Initialize liveSet and uses fields for this block. // Walk backwards through the block doing liveness analysis. 
@@ -739,6 +814,11 @@ func (s *regAllocState) regalloc(f *Func) { s.setOrig(spill, v) s.values[v.ID].spill = spill s.values[v.ID].spillUsed = false + if loop != nil { + loop.spills = append(loop.spills, v) + nSpillsInner++ + } + nSpills++ } // Save the starting state for use by merge edges. @@ -765,26 +845,27 @@ func (s *regAllocState) regalloc(f *Func) { } } - // Compute preferred registers for each value using a backwards pass. + // Allocate space to record the desired registers for each value. + dinfo = dinfo[:0] + for i := 0; i < len(oldSched); i++ { + dinfo = append(dinfo, dentry{}) + } + + // Load static desired register info at the end of the block. + desired.copy(&s.desired[b.ID]) + + // Check actual assigned registers at the start of the next block(s). + // Dynamically assigned registers will trump the static + // desired registers computed during liveness analysis. // Note that we do this phase after startRegs is set above, so that // we get the right behavior for a block which branches to itself. for _, succ := range b.Succs { - // TODO: prioritize likely successor. + // TODO: prioritize likely successor? for _, x := range s.startRegs[succ.ID] { - v := s.orig[x.vid] - s.values[v.ID].desired = x.r - } - // Process phi ops in succ - i := -1 - for j, p := range succ.Preds { - if p == b { - i = j - break - } - } - if i == -1 { - s.f.Fatalf("can't find predecssor %s of %s\n", b, succ) + desired.add(x.vid, x.r) } + // Process phi ops in succ. + pidx := predIdx(succ, b) for _, v := range succ.Values { if v.Op != OpPhi { break @@ -792,47 +873,44 @@ func (s *regAllocState) regalloc(f *Func) { if !s.values[v.ID].needReg { continue } - r, ok := s.f.getHome(v.ID).(*Register) + rp, ok := s.f.getHome(v.ID).(*Register) if !ok { continue } - a := s.orig[v.Args[i].ID] - s.values[a.ID].desired = register(r.Num) + desired.add(v.Args[pidx].ID, register(rp.Num)) } } - - // Set avoid fields to help desired register availability. 
- liveSet.clear() - for _, e := range s.live[b.ID] { - liveSet.add(e.ID) - } - if v := b.Control; v != nil && s.values[v.ID].needReg { - liveSet.add(v.ID) - } + // Walk values backwards computing desired register info. + // See computeLive for more comments. for i := len(oldSched) - 1; i >= 0; i-- { v := oldSched[i] - liveSet.remove(v.ID) - - r := s.values[v.ID].desired - if r != noRegister { - m := regMask(1) << r - // All live values should avoid this register so - // it will be available at this point. - for _, w := range liveSet.contents() { - s.values[w].avoid |= m + prefs := desired.remove(v.ID) + desired.clobber(opcodeTable[v.Op].reg.clobbers) + for _, j := range opcodeTable[v.Op].reg.inputs { + if countRegs(j.regs) != 1 { + continue } + desired.clobber(j.regs) + desired.add(v.Args[j.idx].ID, pickReg(j.regs)) } - - for _, a := range v.Args { - if !s.values[a.ID].needReg { - continue + if opcodeTable[v.Op].resultInArg0 { + if opcodeTable[v.Op].commutative { + desired.addList(v.Args[1].ID, prefs) } - liveSet.add(a.ID) + desired.addList(v.Args[0].ID, prefs) + } + // Save desired registers for this value. + dinfo[i].out = prefs + for j, a := range v.Args { + if j >= len(dinfo[i].in) { + break + } + dinfo[i].in[j] = desired.get(a.ID) } } // Process all the non-phi values. - for _, v := range oldSched { + for idx, v := range oldSched { if s.f.pass.debug > regDebug { fmt.Printf(" processing %s\n", v.LongString()) } @@ -880,15 +958,132 @@ func (s *regAllocState) regalloc(f *Func) { continue } + if s.f.pass.debug > regDebug { + fmt.Printf("value %s\n", v.LongString()) + fmt.Printf(" out:") + for _, r := range dinfo[idx].out { + if r != noRegister { + fmt.Printf(" %s", s.registers[r].Name()) + } + } + fmt.Println() + for i := 0; i < len(v.Args) && i < 3; i++ { + fmt.Printf(" in%d:", i) + for _, r := range dinfo[idx].in[i] { + if r != noRegister { + fmt.Printf(" %s", s.registers[r].Name()) + } + } + fmt.Println() + } + } + // Move arguments to registers. 
Process in an ordering defined // by the register specification (most constrained first). args = append(args[:0], v.Args...) for _, i := range regspec.inputs { - if i.regs == flagRegMask { + mask := i.regs + if mask == flagRegMask { // TODO: remove flag input from regspec.inputs. continue } - args[i.idx] = s.allocValToReg(v.Args[i.idx], i.regs, true, v.Line) + if mask&s.values[args[i.idx].ID].regs == 0 { + // Need a new register for the input. + mask &= s.allocatable + mask &^= s.nospill + // Use desired register if available. + if i.idx < 3 { + for _, r := range dinfo[idx].in[i.idx] { + if r != noRegister && (mask&^s.used)>>r&1 != 0 { + // Desired register is allowed and unused. + mask = regMask(1) << r + break + } + } + } + // Avoid registers we're saving for other values. + if mask&^desired.avoid != 0 { + mask &^= desired.avoid + } + } + args[i.idx] = s.allocValToReg(args[i.idx], mask, true, v.Line) + } + + // If the output clobbers the input register, make sure we have + // at least two copies of the input register so we don't + // have to reload the value from the spill location. + if opcodeTable[v.Op].resultInArg0 { + var m regMask + if !s.liveAfterCurrentInstruction(v.Args[0]) { + // arg0 is dead. We can clobber its register. + goto ok + } + if countRegs(s.values[v.Args[0].ID].regs) >= 2 { + // we have at least 2 copies of arg0. We can afford to clobber one. + goto ok + } + if opcodeTable[v.Op].commutative { + if !s.liveAfterCurrentInstruction(v.Args[1]) { + args[0], args[1] = args[1], args[0] + goto ok + } + if countRegs(s.values[v.Args[1].ID].regs) >= 2 { + args[0], args[1] = args[1], args[0] + goto ok + } + } + + // We can't overwrite arg0 (or arg1, if commutative). So we + // need to make a copy of an input so we have a register we can modify. + + // Possible new registers to copy into. + m = s.compatRegs(v.Args[0].Type) &^ s.used + if m == 0 { + // No free registers.
In this case we'll just clobber + // an input and future uses of that input must use a restore. + // TODO(khr): We should really do this like allocReg does it, + // spilling the value with the most distant next use. + goto ok + } + + // Try to move an input to the desired output. + for _, r := range dinfo[idx].out { + if r != noRegister && m>>r&1 != 0 { + m = regMask(1) << r + args[0] = s.allocValToReg(v.Args[0], m, true, v.Line) + // Note: we update args[0] so the instruction will + // use the register copy we just made. + goto ok + } + } + // Try to copy input to its desired location & use its old + // location as the result register. + for _, r := range dinfo[idx].in[0] { + if r != noRegister && m>>r&1 != 0 { + m = regMask(1) << r + s.allocValToReg(v.Args[0], m, true, v.Line) + // Note: no update to args[0] so the instruction will + // use the original copy. + goto ok + } + } + if opcodeTable[v.Op].commutative { + for _, r := range dinfo[idx].in[1] { + if r != noRegister && m>>r&1 != 0 { + m = regMask(1) << r + s.allocValToReg(v.Args[1], m, true, v.Line) + args[0], args[1] = args[1], args[0] + goto ok + } + } + } + // Avoid future fixed uses if we can. + if m&^desired.avoid != 0 { + m &^= desired.avoid + } + // Save input 0 to a new register so we can clobber it. + s.allocValToReg(v.Args[0], m, true, v.Line) + ok: } // Now that all args are in regs, we're ready to issue the value itself. @@ -903,24 +1098,44 @@ func (s *regAllocState) regalloc(f *Func) { // Pick register for output. if s.values[v.ID].needReg { - mask := regspec.outputs[0] &^ s.reserved() - if mask>>33&1 != 0 { - s.f.Fatalf("bad mask %s\n", v.LongString()) - } + mask := regspec.outputs[0] & s.allocatable if opcodeTable[v.Op].resultInArg0 { - r := register(s.f.getHome(args[0].ID).(*Register).Num) - if (mask&^s.used)>>r&1 != 0 { + if !opcodeTable[v.Op].commutative { + // Output must use the same register as input 0. 
+ r := register(s.f.getHome(args[0].ID).(*Register).Num) mask = regMask(1) << r - } - if opcodeTable[v.Op].commutative { - r := register(s.f.getHome(args[1].ID).(*Register).Num) - if (mask&^s.used)>>r&1 != 0 { - mask = regMask(1) << r + } else { + // Output must use the same register as input 0 or 1. + r0 := register(s.f.getHome(args[0].ID).(*Register).Num) + r1 := register(s.f.getHome(args[1].ID).(*Register).Num) + // Check r0 and r1 for desired output register. + found := false + for _, r := range dinfo[idx].out { + if (r == r0 || r == r1) && (mask&^s.used)>>r&1 != 0 { + mask = regMask(1) << r + found = true + if r == r1 { + args[0], args[1] = args[1], args[0] + } + break + } + } + if !found { + // Neither are desired, pick r0. + mask = regMask(1) << r0 } } - // TODO: enforce resultInArg0 always, instead of treating it - // as a hint. Then we don't need the special cases adding - // moves all throughout ssa.go:genValue. + } + for _, r := range dinfo[idx].out { + if r != noRegister && (mask&^s.used)>>r&1 != 0 { + // Desired register is allowed and unused. + mask = regMask(1) << r + break + } + } + // Avoid registers we're saving for other values. + if mask&^desired.avoid != 0 { + mask &^= desired.avoid } r := s.allocReg(v, mask) s.assignReg(r, v, v) @@ -947,6 +1162,11 @@ func (s *regAllocState) regalloc(f *Func) { s.setOrig(spill, v) s.values[v.ID].spill = spill s.values[v.ID].spillUsed = false + if loop != nil { + loop.spills = append(loop.spills, v) + nSpillsInner++ + } + nSpills++ } } @@ -993,6 +1213,9 @@ func (s *regAllocState) regalloc(f *Func) { } v := s.orig[vid] m := s.compatRegs(v.Type) &^ s.used + if m&^desired.avoid != 0 { + m &^= desired.avoid + } if m != 0 { s.allocValToReg(v, m, false, b.Line) } @@ -1056,6 +1279,69 @@ func (s *regAllocState) regalloc(f *Func) { s.values[e.ID].spillUsed = true } + // Keep track of values that are spilled in the loop, but whose spill + // is not used in the loop. 
It may be possible to move ("sink") the + // spill out of the loop into one or more exit blocks. + if loop != nil { + loop.scratch++ // increment count of blocks in this loop that have been processed + if loop.scratch == loop.nBlocks { // just processed last block of loop, if it is an inner loop. + // This check is redundant with code at the top of the loop. + // This is definitive; the one at the top of the loop is an optimization. + if loop.isInner && // Common case, easier, most likely to be profitable + !loop.containsCall && // Calls force spills, also lead to puzzling spill info. + len(loop.exits) <= 32 { // Almost no inner loops have more than 32 exits, + // and this allows use of a bitvector and a sparseMap. + + // TODO: exit calculation is messed up for non-inner loops + // because of multilevel exits that are not part of the "exit" + // count. + + // Compute the set of spill-movement candidates live at entry to exit blocks. + // isLoopSpillCandidate filters for + // (1) defined in appropriate loop + // (2) needs a register + // (3) spill not already used (in the loop) + // Condition (3) === "in a register at all loop exits" + + entryCandidates.clear() + + for whichExit, ss := range loop.exits { + // Start with live at end. + for _, li := range s.live[ss.ID] { + if s.isLoopSpillCandidate(loop, s.orig[li.ID]) { + entryCandidates.setBit(li.ID, uint(whichExit)) + } + } + // Control can also be live. + if ss.Control != nil && s.isLoopSpillCandidate(loop, ss.Control) { + entryCandidates.setBit(ss.Control.ID, uint(whichExit)) + } + // Walk backwards, filling in locally live values, removing those defined. + for i := len(ss.Values) - 1; i >= 0; i-- { + v := ss.Values[i] + entryCandidates.remove(v.ID) // Cannot be an issue, only keeps the sets smaller. 
+ for _, a := range v.Args { + if s.isLoopSpillCandidate(loop, a) { + entryCandidates.setBit(a.ID, uint(whichExit)) + } + } + } + } + + for _, e := range loop.spills { + whichblocks := entryCandidates.get(e.ID) + oldSpill := s.values[e.ID].spill + if whichblocks != 0 && whichblocks != -1 { // -1 = not in map. + toSink = append(toSink, spillToSink{spill: oldSpill, dests: whichblocks}) + } + } + + } // loop is inner etc + loop.scratch = 0 // Don't leave a mess, just in case. + loop.spills = nil + } // if scratch == nBlocks + } // if loop is not nil + // Clear any final uses. // All that is left should be the pseudo-uses added for values which // are live at the end of b. @@ -1078,7 +1364,7 @@ func (s *regAllocState) regalloc(f *Func) { vi := s.values[i] if vi.spillUsed { if s.f.pass.debug > logSpills { - s.f.Config.Warnl(vi.spill.Line, "spilled value") + s.f.Config.Warnl(vi.spill.Line, "spilled value at %v remains", vi.spill) } continue } @@ -1087,9 +1373,16 @@ func (s *regAllocState) regalloc(f *Func) { // Constants, SP, SB, ... continue } + loop := s.loopForBlock(spill.Block) + if loop != nil { + nSpillsInner-- + } + spill.Args[0].Uses-- f.freeValue(spill) + nSpills-- } + for _, b := range f.Blocks { i := 0 for _, v := range b.Values { @@ -1104,12 +1397,161 @@ func (s *regAllocState) regalloc(f *Func) { // Not important now because this is the last phase that manipulates Values } + // Must clear these out before any potential recycling, though that's + // not currently implemented. + for i, ts := range toSink { + vsp := ts.spill + if vsp.Op == OpInvalid { // This spill was completely eliminated + toSink[i].spill = nil + } + } + // Anything that didn't get a register gets a stack location here. // (StoreReg, stack-based phis, inputs, ...) stacklive := stackalloc(s.f, s.spillLive) // Fix up all merge edges. 
s.shuffle(stacklive) + + // Insert moved spills (that have not been marked invalid above) + // at start of appropriate block and remove the originals from their + // location within loops. Notice that this can break SSA form; + // if a spill is sunk to multiple exits, there will be no phi for that + // spill at a join point downstream of those two exits, though the + // two spills will target the same stack slot. Notice also that this + // takes place after stack allocation, so the stack allocator does + // not need to process these malformed flow graphs. +sinking: + for _, ts := range toSink { + vsp := ts.spill + if vsp == nil { // This spill was completely eliminated + nSpillsSunkUnused++ + continue sinking + } + e := ts.spilledValue() + if s.values[e.ID].spillUsedShuffle { + nSpillsNotSunkLateUse++ + continue sinking + } + + // move spills to a better (outside of loop) block. + // This would be costly if it occurred very often, but it doesn't. + b := vsp.Block + loop := s.loopnest.b2l[b.ID] + dests := ts.dests + + // Pre-check to be sure that spilled value is still in expected register on all exits where live. + check_val_still_in_reg: + for i := uint(0); i < 32 && dests != 0; i++ { + + if dests&(1<<i) == 0 { + continue + } + dests ^= 1 << i + d := loop.exits[i] + if len(d.Preds) > 1 { + panic("Should be impossible given critical edges removed") + } + p := d.Preds[0] // block in loop exiting to d. + + endregs := s.endRegs[p.ID] + for _, regrec := range endregs { + if regrec.v == e && regrec.r != noRegister && regrec.c == e { // TODO: regrec.c != e implies different spill possible. 
+ continue check_val_still_in_reg + } + } + // If here, the register assignment was lost down at least one exit and it can't be sunk + if s.f.pass.debug > moveSpills { + s.f.Config.Warnl(e.Line, "lost register assignment for spill %v in %v at exit %v to %v", + vsp, b, p, d) + } + nSpillsChanged++ + continue sinking + } + + nSpillsSunk++ + nSpillsInner-- + // don't update nSpills, since spill is only moved, and if it is duplicated, the spills-on-a-path is not increased. + + dests = ts.dests + + // remove vsp from b.Values + i := 0 + for _, w := range b.Values { + if vsp == w { + continue + } + b.Values[i] = w + i++ + } + b.Values = b.Values[:i] + + first := true + for i := uint(0); i < 32 && dests != 0; i++ { + + if dests&(1<<i) == 0 { + continue + } + + dests ^= 1 << i + + d := loop.exits[i] + vspnew := vsp // reuse original for first sunk spill, saves tracking down and renaming uses + if !first { // any sunk spills after first must make a copy + vspnew = d.NewValue1(e.Line, OpStoreReg, e.Type, e) + f.setHome(vspnew, f.getHome(vsp.ID)) // copy stack home + if s.f.pass.debug > moveSpills { + s.f.Config.Warnl(e.Line, "copied spill %v in %v for %v to %v in %v", + vsp, b, e, vspnew, d) + } + } else { + first = false + vspnew.Block = d + d.Values = append(d.Values, vspnew) + if s.f.pass.debug > moveSpills { + s.f.Config.Warnl(e.Line, "moved spill %v in %v for %v to %v in %v", + vsp, b, e, vspnew, d) + } + } + + // shuffle vspnew to the beginning of its block + copy(d.Values[1:], d.Values[0:len(d.Values)-1]) + d.Values[0] = vspnew + + } + } + + if f.pass.stats > 0 { + f.logStat("spills_info", + nSpills, "spills", nSpillsInner, "inner_spills_remaining", nSpillsSunk, "inner_spills_sunk", nSpillsSunkUnused, "inner_spills_unused", nSpillsNotSunkLateUse, "inner_spills_shuffled", nSpillsChanged, "inner_spills_changed") + } +} + +// isLoopSpillCandidate indicates whether the spill for v satisfies preliminary +// spill-sinking conditions just after the last block of loop has 
been processed. +// In particular: +// v needs a register. +// v's spill is not (YET) used. +// v's definition is within loop. +// The spill may be used in the future, either by an outright use +// in the code, or by shuffling code inserted after stack allocation. +// Outright uses cause sinking; shuffling (within the loop) inhibits it. +func (s *regAllocState) isLoopSpillCandidate(loop *loop, v *Value) bool { + return s.values[v.ID].needReg && !s.values[v.ID].spillUsed && s.loopnest.b2l[v.Block.ID] == loop +} + +// lateSpillUse notes a late (after stack allocation) use of the spill of value with ID vid. +// This will inhibit spill sinking. +func (s *regAllocState) lateSpillUse(vid ID) { + // TODO investigate why this is necessary. + // It appears that an outside-the-loop use of + // an otherwise sinkable spill makes the spill + // a candidate for shuffling, when it would not + // otherwise have been the case (spillUsed was not + // true when isLoopSpillCandidate was called, yet + // it was shuffled). Such shuffling cuts the amount + // of spill sinking by more than half (in make.bash) + s.values[vid].spillUsedShuffle = true } // shuffle fixes up all the merge edges (those going into blocks of indegree > 1). @@ -1284,6 +1726,7 @@ func (e *edgeState) process() { if _, isReg := loc.(*Register); isReg { c = e.p.NewValue1(c.Line, OpCopy, c.Type, c) } else { + e.s.lateSpillUse(vid) c = e.p.NewValue1(c.Line, OpLoadReg, c.Type, c) } e.set(r, vid, c, false) @@ -1372,6 +1815,7 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value) bool { } } else { if dstReg { + e.s.lateSpillUse(vid) x = e.p.NewValue1(c.Line, OpLoadReg, c.Type, c) } else { // mem->mem. Use temp register. 
@@ -1389,6 +1833,7 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value) bool { e.erase(loc) r := e.findRegFor(c.Type) + e.s.lateSpillUse(vid) t := e.p.NewValue1(c.Line, OpLoadReg, c.Type, c) e.set(r, vid, t, false) x = e.p.NewValue1(c.Line, OpStoreReg, loc.(LocalSlot).Type, t) @@ -1554,24 +1999,36 @@ func (v *Value) rematerializeable() bool { } type liveInfo struct { - ID ID // ID of variable + ID ID // ID of value dist int32 // # of instructions before next use } +// dblock contains information about desired & avoid registers at the end of a block. +type dblock struct { + prefers []desiredStateEntry + avoid regMask +} + // computeLive computes a map from block ID to a list of value IDs live at the end // of that block. Together with the value ID is a count of how many instructions -// to the next use of that value. The resulting map is stored at s.live. +// to the next use of that value. The resulting map is stored in s.live. +// computeLive also computes the desired register information at the end of each block. +// This desired register information is stored in s.desired. // TODO: this could be quadratic if lots of variables are live across lots of // basic blocks. Figure out a way to make this function (or, more precisely, the user // of this function) require only linear size & time. func (s *regAllocState) computeLive() { f := s.f s.live = make([][]liveInfo, f.NumBlocks()) + s.desired = make([]desiredState, f.NumBlocks()) var phis []*Value live := newSparseMap(f.NumValues()) t := newSparseMap(f.NumValues()) + // Keep track of which value we want in each register. + var desired desiredState + // Instead of iterating over f.Blocks, iterate over their postordering. // Liveness information flows backward, so starting at the end // increases the probability that we will stabilize quickly. 
@@ -1594,7 +2051,7 @@ func (s *regAllocState) computeLive() { d := int32(len(b.Values)) if b.Kind == BlockCall || b.Kind == BlockDefer { // Because we keep no values in registers across a call, - // make every use past a call very far away. + // make every use past a call appear very far away. d += unlikelyDistance } for _, e := range s.live[b.ID] { @@ -1623,6 +2080,35 @@ } } } + // Propagate desired registers backwards. + desired.copy(&s.desired[b.ID]) + for i := len(b.Values) - 1; i >= 0; i-- { + v := b.Values[i] + prefs := desired.remove(v.ID) + if v.Op == OpPhi { + // TODO: if v is a phi, save desired register for phi inputs. + // For now, we just drop it and don't propagate + // desired registers back through phi nodes. + continue + } + // Cancel desired registers if they get clobbered. + desired.clobber(opcodeTable[v.Op].reg.clobbers) + // Update desired registers if there are any fixed register inputs. + for _, j := range opcodeTable[v.Op].reg.inputs { + if countRegs(j.regs) != 1 { + continue + } + desired.clobber(j.regs) + desired.add(v.Args[j.idx].ID, pickReg(j.regs)) + } + // Set desired register of input 0 if this is a 2-operand instruction. + if opcodeTable[v.Op].resultInArg0 { + if opcodeTable[v.Op].commutative { + desired.addList(v.Args[1].ID, prefs) + } + desired.addList(v.Args[0].ID, prefs) + } + } // For each predecessor of b, expand its list of live-at-end values. // invariant: live contains the values live at the start of b (excluding phi inputs) @@ -1642,6 +2128,9 @@ } } + // Update any desired registers at the end of p. + s.desired[p.ID].merge(&desired) + // Start t off with the previously known live values at the end of p. t.clear() for _, e := range s.live[p.ID] { @@ -1662,7 +2151,7 @@ // simultaneously happening at the start of the block). 
for _, v := range phis { id := v.Args[i].ID - if s.values[id].needReg && !t.contains(id) || delta < t.get(id) { + if s.values[id].needReg && (!t.contains(id) || delta < t.get(id)) { update = true t.set(id, delta) } @@ -1694,20 +2183,152 @@ func (s *regAllocState) computeLive() { fmt.Printf(" %s:", b) for _, x := range s.live[b.ID] { fmt.Printf(" v%d", x.ID) + for _, e := range s.desired[b.ID].entries { + if e.ID != x.ID { + continue + } + fmt.Printf("[") + first := true + for _, r := range e.regs { + if r == noRegister { + continue + } + if !first { + fmt.Printf(",") + } + fmt.Print(s.registers[r].Name()) + first = false + } + fmt.Printf("]") + } } + fmt.Printf(" avoid=%x", int64(s.desired[b.ID].avoid)) fmt.Println() } } } -// reserved returns a mask of reserved registers. -func (s *regAllocState) reserved() regMask { - var m regMask - if obj.Framepointer_enabled != 0 { - m |= 1 << 5 // BP +// A desiredState represents desired register assignments. +type desiredState struct { + // Desired assignments will be small, so we just use a list + // of valueID+registers entries. + entries []desiredStateEntry + // Registers that other values want to be in. This value will + // contain at least the union of the regs fields of entries, but + // may contain additional entries for values that were once in + // this data structure but are no longer. + avoid regMask +} +type desiredStateEntry struct { + // (pre-regalloc) value + ID ID + // Registers it would like to be in, in priority order. + // Unused slots are filled with noRegister. + regs [4]register +} + +func (d *desiredState) clear() { + d.entries = d.entries[:0] + d.avoid = 0 +} + +// get returns a list of desired registers for value vid. 
+func (d *desiredState) get(vid ID) [4]register { + for _, e := range d.entries { + if e.ID == vid { + return e.regs + } } - if s.f.Config.ctxt.Flag_dynlink { - m |= 1 << 15 // R15 + return [4]register{noRegister, noRegister, noRegister, noRegister} +} + +// add records that we'd like value vid to be in register r. +func (d *desiredState) add(vid ID, r register) { + d.avoid |= regMask(1) << r + for i := range d.entries { + e := &d.entries[i] + if e.ID != vid { + continue + } + if e.regs[0] == r { + // Already known and highest priority + return + } + for j := 1; j < len(e.regs); j++ { + if e.regs[j] == r { + // Move from lower priority to top priority + copy(e.regs[1:], e.regs[:j]) + e.regs[0] = r + return + } + } + copy(e.regs[1:], e.regs[:]) + e.regs[0] = r + return + } + d.entries = append(d.entries, desiredStateEntry{vid, [4]register{r, noRegister, noRegister, noRegister}}) +} + +func (d *desiredState) addList(vid ID, regs [4]register) { + // regs is in priority order, so iterate in reverse order. + for i := len(regs) - 1; i >= 0; i-- { + r := regs[i] + if r != noRegister { + d.add(vid, r) + } + } +} + +// clobber erases any desired registers in the set m. +func (d *desiredState) clobber(m regMask) { + for i := 0; i < len(d.entries); { + e := &d.entries[i] + j := 0 + for _, r := range e.regs { + if r != noRegister && m>>r&1 == 0 { + e.regs[j] = r + j++ + } + } + if j == 0 { + // No more desired registers for this value. + d.entries[i] = d.entries[len(d.entries)-1] + d.entries = d.entries[:len(d.entries)-1] + continue + } + for ; j < len(e.regs); j++ { + e.regs[j] = noRegister + } + i++ + } + d.avoid &^= m +} + +// copy copies a desired state from another desiredState x. +func (d *desiredState) copy(x *desiredState) { + d.entries = append(d.entries[:0], x.entries...) + d.avoid = x.avoid +} + +// remove removes the desired registers for vid and returns them. 
+func (d *desiredState) remove(vid ID) [4]register { + for i := range d.entries { + if d.entries[i].ID == vid { + regs := d.entries[i].regs + d.entries[i] = d.entries[len(d.entries)-1] + d.entries = d.entries[:len(d.entries)-1] + return regs + } + } + return [4]register{noRegister, noRegister, noRegister, noRegister} +} + +// merge merges another desired state x into d. +func (d *desiredState) merge(x *desiredState) { + d.avoid |= x.avoid + // There should only be a few desired registers, so + // linear insert is ok. + for _, e := range x.entries { + d.addList(e.ID, e.regs) } - return m } |
