diff options
Diffstat (limited to 'src/runtime')
| -rw-r--r-- | src/runtime/gcinfo_test.go | 43 |
| -rw-r--r-- | src/runtime/mbitmap.go | 168 |
2 files changed, 136 insertions, 75 deletions
diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go index dd5c25e0b1..7618d86a45 100644 --- a/src/runtime/gcinfo_test.go +++ b/src/runtime/gcinfo_test.go @@ -34,23 +34,23 @@ func TestGCInfo(t *testing.T) { verifyGCInfo(t, "data eface", &dataEface, infoEface) verifyGCInfo(t, "data iface", &dataIface, infoIface) - verifyGCInfo(t, "stack ScalarPtr", new(ScalarPtr), nonStackInfo(infoScalarPtr)) - verifyGCInfo(t, "stack PtrScalar", new(PtrScalar), nonStackInfo(infoPtrScalar)) - verifyGCInfo(t, "stack BigStruct", new(BigStruct), nonStackInfo(infoBigStruct())) - verifyGCInfo(t, "stack string", new(string), nonStackInfo(infoString)) - verifyGCInfo(t, "stack slice", new([]string), nonStackInfo(infoSlice)) - verifyGCInfo(t, "stack eface", new(interface{}), nonStackInfo(infoEface)) - verifyGCInfo(t, "stack iface", new(Iface), nonStackInfo(infoIface)) + verifyGCInfo(t, "stack ScalarPtr", new(ScalarPtr), infoScalarPtr) + verifyGCInfo(t, "stack PtrScalar", new(PtrScalar), infoPtrScalar) + verifyGCInfo(t, "stack BigStruct", new(BigStruct), infoBigStruct()) + verifyGCInfo(t, "stack string", new(string), infoString) + verifyGCInfo(t, "stack slice", new([]string), infoSlice) + verifyGCInfo(t, "stack eface", new(interface{}), infoEface) + verifyGCInfo(t, "stack iface", new(Iface), infoIface) for i := 0; i < 10; i++ { - verifyGCInfo(t, "heap PtrSlice", escape(&make([]*byte, 10)[0]), infoPtr10) - verifyGCInfo(t, "heap ScalarPtr", escape(new(ScalarPtr)), infoScalarPtr) - verifyGCInfo(t, "heap ScalarPtrSlice", escape(&make([]ScalarPtr, 4)[0]), infoScalarPtr4) - verifyGCInfo(t, "heap PtrScalar", escape(new(PtrScalar)), infoPtrScalar) - verifyGCInfo(t, "heap BigStruct", escape(new(BigStruct)), infoBigStruct()) - verifyGCInfo(t, "heap string", escape(new(string)), infoString) - verifyGCInfo(t, "heap eface", escape(new(interface{})), infoEface) - verifyGCInfo(t, "heap iface", escape(new(Iface)), infoIface) + verifyGCInfo(t, "heap PtrSlice", escape(&make([]*byte, 10)[0]), 
trimDead(infoPtr10)) + verifyGCInfo(t, "heap ScalarPtr", escape(new(ScalarPtr)), trimDead(infoScalarPtr)) + verifyGCInfo(t, "heap ScalarPtrSlice", escape(&make([]ScalarPtr, 4)[0]), trimDead(infoScalarPtr4)) + verifyGCInfo(t, "heap PtrScalar", escape(new(PtrScalar)), trimDead(infoPtrScalar)) + verifyGCInfo(t, "heap BigStruct", escape(new(BigStruct)), trimDead(infoBigStruct())) + verifyGCInfo(t, "heap string", escape(new(string)), trimDead(infoString)) + verifyGCInfo(t, "heap eface", escape(new(interface{})), trimDead(infoEface)) + verifyGCInfo(t, "heap iface", escape(new(Iface)), trimDead(infoIface)) } } @@ -67,16 +67,11 @@ func verifyGCInfo(t *testing.T, name string, p interface{}, mask0 []byte) { } } -func nonStackInfo(mask []byte) []byte { - // typeDead is replaced with typeScalar everywhere except stacks. - mask1 := make([]byte, len(mask)) - for i, v := range mask { - if v == typeDead { - v = typeScalar - } - mask1[i] = v +func trimDead(mask []byte) []byte { + for len(mask) > 2 && mask[len(mask)-1] == typeScalar { + mask = mask[:len(mask)-1] } - return mask1 + return mask } var gcinfoSink interface{} diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index b866d7f732..61e1254bed 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -491,6 +491,8 @@ func heapBitsSweepSpan(base, size, n uintptr, f func(uintptr)) { // but if the start or end of x shares a bitmap byte with an adjacent // object, the GC marker is racing with updates to those object's mark bits. func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { + const doubleCheck = false // slow but helpful; enable to test modifications to this function + // From here till marked label marking the object as allocated // and storing type info in the GC bitmap. 
h := heapBitsForAddr(x) @@ -518,7 +520,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { ptrmask := (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask if typ.kind&kindGCProg != 0 { - nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize + nptr := typ.ptrdata / ptrSize masksize := (nptr + 7) / 8 masksize++ // unroll flag in the beginning if masksize > maxGCMask && typ.gc[1] != 0 { @@ -568,21 +570,56 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { // In general, one load can supply two bitmap byte writes. // This is a lot of lines of code, but it compiles into relatively few // machine instructions. + + // Ptrmask buffer. var ( p *byte // last ptrmask byte read b uintptr // ptrmask bits already loaded - nb uint32 // number of bits in b at next read + nb uintptr // number of bits in b at next read endp *byte // final ptrmask byte to read (then repeat) - endnb uint32 // number of valid bits in *endp + endnb uintptr // number of valid bits in *endp pbits uintptr // alternate source of bits ) + // Note about sizes: + // + // typ.size is the number of words in the object, + // and typ.ptrdata is the number of words in the prefix + // of the object that contains pointers. That is, the final + // typ.size - typ.ptrdata words contain no pointers. + // This allows optimization of a common pattern where + // an object has a small header followed by a large scalar + // buffer. If we know the pointers are over, we don't have + // to scan the buffer's heap bitmap at all. + // The 1-bit ptrmasks are sized to contain only bits for + // the typ.ptrdata prefix, zero padded out to a full byte + // of bitmap. This code sets nw (below) so that heap bitmap + // bits are only written for the typ.ptrdata prefix; if there is + // more room in the allocated object, the next heap bitmap + // entry is a 00, indicating that there are no more pointers + // to scan. So only the ptrmask for the ptrdata bytes is needed. 
+ // + // Replicated copies are not as nice: if there is an array of + // objects with scalar tails, all but the last tail does have to + // be initialized, because there is no way to say "skip forward". + // However, because of the possibility of a repeated type with + // size not a multiple of 4 pointers (one heap bitmap byte), + // the code already must handle the last ptrmask byte specially + // by treating it as containing only the bits for endnb pointers, + // where endnb <= 4. We represent large scalar tails that must + // be expanded in the replication by setting endnb larger than 4. + // This will have the effect of reading many bits out of b, + // but once the real bits are shifted out, b will supply as many + // zero bits as we try to read, which is exactly what we need. + p = ptrmask if typ.size < dataSize { // Filling in bits for an array of typ. // Set up for repetition of ptrmask during main loop. - if typ.size/ptrSize+7 <= ptrSize*8 { - // Entire ptrmask + a leftover fragment fits in uintptr. + // Note that ptrmask describes only a prefix of + const maxBits = ptrSize*8 - 7 + if typ.ptrdata/ptrSize <= maxBits { + // Entire ptrmask fits in uintptr with room for a byte fragment. // Load into pbits and never read from ptrmask again. // This is especially important when the ptrmask has // fewer than 8 bits in it; otherwise the reload in the middle @@ -590,26 +627,34 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { // at least 8 bits. // Accumulate ptrmask into b. - nb = uint32(typ.size / ptrSize) - for i := uint32(0); i < nb; i += 8 { + // ptrmask is sized to describe only typ.ptrdata, but we record + // it as describing typ.size bytes, since all the high bits are zero. + nb = typ.ptrdata / ptrSize + for i := uintptr(0); i < nb; i += 8 { b |= uintptr(*p) << i p = addb(p, 1) } + nb = typ.size / ptrSize // Replicate ptrmask to fill entire pbits uintptr. // Doubling and truncating is fewer steps than // iterating by nb each time. 
(nb could be 1.) + // Since we loaded typ.ptrdata/ptrSize bits + // but are pretending to have typ.size/ptrSize, + // there might be no replication necessary/possible. pbits = b endnb = nb - for endnb <= ptrSize*8 { - pbits |= pbits << endnb - endnb += endnb + if nb+nb <= maxBits { + for endnb <= ptrSize*8 { + pbits |= pbits << endnb + endnb += endnb + } + // Truncate to a multiple of original ptrmask. + endnb = maxBits / nb * nb + pbits &= 1<<endnb - 1 + b = pbits + nb = endnb } - // Truncate to an multiple of original ptrmask. - endnb = (ptrSize*8 - 7) / nb * nb - pbits &= 1<<endnb - 1 - b = pbits - nb = endnb // Clear p and endp as sentinel for using pbits. // Checked during Phase 2 loop. @@ -617,11 +662,9 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { endp = nil } else { // Ptrmask is larger. Read it multiple times. - endp = addb(ptrmask, (typ.size/ptrSize+7)/8-1) - endnb = uint32(typ.size/ptrSize) % 8 - if endnb == 0 { - endnb = 8 - } + n := (typ.ptrdata/ptrSize+7)/8 - 1 + endp = addb(ptrmask, n) + endnb = typ.size/ptrSize - n*8 } } if p != nil { @@ -630,8 +673,27 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { nb = 8 } - w := uintptr(0) // number of words processed - nw := dataSize / ptrSize // number of words to process + var w uintptr // words processed + var nw uintptr // total number of words to process + if typ.size == dataSize { + // Single entry: can stop once we reach the non-pointer data. + nw = typ.ptrdata / ptrSize + } else { + // Repeated instances of typ in an array. + // Have to process the + nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / ptrSize + } + if nw == 0 { + // No pointers! Caller was supposed to check. + println("runtime: invalid type ", *typ._string) + throw("heapBitsSetType: called with non-pointer type") + return + } + if nw < 2 { + // Must write at least 2 words, because the "no scan" + // encoding doesn't take effect until the third word. 
+ nw = 2 + } hbitp := h.bitp // next heap bitmap byte to write var hb uintptr // bits being preapred for *h.bitp @@ -641,11 +703,11 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { // which do not have the marked bits set. // The leading half-byte is special because it's a half a byte and must be // manipulated atomically. - switch h.shift { + switch { default: throw("heapBitsSetType: unexpected shift") - case 0: + case h.shift == 0: // Ptrmask and heap bitmap are aligned. // Handle first byte of bitmap specially. // The first byte we write out contains the first two words of the object. @@ -662,7 +724,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { b >>= 4 nb -= 4 - case 4: + case ptrSize == 8 && h.shift == 4: // Ptrmask and heap bitmap are misaligned. // The bits for the first two words are in a byte shared with another object // and must be updated atomically. @@ -679,17 +741,13 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { // Note: no bitMarker in hb because the first two words don't get markers from us. atomicor8(hbitp, uint8(hb)) hbitp = subtractb(hbitp, 1) - - // Expand 8-bit chunks of ptrmask into pairs of heap bitmap bytes. - // We know the object size is a multiple of 2 words but not 4, so the - // object size minus the 2 words we just handled is a multiple of 4, - // so we can use non-atomic writes to the heap bitmap for the - // rest of this code, even for the final fragment or a trailing dead marker byte. - - // Loop prepares bits for final byte but stops before writing them, - // so that in the case where we need to write only part of a byte, - // the code below the loop can truncate the bitMarked. - w += 2 + if w += 2; w >= nw { + // We know that there is more data, because we handled 2-word objects above. + // This must be at least a 6-word object. If we're out of pointer words, + // mark no scan in next bitmap byte and finish. 
+ *hbitp = 0 + goto Phase4 + } } // Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap. @@ -792,24 +850,27 @@ Phase3: } } - const test = false // slow but helpful - if test { +Phase4: + // Phase 4: all done (goto target). + + if doubleCheck { // Double-check that bits to be written were written correctly. // Does not check that other bits were not written, unfortunately. h := heapBitsForAddr(x) - nptr := typ.size / ptrSize + nptr := typ.ptrdata / ptrSize + ndata := typ.size / ptrSize + count := dataSize / typ.size for i := uintptr(0); i <= dataSize/ptrSize; i++ { - j := i % nptr + j := i % ndata var have, want uint8 - if i == dataSize/ptrSize { - if dataSize >= size { - break - } - have = (*h.bitp >> h.shift) & 3 - want = 0 // dead bits + if i == dataSize/ptrSize && dataSize >= size { + break + } + have = (*h.bitp >> h.shift) & 3 + if i == dataSize/ptrSize || i/ndata == count-1 && j >= nptr { + want = 0 // dead marker } else { - have = (*h.bitp >> h.shift) & 3 - if (*addb(ptrmask, j/8)>>(j%8))&1 != 0 { + if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 { want |= bitPointer } if i >= 2 { @@ -820,13 +881,18 @@ Phase3: } if have != want { println("mismatch writing bits for", *typ._string, "x", dataSize/typ.size) - print("typ.size=", typ.size, " dataSize=", dataSize, " size=", size, "\n") + print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n") + print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n") h = heapBitsForAddr(x) print("initial bits h.bitp=", h.bitp, " h.shift=", h.shift, "\n") - print("p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n") + print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n") println("at word", i, "offset", i*ptrSize, "have", have, "want", want) throw("bad heapBitsSetType") } + if i >= 2 && want == 0 { 
+ // found dead marker; the rest is uninitialized + break + } h = h.next() } } @@ -1076,7 +1142,7 @@ func getgcmask(ep interface{}) (mask []byte) { mask[i/ptrSize] = 1 } if i >= 2*ptrSize && !hbits.isMarked() { - mask[i/ptrSize] = 255 + mask = mask[:i/ptrSize] break } } |
