1 files changed, 256 insertions, 231 deletions
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index cfdd259371..dea6879adc 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -6,48 +6,36 @@
 //
 // Stack, data, and bss bitmaps
 //
-// Not handled in this file, but worth mentioning: stack frames and global data
-// in the data and bss sections are described by 1-bit bitmaps in which 0 means
-// scalar or uninitialized or dead and 1 means pointer to visit during GC.
-//
-// Comparing this 1-bit form with the 2-bit form described below, 0 represents
-// both the 2-bit 00 and 01, while 1 represents the 2-bit 10.
-// Therefore conversions between the two (until the 2-bit form is gone)
-// can be done by x>>1 for 2-bit to 1-bit and x+1 for 1-bit to 2-bit.
-//
-// Type bitmaps
-//
-// Types that aren't too large
-// record information about the layout of their memory words using a type bitmap.
-// The bitmap holds two bits for each pointer-sized word. The two-bit values are:
-//
-// 	00 - typeDead: not a pointer, and no pointers in the rest of the object
-//	01 - typeScalar: not a pointer
-//	10 - typePointer: a pointer that GC should trace
-//	11 - unused
-//
-// typeDead only appears in type bitmaps in Go type descriptors
-// and in type bitmaps embedded in the heap bitmap (see below).
+// Stack frames and global variables in the data and bss sections are described
+// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
+// to be visited during GC.
 //
 // Heap bitmap
 //
 // The allocated heap comes from a subset of the memory in the range [start, used),
 // where start == mheap_.arena_start and used == mheap_.arena_used.
-// The heap bitmap comprises 4 bits for each pointer-sized word in that range,
+// The heap bitmap comprises 2 bits for each pointer-sized word in that range,
 // stored in bytes indexed backward in memory from start.
-// That is, the byte at address start-1 holds the 4-bit entries for the two words
-// start, start+ptrSize, the byte at start-2 holds the entries for start+2*ptrSize,
-// start+3*ptrSize, and so on.
-// In the byte holding the entries for addresses p and p+ptrSize, the low 4 bits
-// describe p and the high 4 bits describe p+ptrSize.
+// That is, the byte at address start-1 holds the 2-bit entries for the four words
+// start through start+3*ptrSize, the byte at start-2 holds the entries for
+// start+4*ptrSize through start+7*ptrSize, and so on.
+// In each byte, the low 2 bits describe the first word, the next 2 bits describe
+// the next word, and so on.
 //
-// The 4 bits for each word are:
-//	0001 - not used
-//	0010 - bitMarked: this object has been marked by GC
-//	tt00 - word type bits, as in a type bitmap.
+// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
+// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
+// The meaning of the high bit depends on the position of the word being described
+// in its allocated object. In the first word, the high bit is the GC ``marked'' bit.
+// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
+// In the third and later words, the high bit indicates that the object is still
+// being described. In these words, if a bit pair with a high bit 0 is encountered,
+// the low bit can also be assumed to be 0, and the object description is over.
+// This 00 is called the ``dead'' encoding: it signals that the rest of the words
+// in the object are uninteresting to the garbage collector.
 //
-// The code makes use of the fact that the zero value for a heap bitmap nibble
-// has no boundary bit set, no marked bit set, and type bits == typeDead.
+// The code makes use of the fact that the zero value for a heap bitmap
+// has no live pointer bit set and is (depending on position), not marked,
+// not checkmarked, and is the dead encoding.
 // These properties must be preserved when modifying the encoding.
 //
 // Checkmarks
@@ -57,45 +45,33 @@
 // collector implementation. As a sanity check, the GC has a 'checkmark'
 // mode that retraverses the object graph with the world stopped, to make
 // sure that everything that should be marked is marked.
-// In checkmark mode, in the heap bitmap, the type bits for the first word
-// of an object are redefined:
-//
-//	00 - typeScalarCheckmarked // typeScalar, checkmarked
-//	01 - typeScalar // typeScalar, not checkmarked
-//	10 - typePointer // typePointer, not checkmarked
-//	11 - typePointerCheckmarked // typePointer, checkmarked
+// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
+// for the second word of the object holds the checkmark bit.
+// When not in checkmark mode, this bit is set to 1.
 //
-// That is, typeDead is redefined to be typeScalar + a checkmark, and the
-// previously unused 11 pattern is redefined to be typePointer + a checkmark.
-// To prepare for this mode, we must move any typeDead in the first word of
-// a multiword object to the second word.
+// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
+// means every allocated object has two words, so there is room for the
+// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
+// just one word, so the second bit pair is not available for encoding the
+// checkmark. However, because non-pointer allocations are combined
+// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
+// must be a pointer, so the type bit in the first word is not actually needed.
+// It is still used in general, except in checkmark the type bit is repurposed
+// as the checkmark bit and then reinitialized (to 1) as the type bit when
+// finished.
 
 package runtime
 
 import "unsafe"
 
 const (
-	typeDead               = 0
-	typeScalarCheckmarked  = 0
-	typeScalar             = 1
-	typePointer            = 2
-	typePointerCheckmarked = 3
+	bitPointer = 1
+	bitMarked  = 2
 
-	typeBitsWidth = 2 // # of type bits per pointer-sized word
-	typeMask      = 1<<typeBitsWidth - 1
-
-	heapBitsWidth   = 4
-	heapBitmapScale = ptrSize * (8 / heapBitsWidth) // number of data bytes per heap bitmap byte
-	bitMarked       = 2
-	typeShift       = 2
+	heapBitsWidth   = 2                             // heap bitmap bits to describe one pointer
+	heapBitmapScale = ptrSize * (8 / heapBitsWidth) // number of data bytes described by one heap bitmap byte
 )
 
-// Information from the compiler about the layout of stack frames.
-type bitvector struct {
-	n        int32 // # of bits
-	bytedata *uint8
-}
-
 // addb returns the byte pointer p+n.
 //go:nowritebarrier
 func addb(p *byte, n uintptr) *byte {
@@ -141,8 +117,9 @@ type heapBits struct {
 // heapBitsForAddr returns the heapBits for the address addr.
 // The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
 func heapBitsForAddr(addr uintptr) heapBits {
+	// 2 bits per work, 4 pairs per byte, and a mask is hard coded.
 	off := (addr - mheap_.arena_start) / ptrSize
-	return heapBits{(*uint8)(unsafe.Pointer(mheap_.arena_start - off/2 - 1)), uint32(4 * (off & 1))}
+	return heapBits{(*uint8)(unsafe.Pointer(mheap_.arena_start - off/4 - 1)), uint32(2 * (off & 3))}
 }
 
 // heapBitsForSpan returns the heapBits for the span base address base.
@@ -229,20 +206,39 @@ func (h heapBits) prefetch() {
 // That is, if h describes address p, h.next() describes p+ptrSize.
 // Note that next does not modify h. The caller must record the result.
 func (h heapBits) next() heapBits {
-	if h.shift == 0 {
-		return heapBits{h.bitp, 4}
+	if h.shift < 8-heapBitsWidth {
+		return heapBits{h.bitp, h.shift + heapBitsWidth}
 	}
 	return heapBits{subtractb(h.bitp, 1), 0}
 }
 
+// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
+// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
+// h.forward(1) is equivalent to h.next(), just slower.
+// Note that forward does not modify h. The caller must record the result.
+// bits returns the heap bits for the current word.
+func (h heapBits) forward(n uintptr) heapBits {
+	n += uintptr(h.shift) / heapBitsWidth
+	return heapBits{subtractb(h.bitp, n/4), uint32(n%4) * heapBitsWidth}
+}
+
+// The caller can test isMarked and isPointer by &-ing with bitMarked and bitPointer.
+// The result includes in its higher bits the bits for subsequent words
+// described by the same bitmap byte.
+func (h heapBits) bits() uint32 {
+	return uint32(*h.bitp) >> h.shift
+}
+
 // isMarked reports whether the heap bits have the marked bit set.
+// h must describe the initial word of the object.
 func (h heapBits) isMarked() bool {
 	return *h.bitp&(bitMarked<<h.shift) != 0
 }
 
 // setMarked sets the marked bit in the heap bits, atomically.
+// h must describe the initial word of the object.
 func (h heapBits) setMarked() {
-	// Each byte of GC bitmap holds info for two words.
+	// Each byte of GC bitmap holds info for four words.
 	// Might be racing with other updates, so use atomic update always.
 	// We used to be clever here and use a non-atomic update in certain
 	// cases, but it's not worth the risk.
@@ -250,31 +246,68 @@ func (h heapBits) setMarked() {
 }
 
 // setMarkedNonAtomic sets the marked bit in the heap bits, non-atomically.
+// h must describe the initial word of the object.
 func (h heapBits) setMarkedNonAtomic() {
 	*h.bitp |= bitMarked << h.shift
 }
 
-// typeBits returns the heap bits' type bits.
-func (h heapBits) typeBits() uint8 {
-	return (*h.bitp >> (h.shift + typeShift)) & typeMask
+// isPointer reports whether the heap bits describe a pointer word.
+// h must describe the initial word of the object.
+func (h heapBits) isPointer() bool {
+	return (*h.bitp>>h.shift)&bitPointer != 0
+}
+
+// hasPointers reports whether the given object has any pointers.
+// It must be told how large the object at h is, so that it does not read too
+// far into the bitmap.
+// h must describe the initial word of the object.
+func (h heapBits) hasPointers(size uintptr) bool {
+	if size == ptrSize { // 1-word objects are always pointers
+		return true
+	}
+	// Otherwise, at least a 2-word object, and at least 2-word aligned,
+	// so h.shift is either 0 or 4, so we know we can get the bits for the
+	// first two words out of *h.bitp.
+	// If either of the first two words is a pointer, not pointer free.
+	b := uint32(*h.bitp >> h.shift)
+	if b&(bitPointer|bitPointer<<heapBitsWidth) != 0 {
+		return true
+	}
+	if size == 2*ptrSize {
+		return false
+	}
+	// At least a 4-word object. Check scan bit (aka marked bit) in third word.
+	if h.shift == 0 {
+		return b&(bitMarked<<(2*heapBitsWidth)) != 0
+	}
+	return uint32(*subtractb(h.bitp, 1))&bitMarked != 0
 }
 
 // isCheckmarked reports whether the heap bits have the checkmarked bit set.
-func (h heapBits) isCheckmarked() bool {
-	typ := h.typeBits()
-	return typ == typeScalarCheckmarked || typ == typePointerCheckmarked
+// It must be told how large the object at h is, because the encoding of the
+// checkmark bit varies by size.
+// h must describe the initial word of the object.
+func (h heapBits) isCheckmarked(size uintptr) bool {
+	if size == ptrSize {
+		return (*h.bitp>>h.shift)&bitPointer != 0
+	}
+	// All multiword objects are 2-word aligned,
+	// so we know that the initial word's 2-bit pair
+	// and the second word's 2-bit pair are in the
+	// same heap bitmap byte, *h.bitp.
+	return (*h.bitp>>(heapBitsWidth+h.shift))&bitMarked != 0
 }
 
 // setCheckmarked sets the checkmarked bit.
-func (h heapBits) setCheckmarked() {
-	typ := h.typeBits()
-	if typ == typeScalar {
-		// Clear low type bit to turn 01 into 00.
-		atomicand8(h.bitp, ^((1 << typeShift) << h.shift))
-	} else if typ == typePointer {
-		// Set low type bit to turn 10 into 11.
-		atomicor8(h.bitp, (1<<typeShift)<<h.shift)
+// It must be told how large the object at h is, because the encoding of the
+// checkmark bit varies by size.
+// h must describe the initial word of the object.
+func (h heapBits) setCheckmarked(size uintptr) {
+	if size == ptrSize {
+		atomicor8(h.bitp, bitPointer<<h.shift)
+		return
 	}
+	atomicor8(h.bitp, bitMarked<<(heapBitsWidth+h.shift))
 }
 
 // The methods operating on spans all require that h has been returned
@@ -295,95 +328,43 @@ func (h heapBits) initSpan(size, n, total uintptr) {
 }
 
 // initCheckmarkSpan initializes a span for being checkmarked.
-// This would be a no-op except that we need to rewrite any
-// typeDead bits in the first word of the object into typeScalar
-// followed by a typeDead in the second word of the object.
+// It clears the checkmark bits, which are set to 1 in normal operation.
 func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
-	if size == ptrSize {
+	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
+	if ptrSize == 8 && size == ptrSize {
+		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
 		// Only possible on 64-bit system, since minimum size is 8.
-		// Must update both top and bottom nibble of each byte.
-		// There is no second word in these objects, so all we have
-		// to do is rewrite typeDead to typeScalar by adding the 1<<typeShift bit.
+		// Must clear type bit (checkmark bit) of every word.
+		// The type bit is the lower of every two-bit pair.
 		bitp := h.bitp
-		for i := uintptr(0); i < n; i += 2 {
-			x := int(*bitp)
-
-			if (x>>typeShift)&typeMask == typeDead {
-				x += (typeScalar - typeDead) << typeShift
-			}
-			if (x>>(4+typeShift))&typeMask == typeDead {
-				x += (typeScalar - typeDead) << (4 + typeShift)
-			}
-			*bitp = uint8(x)
+		for i := uintptr(0); i < n; i += 4 {
+			*bitp &^= bitPointer | bitPointer<<2 | bitPointer<<4 | bitPointer<<6
 			bitp = subtractb(bitp, 1)
 		}
 		return
 	}
-
-	// Update bottom nibble for first word of each object.
-	// If the bottom nibble says typeDead, change to typeScalar
-	// and clear top nibble to mark as typeDead.
-	bitp := h.bitp
-	step := size / heapBitmapScale
 	for i := uintptr(0); i < n; i++ {
-		x := *bitp
-		if (x>>typeShift)&typeMask == typeDead {
-			x += (typeScalar - typeDead) << typeShift
-			x &= 0x0f // clear top nibble to typeDead
-		}
-		bitp = subtractb(bitp, step)
+		*h.bitp &^= bitMarked << (heapBitsWidth + h.shift)
+		h = h.forward(size / ptrSize)
 	}
 }
 
-// clearCheckmarkSpan removes all the checkmarks from a span.
-// If it finds a multiword object starting with typeScalar typeDead,
-// it rewrites the heap bits to the simpler typeDead typeDead.
+// clearCheckmarkSpan undoes all the checkmarking in a span.
+// The actual checkmark bits are ignored, so the only work to do
+// is to fix the pointer bits. (Pointer bits are ignored by scanobject
+// but consulted by typedmemmove.)
 func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
-	if size == ptrSize {
+	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
+	if ptrSize == 8 && size == ptrSize {
+		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
 		// Only possible on 64-bit system, since minimum size is 8.
-		// Must update both top and bottom nibble of each byte.
-		// typeScalarCheckmarked can be left as typeDead,
-		// but we want to change typeScalar back to typeDead.
+		// Must clear type bit (checkmark bit) of every word.
+		// The type bit is the lower of every two-bit pair.
 		bitp := h.bitp
-		for i := uintptr(0); i < n; i += 2 {
-			x := int(*bitp)
-			switch typ := (x >> typeShift) & typeMask; typ {
-			case typeScalar:
-				x += (typeDead - typeScalar) << typeShift
-			case typePointerCheckmarked:
-				x += (typePointer - typePointerCheckmarked) << typeShift
-			}
-
-			switch typ := (x >> (4 + typeShift)) & typeMask; typ {
-			case typeScalar:
-				x += (typeDead - typeScalar) << (4 + typeShift)
-			case typePointerCheckmarked:
-				x += (typePointer - typePointerCheckmarked) << (4 + typeShift)
-			}
-
-			*bitp = uint8(x)
+		for i := uintptr(0); i < n; i += 4 {
+			*bitp |= bitPointer | bitPointer<<2 | bitPointer<<4 | bitPointer<<6
 			bitp = subtractb(bitp, 1)
 		}
-		return
-	}
-
-	// Update bottom nibble for first word of each object.
-	// If the bottom nibble says typeScalarCheckmarked and the top is not typeDead,
-	// change to typeScalar. Otherwise leave, since typeScalarCheckmarked == typeDead.
-	// If the bottom nibble says typePointerCheckmarked, change to typePointer.
-	bitp := h.bitp
-	step := size / heapBitmapScale
-	for i := uintptr(0); i < n; i++ {
-		x := int(*bitp)
-		switch typ := (x >> typeShift) & typeMask; {
-		case typ == typeScalarCheckmarked && (x>>(4+typeShift))&typeMask != typeDead:
-			x += (typeScalar - typeScalarCheckmarked) << typeShift
-		case typ == typePointerCheckmarked:
-			x += (typePointer - typePointerCheckmarked) << typeShift
-		}
-
-		*bitp = uint8(x)
-		bitp = subtractb(bitp, step)
 	}
 }
 
@@ -393,44 +374,98 @@ func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
 // bits for the first two words (or one for single-word objects) to typeDead
 // and then calls f(p), where p is the object's base address.
 // f is expected to add the object to a free list.
+// For non-free objects, heapBitsSweepSpan turns off the marked bit.
 func heapBitsSweepSpan(base, size, n uintptr, f func(uintptr)) {
 	h := heapBitsForSpan(base)
-	if size == ptrSize {
-		// Only possible on 64-bit system, since minimum size is 8.
-		// Must read and update both top and bottom nibble of each byte.
+	switch {
+	default:
+		throw("heapBitsSweepSpan")
+	case size == ptrSize:
+		// Consider mark bits in all four 2-bit entries of each bitmap byte.
 		bitp := h.bitp
-		for i := uintptr(0); i < n; i += 2 {
-			x := int(*bitp)
+		for i := uintptr(0); i < n; i += 4 {
+			x := uint32(*bitp)
 			if x&bitMarked != 0 {
 				x &^= bitMarked
 			} else {
-				x &^= typeMask << typeShift
+				x &^= bitPointer
 				f(base + i*ptrSize)
 			}
+			if x&(bitMarked<<2) != 0 {
+				x &^= bitMarked << 2
+			} else {
+				x &^= bitPointer << 2
+				f(base + (i+1)*ptrSize)
+			}
 			if x&(bitMarked<<4) != 0 {
 				x &^= bitMarked << 4
 			} else {
-				x &^= typeMask << (4 + typeShift)
-				f(base + (i+1)*ptrSize)
+				x &^= bitPointer << 4
+				f(base + (i+2)*ptrSize)
+			}
+			if x&(bitMarked<<6) != 0 {
+				x &^= bitMarked << 6
+			} else {
+				x &^= bitPointer << 6
+				f(base + (i+3)*ptrSize)
 			}
 			*bitp = uint8(x)
 			bitp = subtractb(bitp, 1)
 		}
-		return
-	}
 
-	bitp := h.bitp
-	step := size / heapBitmapScale
-	for i := uintptr(0); i < n; i++ {
-		x := int(*bitp)
-		if x&bitMarked != 0 {
-			x &^= bitMarked
-		} else {
-			x = 0
-			f(base + i*size)
+	case size%(4*ptrSize) == 0:
+		// Mark bit is in first word of each object.
+		// Each object starts at bit 0 of a heap bitmap byte.
+		bitp := h.bitp
+		step := size / heapBitmapScale
+		for i := uintptr(0); i < n; i++ {
+			x := uint32(*bitp)
+			if x&bitMarked != 0 {
+				x &^= bitMarked
+			} else {
+				x = 0
+				f(base + i*size)
+			}
+			*bitp = uint8(x)
+			bitp = subtractb(bitp, step)
+		}
+
+	case size%(4*ptrSize) == 2*ptrSize:
+		// Mark bit is in first word of each object,
+		// but every other object starts halfway through a heap bitmap byte.
+		// Unroll loop 2x to handle alternating shift count and step size.
+		bitp := h.bitp
+		step := size / heapBitmapScale
+		var i uintptr
+		for i = uintptr(0); i < n; i += 2 {
+			x := uint32(*bitp)
+			if x&bitMarked != 0 {
+				x &^= bitMarked
+			} else {
+				x &^= 0x0f
+				f(base + i*size)
+				if size > 2*ptrSize {
+					x = 0
+				}
+			}
+			*bitp = uint8(x)
+			if i+1 >= n {
+				break
+			}
+			bitp = subtractb(bitp, step)
+			x = uint32(*bitp)
+			if x&(bitMarked<<4) != 0 {
+				x &^= bitMarked << 4
+			} else {
+				x &^= 0xf0
+				f(base + (i+1)*size)
+				if size > 2*ptrSize {
+					*subtractb(bitp, 1) = 0
+				}
+			}
+			*bitp = uint8(x)
+			bitp = subtractb(bitp, step+1)
 		}
-		*bitp = uint8(x)
-		bitp = subtractb(bitp, step)
 	}
 }
 
@@ -456,7 +491,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		// when initializing the span, and then the atomicor8 here
 		// goes away - heapBitsSetType would be a no-op
 		// in that case.
-		atomicor8(h.bitp, typePointer<<(typeShift+h.shift))
+		atomicor8(h.bitp, bitPointer<<h.shift)
 		return
 	}
 	if typ.kind&kindGCProg != 0 {
@@ -489,41 +524,28 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
 	}
 
-	// Copy from 1-bit ptrmask into 4-bit bitmap.
-	elemSize := typ.size
-	var v uint32 // pending byte of 4-bit bitmap; uint32 for better code gen
-	nv := 0      // number of bits added to v
-	for i := uintptr(0); i < dataSize; i += elemSize {
-		// At each word, b holds the pending bits from the 1-bit bitmap,
-		// with a sentinel 1 bit above all the actual bits.
-		// When b == 1, that means it is out of bits and needs to be refreshed.
-		// *(p+1) is the next byte to read.
-		p := ptrmask
-		b := uint32(*p) | 0x100
-		for j := uintptr(0); j < elemSize; j += ptrSize {
-			if b == 1 {
-				p = addb(p, 1)
-				b = uint32(*p) | 0x100
-			}
-			// b&1 is 1 for pointer, 0 for scalar.
-			// We want typePointer (2) or typeScalar (1), so add 1.
-			v |= ((b & 1) + 1) << (uint(nv) + typeShift)
-			b >>= 1
-			if nv += heapBitsWidth; nv == 8 {
-				*h.bitp = uint8(v)
-				h.bitp = subtractb(h.bitp, 1)
-				v = 0
-				nv = 0
-			}
+	// Copy from 1-bit ptrmask into 2-bit bitmap.
+	// If size is a multiple of 4 words, then the bitmap bytes for the object
+	// are not shared with any other object and can be written directly.
+	// On 64-bit systems, many sizes are only 16-byte aligned; half of
+	// those are not multiples of 4 words (for example, 48/8 = 6 words);
+	// those share either the leading byte or the trailing byte of their bitmaps
+	// with another object.
+	nptr := typ.size / ptrSize
+	_ = nptr
+	for i := uintptr(0); i < dataSize/ptrSize; i++ {
+		atomicand8(h.bitp, ^((bitPointer | bitMarked) << h.shift))
+		j := i % nptr
+		if (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
+			atomicor8(h.bitp, bitPointer<<h.shift)
+		}
+		if i >= 2 {
+			atomicor8(h.bitp, bitMarked<<h.shift)
 		}
+		h = h.next()
 	}
-
-	// Finish final byte of bitmap and mark next word (if any) with typeDead (0)
-	if nv != 0 {
-		*h.bitp = uint8(v)
-		h.bitp = subtractb(h.bitp, 1)
-	} else if dataSize < size {
-		*h.bitp = 0
+	if dataSize < size {
+		atomicand8(h.bitp, ^((bitPointer | bitMarked) << h.shift))
 	}
 }
 
@@ -600,7 +622,7 @@ const (
 // ppos is a pointer to position in mask, in bits.
 // sparse says to generate 4-bits per word mask for heap (1-bit for data/bss otherwise).
 //go:nowritebarrier
-func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool) *byte {
+func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace bool) *byte {
 	pos := *ppos
 	mask := (*[1 << 30]byte)(unsafe.Pointer(maskp))
 	for {
@@ -616,6 +638,8 @@ func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool)
 			for i := 0; i < siz; i++ {
 				v := p[i/8] >> (uint(i) % 8) & 1
 				if inplace {
+					throw("gc inplace")
+					const typeShift = 2
 					// Store directly into GC bitmap.
 					h := heapBitsForAddr(uintptr(unsafe.Pointer(&mask[pos])))
 					if h.shift == 0 {
@@ -624,12 +648,6 @@ func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool)
 						*h.bitp |= v << (4 + typeShift)
 					}
 					pos += ptrSize
-				} else if sparse {
-					throw("sparse")
-					// 4-bits per word, type bits in high bits
-					v <<= (pos % 8) + typeShift
-					mask[pos/8] |= v
-					pos += heapBitsWidth
 				} else {
 					// 1 bit per word, for data/bss bitmap
 					mask[pos/8] |= v << (pos % 8)
@@ -647,7 +665,7 @@ func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool)
 			prog = (*byte)(add(unsafe.Pointer(prog), ptrSize))
 			var prog1 *byte
 			for i := uintptr(0); i < siz; i++ {
-				prog1 = unrollgcprog1(&mask[0], prog, &pos, inplace, sparse)
+				prog1 = unrollgcprog1(&mask[0], prog, &pos, inplace)
 			}
 			if *prog1 != insArrayEnd {
 				throw("unrollgcprog: array does not end with insArrayEnd")
@@ -667,7 +685,7 @@ func unrollglobgcprog(prog *byte, size uintptr) bitvector {
 	mask := (*[1 << 30]byte)(persistentalloc(masksize+1, 0, &memstats.gc_sys))
 	mask[masksize] = 0xa1
 	pos := uintptr(0)
-	prog = unrollgcprog1(&mask[0], prog, &pos, false, false)
+	prog = unrollgcprog1(&mask[0], prog, &pos, false)
 	if pos != size/ptrSize {
 		print("unrollglobgcprog: bad program size, got ", pos, ", expect ", size/ptrSize, "\n")
 		throw("unrollglobgcprog: bad program size")
@@ -682,17 +700,21 @@ func unrollglobgcprog(prog *byte, size uintptr) bitvector {
 }
 
 func unrollgcproginplace_m(v unsafe.Pointer, typ *_type, size, size0 uintptr) {
+	throw("unrollinplace")
+	// TODO(rsc): Update for 1-bit bitmaps.
 	// TODO(rsc): Explain why these non-atomic updates are okay.
 	pos := uintptr(0)
 	prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
 	for pos != size0 {
-		unrollgcprog1((*byte)(v), prog, &pos, true, true)
+		unrollgcprog1((*byte)(v), prog, &pos, true)
 	}
 
 	// Mark first word as bitAllocated.
 	// Mark word after last as typeDead.
 	if size0 < size {
 		h := heapBitsForAddr(uintptr(v) + size0)
+		const typeMask = 0
+		const typeShift = 0
 		*h.bitp &^= typeMask << typeShift
 	}
 }
@@ -707,7 +729,7 @@ func unrollgcprog_m(typ *_type) {
 	if *mask == 0 {
 		pos := uintptr(8) // skip the unroll flag
 		prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
-		prog = unrollgcprog1(mask, prog, &pos, false, false)
+		prog = unrollgcprog1(mask, prog, &pos, false)
 		if *prog != insEnd {
 			throw("unrollgcprog: program does not end with insEnd")
 		}
@@ -737,26 +759,24 @@ func getgcmask(ep interface{}) (mask []byte) {
 	for datap := &firstmoduledata; datap != nil; datap = datap.next {
 		// data
 		if datap.data <= uintptr(p) && uintptr(p) < datap.edata {
+			bitmap := datap.gcdatamask.bytedata
 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
 			mask = make([]byte, n/ptrSize)
 			for i := uintptr(0); i < n; i += ptrSize {
 				off := (uintptr(p) + i - datap.data) / ptrSize
-				bits := (*addb(datap.gcdatamask.bytedata, off/8) >> (off % 8)) & 1
-				bits += 1 // convert 1-bit to 2-bit
-				mask[i/ptrSize] = bits
+				mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
 			}
 			return
 		}
 
 		// bss
 		if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss {
+			bitmap := datap.gcbssmask.bytedata
 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
 			mask = make([]byte, n/ptrSize)
 			for i := uintptr(0); i < n; i += ptrSize {
 				off := (uintptr(p) + i - datap.bss) / ptrSize
-				bits := (*addb(datap.gcbssmask.bytedata, off/8) >> (off % 8)) & 1
-				bits += 1 // convert 1-bit to 2-bit
-				mask[i/ptrSize] = bits
+				mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
 			}
 			return
 		}
@@ -768,8 +788,14 @@ func getgcmask(ep interface{}) (mask []byte) {
 	if mlookup(uintptr(p), &base, &n, nil) != 0 {
 		mask = make([]byte, n/ptrSize)
 		for i := uintptr(0); i < n; i += ptrSize {
-			bits := heapBitsForAddr(base + i).typeBits()
-			mask[i/ptrSize] = bits
+			hbits := heapBitsForAddr(base + i)
+			if hbits.isPointer() {
+				mask[i/ptrSize] = 1
+			}
+			if i >= 2*ptrSize && !hbits.isMarked() {
+				mask[i/ptrSize] = 255
+				break
+			}
 		}
 		return
 	}
@@ -801,10 +827,9 @@ func getgcmask(ep interface{}) (mask []byte) {
 		n := (*ptrtype)(unsafe.Pointer(t)).elem.size
 		mask = make([]byte, n/ptrSize)
 		for i := uintptr(0); i < n; i += ptrSize {
+			bitmap := bv.bytedata
 			off := (uintptr(p) + i - frame.varp + size) / ptrSize
-			bits := (*addb(bv.bytedata, off/8) >> (off % 8)) & 1
-			bits += 1 // convert 1-bit to 2-bit
-			mask[i/ptrSize] = bits
+			mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
 		}
 	}
 	return