Diffstat (limited to 'src/runtime')
-rw-r--r--  src/runtime/heapdump.go |  12
-rw-r--r--  src/runtime/malloc.go   |  77
-rw-r--r--  src/runtime/mbitmap.go  |  75
-rw-r--r--  src/runtime/mheap.go    |  88
4 files changed, 182 insertions, 70 deletions
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index dbeaed9277..b255cbbae3 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -489,9 +489,15 @@ func dumpparams() {
 	}
 	dumpint(sys.PtrSize)
 	var arenaStart, arenaEnd uintptr
-	for i, ha := range mheap_.arenas {
-		if ha != nil {
-			base := arenaBase(uint(i))
+	for i1 := range mheap_.arenas {
+		if mheap_.arenas[i1] == nil {
+			continue
+		}
+		for i, ha := range mheap_.arenas[i1] {
+			if ha == nil {
+				continue
+			}
+			base := arenaBase(arenaIdx(i1)<<arenaL1Shift | arenaIdx(i))
 			if arenaStart == 0 || base < arenaStart {
 				arenaStart = base
 			}
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 6f78455c8b..bad35116b0 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -92,8 +92,10 @@
 // Since arenas are aligned, the address space can be viewed as a
 // series of arena frames. The arena map (mheap_.arenas) maps from
 // arena frame number to *heapArena, or nil for parts of the address
-// space not backed by the Go heap. Since arenas are large, the arena
-// index is just a single-level mapping.
+// space not backed by the Go heap. The arena map is structured as a
+// two-level array consisting of a "L1" arena map and many "L2" arena
+// maps; however, since arenas are large, on many architectures, the
+// arena map consists of a single, large L2 map.
 //
 // The arena map covers the entire possible address space, allowing
 // the Go heap to use any part of the address space. The allocator
@@ -202,11 +204,6 @@ const (
 	// space because doing so is cheap.
 	// mips32 only has access to the low 2GB of virtual memory, so
 	// we further limit it to 31 bits.
-	//
-	// The size of the arena map is proportional to
-	// 1<<heapAddrBits, so it's important that this not be too
-	// large. 48 bits is about the threshold; above that we would
-	// need to go to a two level arena map.
 	heapAddrBits = _64bit*48 + (1-_64bit)*(32-(sys.GoarchMips+sys.GoarchMipsle))
 
 	// maxAlloc is the maximum size of an allocation. On 64-bit,
@@ -219,13 +216,49 @@ const (
 	// heapArenaBytes is the size of a heap arena. The heap
 	// consists of mappings of size heapArenaBytes, aligned to
 	// heapArenaBytes. The initial heap mapping is one arena.
-	heapArenaBytes = (64<<20)*_64bit + (4<<20)*(1-_64bit)
+	//
+	// This is currently 64MB on 64-bit and 4MB on 32-bit.
+	heapArenaBytes = 1 << logHeapArenaBytes
+
+	// logHeapArenaBytes is log_2 of heapArenaBytes. For clarity,
+	// prefer using heapArenaBytes where possible (we need the
+	// constant to compute some other constants).
+	logHeapArenaBytes = (6+20)*_64bit + (2+20)*(1-_64bit)
 
 	// heapArenaBitmapBytes is the size of each heap arena's bitmap.
 	heapArenaBitmapBytes = heapArenaBytes / (sys.PtrSize * 8 / 2)
 
 	pagesPerArena = heapArenaBytes / pageSize
 
+	// arenaL1Bits is the number of bits of the arena number
+	// covered by the first level arena map.
+	//
+	// This number should be small, since the first level arena
+	// map requires PtrSize*(1<<arenaL1Bits) of space in the
+	// binary's BSS. It can be zero, in which case the first level
+	// index is effectively unused. There is a performance benefit
+	// to this, since the generated code can be more efficient,
+	// but comes at the cost of having a large L2 mapping.
+	arenaL1Bits = 0
+
+	// arenaL2Bits is the number of bits of the arena number
+	// covered by the second level arena index.
+	//
+	// The size of each arena map allocation is proportional to
+	// 1<<arenaL2Bits, so it's important that this not be too
+	// large. 48 bits leads to 32MB arena index allocations, which
+	// is about the practical threshold.
+	arenaL2Bits = heapAddrBits - logHeapArenaBytes - arenaL1Bits
+
+	// arenaL1Shift is the number of bits to shift an arena frame
+	// number by to compute an index into the first level arena map.
+	arenaL1Shift = arenaL2Bits
+
+	// arenaBits is the total bits in a combined arena map index.
+	// This is split between the index into the L1 arena map and
+	// the L2 arena map.
+	arenaBits = arenaL1Bits + arenaL2Bits
+
 	// arenaBaseOffset is the pointer value that corresponds to
 	// index 0 in the heap arena map.
 	//
@@ -323,12 +356,6 @@ func mallocinit() {
 		throw("bad system page size")
 	}
 
-	// Map the arena map. Most of this will never be written to,
-	mheap_.arenas = (*[(1 << heapAddrBits) / heapArenaBytes]*heapArena)(persistentalloc(unsafe.Sizeof(*mheap_.arenas), sys.PtrSize, nil))
-	if mheap_.arenas == nil {
-		throw("failed to allocate arena map")
-	}
-
 	// Initialize the heap.
 	mheap_.init()
 	_g_ := getg()
@@ -398,7 +425,7 @@ func mallocinit() {
 		// 3. We try to stake out a reasonably large initial
 		// heap reservation.
 
-		const arenaMetaSize = unsafe.Sizeof(heapArena{}) * uintptr(len(*mheap_.arenas))
+		const arenaMetaSize = unsafe.Sizeof([1 << arenaBits]heapArena{})
 		meta := uintptr(sysReserve(nil, arenaMetaSize))
 		if meta != 0 {
 			mheap_.heapArenaAlloc.init(meta, arenaMetaSize)
@@ -476,7 +503,7 @@ func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
 		if p+n < p {
 			// We can't use this, so don't ask.
 			v = nil
-		} else if arenaIndex(p+n-1) >= uint(len(mheap_.arenas)) {
+		} else if arenaIndex(p+n-1) >= 1<<arenaBits {
 			// Outside addressable heap. Can't use.
 			v = nil
 		} else {
@@ -528,9 +555,9 @@ func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
 		p := uintptr(v)
 		if p+size < p {
 			bad = "region exceeds uintptr range"
-		} else if arenaIndex(p) >= uint(len(mheap_.arenas)) {
+		} else if arenaIndex(p) >= 1<<arenaBits {
 			bad = "base outside usable address space"
-		} else if arenaIndex(p+size-1) >= uint(len(mheap_.arenas)) {
+		} else if arenaIndex(p+size-1) >= 1<<arenaBits {
 			bad = "end outside usable address space"
 		}
 		if bad != "" {
@@ -551,7 +578,17 @@ func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
 mapped:
 	// Create arena metadata.
 	for ri := arenaIndex(uintptr(v)); ri <= arenaIndex(uintptr(v)+size-1); ri++ {
-		if h.arenas[ri] != nil {
+		l2 := h.arenas[ri.l1()]
+		if l2 == nil {
+			// Allocate an L2 arena map.
+			l2 = (*[1 << arenaL2Bits]*heapArena)(persistentalloc(unsafe.Sizeof(*l2), sys.PtrSize, nil))
+			if l2 == nil {
+				throw("out of memory allocating heap arena map")
+			}
+			atomic.StorepNoWB(unsafe.Pointer(&h.arenas[ri.l1()]), unsafe.Pointer(l2))
+		}
+
+		if l2[ri.l2()] != nil {
 			throw("arena already initialized")
 		}
 		var r *heapArena
@@ -567,7 +604,7 @@ mapped:
 		// new heap arena becomes visible before the heap lock
 		// is released (which shouldn't happen, but there's
 		// little downside to this).
-		atomic.StorepNoWB(unsafe.Pointer(&h.arenas[ri]), unsafe.Pointer(r))
+		atomic.StorepNoWB(unsafe.Pointer(&l2[ri.l2()]), unsafe.Pointer(r))
 	}
 
 	// Tell the race detector about the new heap memory.
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 85d79c685b..294e3739b7 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -332,21 +332,23 @@ func (m *markBits) advance() {
 //
 // nosplit because it is used during write barriers and must not be preempted.
 //go:nosplit
-func heapBitsForAddr(addr uintptr) heapBits {
+func heapBitsForAddr(addr uintptr) (h heapBits) {
 	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
-	off := addr / sys.PtrSize
 	arena := arenaIndex(addr)
-	ha := mheap_.arenas[arena]
+	ha := mheap_.arenas[arena.l1()][arena.l2()]
 	// The compiler uses a load for nil checking ha, but in this
 	// case we'll almost never hit that cache line again, so it
 	// makes more sense to do a value check.
 	if ha == nil {
-		// addr is not in the heap. Crash without inhibiting inlining.
-		_ = *ha
+		// addr is not in the heap. Return nil heapBits, which
+		// we expect to crash in the caller.
+		return
 	}
-	bitp := &ha.bitmap[(off/4)%heapArenaBitmapBytes]
-	last := &ha.bitmap[len(ha.bitmap)-1]
-	return heapBits{bitp, uint32(off & 3), uint32(arena), last}
+	h.bitp = &ha.bitmap[(addr/(sys.PtrSize*4))%heapArenaBitmapBytes]
+	h.shift = uint32((addr / sys.PtrSize) & 3)
+	h.arena = uint32(arena)
+	h.last = &ha.bitmap[len(ha.bitmap)-1]
+	return
 }
 
 // findObject returns the base address for the heap object containing
@@ -432,21 +434,39 @@ func (h heapBits) next() heapBits {
 		h.bitp, h.shift = add1(h.bitp), 0
 	} else {
 		// Move to the next arena.
-		h.arena++
-		a := mheap_.arenas[h.arena]
-		if a == nil {
-			// We just passed the end of the object, which
-			// was also the end of the heap. Poison h. It
-			// should never be dereferenced at this point.
-			h.bitp, h.last = nil, nil
-		} else {
-			h.bitp, h.shift = &a.bitmap[0], 0
-			h.last = &a.bitmap[len(a.bitmap)-1]
-		}
+		return h.nextArena()
 	}
 	return h
 }
 
+// nextArena advances h to the beginning of the next heap arena.
+//
+// This is a slow-path helper to next. gc's inliner knows that
+// heapBits.next can be inlined even though it calls this. This is
+// marked noinline so it doesn't get inlined into next and cause next
+// to be too big to inline.
+//
+//go:nosplit
+//go:noinline
+func (h heapBits) nextArena() heapBits {
+	h.arena++
+	ai := arenaIdx(h.arena)
+	l2 := mheap_.arenas[ai.l1()]
+	if l2 == nil {
+		// We just passed the end of the object, which
+		// was also the end of the heap. Poison h. It
+		// should never be dereferenced at this point.
+		return heapBits{}
+	}
+	ha := l2[ai.l2()]
+	if ha == nil {
+		return heapBits{}
+	}
+	h.bitp, h.shift = &ha.bitmap[0], 0
+	h.last = &ha.bitmap[len(ha.bitmap)-1]
+	return h
+}
+
 // forward returns the heapBits describing n pointer-sized words ahead of h in memory.
 // That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
 // h.forward(1) is equivalent to h.next(), just slower.
@@ -465,12 +485,13 @@ func (h heapBits) forward(n uintptr) heapBits {
 		// We're in a new heap arena.
 		past := nbitp - (uintptr(unsafe.Pointer(h.last)) + 1)
 		h.arena += 1 + uint32(past/heapArenaBitmapBytes)
-		a := mheap_.arenas[h.arena]
-		if a == nil {
-			h.bitp, h.last = nil, nil
-		} else {
+		ai := arenaIdx(h.arena)
+		if l2 := mheap_.arenas[ai.l1()]; l2 != nil && l2[ai.l2()] != nil {
+			a := l2[ai.l2()]
 			h.bitp = &a.bitmap[past%heapArenaBitmapBytes]
 			h.last = &a.bitmap[len(a.bitmap)-1]
+		} else {
+			h.bitp, h.last = nil, nil
 		}
 	}
 	return h
@@ -971,7 +992,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 	// machine instructions.
 
 	outOfPlace := false
-	if arenaIndex(x+size-1) != uint(h.arena) {
+	if arenaIndex(x+size-1) != arenaIdx(h.arena) {
 		// This object spans heap arenas, so the bitmap may be
 		// discontiguous. Unroll it into the object instead
 		// and then copy it out.
@@ -1375,12 +1396,14 @@ Phase4:
 		// x+size may not point to the heap, so back up one
 		// word and then call next().
 		end := heapBitsForAddr(x + size - sys.PtrSize).next()
-		if !outOfPlace && (end.bitp == nil || (end.shift == 0 && end.bitp == &mheap_.arenas[end.arena].bitmap[0])) {
+		endAI := arenaIdx(end.arena)
+		if !outOfPlace && (end.bitp == nil || (end.shift == 0 && end.bitp == &mheap_.arenas[endAI.l1()][endAI.l2()].bitmap[0])) {
 			// The unrolling code above walks hbitp just
 			// past the bitmap without moving to the next
 			// arena. Synthesize this for end.bitp.
-			end.bitp = addb(&mheap_.arenas[end.arena-1].bitmap[0], heapArenaBitmapBytes)
 			end.arena--
+			endAI = arenaIdx(end.arena)
+			end.bitp = addb(&mheap_.arenas[endAI.l1()][endAI.l2()].bitmap[0], heapArenaBitmapBytes)
 			end.last = nil
 		}
 		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 3460c54d72..b11853ca18 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -96,9 +96,9 @@ type mheap struct {
 	nlargefree  uint64                  // number of frees for large objects (>maxsmallsize)
 	nsmallfree  [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
 
-	// arenas is the heap arena map.
-	// arenas[(va+arenaBaseOffset)/heapArenaBytes] points to the
-	// metadata for the heap arena containing va.
+	// arenas is the heap arena map. It points to the metadata for
+	// the heap for every arena frame of the entire usable virtual
+	// address space.
 	//
 	// Use arenaIndex to compute indexes into this array.
 	//
@@ -110,9 +110,13 @@ type mheap struct {
 	// transition from nil to non-nil at any time when the lock
 	// isn't held. (Entries never transitions back to nil.)
 	//
-	// This structure is fully mapped by mallocinit, so it's safe
-	// to probe any index.
-	arenas *[(1 << heapAddrBits) / heapArenaBytes]*heapArena
+	// In general, this is a two-level mapping consisting of an L1
+	// map and possibly many L2 maps. This saves space when there
+	// are a huge number of arena frames. However, on many
+	// platforms (even 64-bit), arenaL1Bits is 0, making this
+	// effectively a single-level map. In this case, arenas[0]
+	// will never be nil.
+	arenas [1 << arenaL1Bits]*[1 << arenaL2Bits]*heapArena
 
 	// heapArenaAlloc is pre-reserved space for allocating heapArena
 	// objects. This is only used on 32-bit, where we pre-reserve
@@ -410,24 +414,48 @@ func (sc spanClass) noscan() bool {
 	return sc&1 != 0
 }
 
-// arenaIndex returns the mheap_.arenas index of the arena containing
-// metadata for p. If p is outside the range of valid heap addresses,
-// it returns an index larger than len(mheap_.arenas).
+// arenaIndex returns the index into mheap_.arenas of the arena
+// containing metadata for p. This index combines of an index into the
+// L1 map and an index into the L2 map and should be used as
+// mheap_.arenas[ai.l1()][ai.l2()].
+//
+// If p is outside the range of valid heap addresses, either l1() or
+// l2() will be out of bounds.
 //
 // It is nosplit because it's called by spanOf and several other
 // nosplit functions.
 //
 //go:nosplit
-func arenaIndex(p uintptr) uint {
-	return uint((p + arenaBaseOffset) / heapArenaBytes)
+func arenaIndex(p uintptr) arenaIdx {
+	return arenaIdx((p + arenaBaseOffset) / heapArenaBytes)
 }
 
 // arenaBase returns the low address of the region covered by heap
 // arena i.
-func arenaBase(i uint) uintptr {
+func arenaBase(i arenaIdx) uintptr {
 	return uintptr(i)*heapArenaBytes - arenaBaseOffset
 }
 
+type arenaIdx uint
+
+func (i arenaIdx) l1() uint {
+	if arenaL1Bits == 0 {
+		// Let the compiler optimize this away if there's no
+		// L1 map.
+		return 0
+	} else {
+		return uint(i) >> arenaL1Shift
+	}
+}
+
+func (i arenaIdx) l2() uint {
+	if arenaL1Bits == 0 {
+		return uint(i)
+	} else {
+		return uint(i) & (1<<arenaL2Bits - 1)
+	}
+}
+
 // inheap reports whether b is a pointer into a (potentially dead) heap object.
 // It returns false for pointers into _MSpanManual spans.
 // Non-preemptible because it is used by write barriers.
@@ -467,14 +495,28 @@ func inHeapOrStack(b uintptr) bool {
 //
 //go:nosplit
 func spanOf(p uintptr) *mspan {
-	if p < minLegalPointer {
-		return nil
-	}
+	// This function looks big, but we use a lot of constant
+	// folding around arenaL1Bits to get it under the inlining
+	// budget. Also, many of the checks here are safety checks
+	// that Go needs to do anyway, so the generated code is quite
+	// short.
 	ri := arenaIndex(p)
-	if ri >= uint(len(mheap_.arenas)) {
+	if arenaL1Bits == 0 {
+		// If there's no L1, then ri.l1() can't be out of bounds but ri.l2() can.
+		if ri.l2() >= uint(len(mheap_.arenas[0])) {
+			return nil
+		}
+	} else {
+		// If there's an L1, then ri.l1() can be out of bounds but ri.l2() can't.
+		if ri.l1() >= uint(len(mheap_.arenas)) {
+			return nil
+		}
+	}
+	l2 := mheap_.arenas[ri.l1()]
+	if arenaL1Bits != 0 && l2 == nil { // Should never happen if there's no L1.
 		return nil
 	}
-	ha := mheap_.arenas[ri]
+	ha := l2[ri.l2()]
 	if ha == nil {
 		return nil
 	}
@@ -488,7 +530,8 @@ func spanOf(p uintptr) *mspan {
 //
 //go:nosplit
 func spanOfUnchecked(p uintptr) *mspan {
-	return mheap_.arenas[arenaIndex(p)].spans[(p/pageSize)%pagesPerArena]
+	ai := arenaIndex(p)
+	return mheap_.arenas[ai.l1()][ai.l2()].spans[(p/pageSize)%pagesPerArena]
 }
 
 // spanOfHeap is like spanOf, but returns nil if p does not point to a
@@ -763,18 +806,21 @@ func (h *mheap) allocManual(npage uintptr, stat *uint64) *mspan {
 
 // setSpan modifies the span map so spanOf(base) is s.
 func (h *mheap) setSpan(base uintptr, s *mspan) {
-	h.arenas[arenaIndex(base)].spans[(base/pageSize)%pagesPerArena] = s
+	ai := arenaIndex(base)
+	h.arenas[ai.l1()][ai.l2()].spans[(base/pageSize)%pagesPerArena] = s
}
 
 // setSpans modifies the span map so [spanOf(base), spanOf(base+npage*pageSize))
 // is s.
 func (h *mheap) setSpans(base, npage uintptr, s *mspan) {
 	p := base / pageSize
-	ha := h.arenas[arenaIndex(base)]
+	ai := arenaIndex(base)
+	ha := h.arenas[ai.l1()][ai.l2()]
 	for n := uintptr(0); n < npage; n++ {
 		i := (p + n) % pagesPerArena
 		if i == 0 {
-			ha = h.arenas[arenaIndex(base+n*pageSize)]
+			ai = arenaIndex(base + n*pageSize)
+			ha = h.arenas[ai.l1()][ai.l2()]
 		}
 		ha.spans[i] = s
 	}
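The heart of the change is that an arena frame number is now split into a high "L1" part and a low "L2" part, and a lookup walks mheap_.arenas[ai.l1()][ai.l2()], allocating the L2 map lazily the first time an arena in its range is mapped. The standalone sketch below models that indexing scheme outside the runtime; the constants, the non-zero arenaL1Bits, and the arenaMeta/lookup names are illustrative stand-ins (the real map lives in mheap_, uses persistentalloc, and applies arenaBaseOffset), so it is a model of the scheme rather than runtime code.

package main

import "fmt"

// Constants mirroring the 64-bit layout in the patch (48-bit address
// space, 64MB arenas). arenaL1Bits is 0 in the patch on most platforms;
// it is non-zero here only so both levels are exercised.
const (
	heapAddrBits      = 48
	logHeapArenaBytes = 26 // 64MB arenas
	arenaL1Bits       = 6
	arenaL2Bits       = heapAddrBits - logHeapArenaBytes - arenaL1Bits
	arenaL1Shift      = arenaL2Bits
	heapArenaBytes    = 1 << logHeapArenaBytes
)

// arenaMeta stands in for the runtime's per-arena *heapArena metadata.
type arenaMeta struct{ base uintptr }

// arenas models mheap_.arenas: a small L1 array of pointers to large L2 arrays.
var arenas [1 << arenaL1Bits]*[1 << arenaL2Bits]*arenaMeta

// arenaIndex maps an address to its combined arena frame number
// (arenaBaseOffset is omitted in this model).
func arenaIndex(p uintptr) uint { return uint(p >> logHeapArenaBytes) }

// l1 and l2 split a combined index, as arenaIdx.l1/l2 do in the patch.
func l1(i uint) uint { return i >> arenaL1Shift }
func l2(i uint) uint { return i & (1<<arenaL2Bits - 1) }

// lookup walks both levels, returning nil if either level is unmapped.
func lookup(p uintptr) *arenaMeta {
	i := arenaIndex(p)
	l2map := arenas[l1(i)]
	if l2map == nil {
		return nil
	}
	return l2map[l2(i)]
}

func main() {
	p := uintptr(0x7f3a12345678)
	i := arenaIndex(p)
	// Install metadata for p's arena, allocating the L2 map on demand,
	// analogous to what sysAlloc does with persistentalloc in the patch.
	if arenas[l1(i)] == nil {
		arenas[l1(i)] = new([1 << arenaL2Bits]*arenaMeta)
	}
	arenas[l1(i)][l2(i)] = &arenaMeta{base: p &^ (heapArenaBytes - 1)}

	fmt.Printf("index=%#x l1=%#x l2=%#x base=%#x\n", i, l1(i), l2(i), lookup(p).base)
}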

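The arenaL2Bits comment's claim that "48 bits leads to 32MB arena index allocations" is a size calculation: with heapAddrBits = 48, 64MB arenas (logHeapArenaBytes = 26), and arenaL1Bits = 0, the single L2 map has 2^22 pointer-sized entries. A quick check of that arithmetic, using the 64-bit values quoted above (not runtime code):

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	const (
		heapAddrBits      = 48
		logHeapArenaBytes = 26 // 64MB arenas on 64-bit
		arenaL1Bits       = 0  // as in the patch on most platforms
		arenaL2Bits       = heapAddrBits - logHeapArenaBytes - arenaL1Bits
	)
	// With no L1 level, the lone L2 map holds one pointer per possible
	// arena frame: 2^22 entries * 8 bytes = 32MB.
	l2Entries := 1 << arenaL2Bits
	fmt.Printf("L2 entries: %d (2^%d)\n", l2Entries, arenaL2Bits)
	fmt.Printf("L2 map size: %d MB\n", l2Entries*int(unsafe.Sizeof(uintptr(0)))/(1<<20))
}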