path: root/src/runtime/malloc.go
author     Michael Anthony Knyszek <mknyszek@google.com>  2022-09-11 04:07:41 +0000
committer  Michael Knyszek <mknyszek@google.com>          2023-11-09 19:58:08 +0000
commit     38ac7c41aa54306c0bdc04a092838103a7d09997 (patch)
tree       fe5e6983bf2ad16ae0baec7bc7c3756497d27242 /src/runtime/malloc.go
parent     25867485a748bbefc938e66330912cd88c2f4acb (diff)
download   go-38ac7c41aa54306c0bdc04a092838103a7d09997.tar.xz
runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size classes with allocation headers for objects that contain pointers. The header consists of a single pointer to a type. All allocations with headers are treated as implicitly containing one or more instances of the type in the header.

As the name implies, headers are usually stored as the first word of an object. There are two additional exceptions to where headers are stored and how they're used.

Objects smaller than 512 bytes do not have headers. Instead, a heap bitmap is reserved at the end of spans for objects of this size. A full word of overhead is too much for these small objects. The bitmap is of the same format as the old bitmap, minus the noMorePtrs bits, which are unnecessary: all objects <512 bytes have a bitmap less than a pointer-word in size, and that was the granularity at which noMorePtrs could stop scanning early anyway.

Objects that are larger than 32 KiB (which have their own span) have their headers stored directly in the span, so that power-of-two-sized allocations do not spill over into an extra page.

The full implementation is behind GOEXPERIMENT=allocheaders.

The purpose of this change is performance. First and foremost, with headers we no longer have to unroll pointer/scalar data at allocation time for most size classes. Small size classes still need some unrolling, but their bitmaps are small, so we can optimize that case fairly well. Larger objects effectively have their pointer/scalar data unrolled on demand from type data, which is much more compactly represented and results in less TLB pressure. Furthermore, since the header is usually right next to the object and where we're about to start scanning, we get an additional temporal-locality benefit in the data cache when looking up type metadata. The pointer/scalar data is now effectively unrolled on demand, but it's also simpler to unroll than before; the unrolled data is never written anywhere, and for arrays we get the benefit of retreading the same data per element, as opposed to looking it up from scratch for each pointer-word of bitmap. Lastly, because we no longer have a heap bitmap that spans the entire heap, there's a flat 1.5% memory-use reduction. This is balanced slightly by some objects possibly being bumped up a size class, but most objects are not tightly optimized to size-class sizes, so there is memory to spare, making the header essentially free in those cases.

See the follow-up CL, which turns this experiment on by default, for benchmark results. (CL 538217.)

Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
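As a rough illustration of the three placement rules described above, here is a minimal, self-contained sketch in ordinary Go. The constant values and the helper name are illustrative assumptions for a 64-bit platform, not the CL's actual identifiers:

package main

import "fmt"

const (
	ptrSize                = 8        // assumed 64-bit pointer-word
	mallocHeaderSize       = ptrSize  // the header is a single type pointer
	minSizeForMallocHeader = 512      // below this, a span-end bitmap is used instead
	maxSmallSize           = 32 << 10 // beyond this, an object gets its own span
)

// headerPlacement reports where the type metadata for a
// pointer-containing allocation of the given size would live
// under the scheme the commit message describes.
func headerPlacement(size uintptr) string {
	switch {
	case size < minSizeForMallocHeader:
		return "span-end heap bitmap (no header word)"
	case size <= maxSmallSize-mallocHeaderSize:
		return "inline header in the first word of the object"
	default:
		return "header stored directly in the span (large object)"
	}
}

func main() {
	for _, size := range []uintptr{64, 1024, 64 << 10} {
		fmt.Printf("%6d bytes -> %s\n", size, headerPlacement(size))
	}
}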
Diffstat (limited to 'src/runtime/malloc.go')
-rw-r--r--  src/runtime/malloc.go  87
1 file changed, 68 insertions(+), 19 deletions(-)
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index a071428391..beff9043ec 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -102,6 +102,7 @@ package runtime
import (
"internal/goarch"
+ "internal/goexperiment"
"internal/goos"
"runtime/internal/atomic"
"runtime/internal/math"
@@ -424,6 +425,26 @@ func mallocinit() {
print("pagesPerArena (", pagesPerArena, ") is not divisible by pagesPerReclaimerChunk (", pagesPerReclaimerChunk, ")\n")
throw("bad pagesPerReclaimerChunk")
}
+ if goexperiment.AllocHeaders {
+ // Check that the minimum size (exclusive) for a malloc header is also
+ // a size class boundary. This is important for making sure checks align
+ // across different parts of the runtime.
+ minSizeForMallocHeaderIsSizeClass := false
+ for i := 0; i < len(class_to_size); i++ {
+ if minSizeForMallocHeader == uintptr(class_to_size[i]) {
+ minSizeForMallocHeaderIsSizeClass = true
+ break
+ }
+ }
+ if !minSizeForMallocHeaderIsSizeClass {
+ throw("min size of malloc header is not a size class boundary")
+ }
+ // Check that the pointer bitmap for all small sizes without a malloc header
+ // fits in a word.
+ if minSizeForMallocHeader/goarch.PtrSize > 8*goarch.PtrSize {
+ throw("max pointer/scan bitmap size for headerless objects is too large")
+ }
+ }
if minTagBits > taggedPointerBits {
throw("taggedPointerbits too small")
}
@@ -1016,12 +1037,22 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
throw("mallocgc called without a P or outside bootstrapping")
}
var span *mspan
+ var header **_type
var x unsafe.Pointer
noscan := typ == nil || typ.PtrBytes == 0
// In some cases block zeroing can profitably (for latency reduction purposes)
// be delayed till preemption is possible; delayedZeroing tracks that state.
delayedZeroing := false
- if size <= maxSmallSize {
+ // Determine if it's a 'small' object that goes into a size-classed span.
+ //
+ // Note: This comparison looks a little strange, but it exists to smooth out
+ // the crossover between the largest size class and large objects that have
+ // their own spans. The small window of object sizes between maxSmallSize-mallocHeaderSize
+ // and maxSmallSize will be considered large, even though they might fit in
+ // a size class. In practice this is completely fine, since the largest small
+ // size class has a single object in it already, precisely to make the transition
+ // to large objects smooth.
+ if size <= maxSmallSize-mallocHeaderSize {
if noscan && size < maxTinySize {
// Tiny allocator.
//
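A worked example of the crossover window in the comparison above (assuming maxSmallSize = 32768 and mallocHeaderSize = 8, consistent with a 64-bit platform): requests of 32761..32768 bytes fail the size <= maxSmallSize-mallocHeaderSize = 32760 test and take the large-object path, even though 32768 would fit in the largest size class. As the comment notes, spans of that class hold a single object anyway, so nothing is lost.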
@@ -1096,6 +1127,10 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
}
size = maxTinySize
} else {
+ hasHeader := !noscan && !heapBitsInSpan(size)
+ if goexperiment.AllocHeaders && hasHeader {
+ size += mallocHeaderSize
+ }
var sizeclass uint8
if size <= smallSizeMax-8 {
sizeclass = size_to_class8[divRoundUp(size, smallSizeDiv)]
@@ -1113,6 +1148,11 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
if needzero && span.needzero != 0 {
memclrNoHeapPointers(x, size)
}
+ if goexperiment.AllocHeaders && hasHeader {
+ header = (**_type)(x)
+ x = add(x, mallocHeaderSize)
+ size -= mallocHeaderSize
+ }
}
} else {
shouldhelpgc = true
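For a small, pointer-containing allocation that takes the header path in the hunk above, the slot returned by the size-classed allocator ends up laid out as sketched below (an illustrative picture, not the CL's code; widths assume an 8-byte pointer-word):

// base of size-class slot
// +----------------+---------------------------------+
// | *_type header  | object payload ("size" bytes)   |
// +----------------+---------------------------------+
// ^ header          ^ x returned to the caller
//
// The GC can later recover the payload's type with a single
// load, e.g. typ := *header.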
@@ -1128,29 +1168,30 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
delayedZeroing = true
} else {
memclrNoHeapPointers(x, size)
- // We've in theory cleared almost the whole span here,
- // and could take the extra step of actually clearing
- // the whole thing. However, don't. Any GC bits for the
- // uncleared parts will be zero, and it's just going to
- // be needzero = 1 once freed anyway.
}
}
+ if goexperiment.AllocHeaders && !noscan {
+ header = &span.largeType
+ }
}
-
if !noscan {
- var scanSize uintptr
- heapBitsSetType(uintptr(x), size, dataSize, typ)
- if dataSize > typ.Size_ {
- // Array allocation. If there are any
- // pointers, GC has to scan to the last
- // element.
- if typ.PtrBytes != 0 {
- scanSize = dataSize - typ.Size_ + typ.PtrBytes
- }
+ if goexperiment.AllocHeaders {
+ c.scanAlloc += heapSetType(uintptr(x), dataSize, typ, header, span)
} else {
- scanSize = typ.PtrBytes
+ var scanSize uintptr
+ heapBitsSetType(uintptr(x), size, dataSize, typ)
+ if dataSize > typ.Size_ {
+ // Array allocation. If there are any
+ // pointers, GC has to scan to the last
+ // element.
+ if typ.PtrBytes != 0 {
+ scanSize = dataSize - typ.Size_ + typ.PtrBytes
+ }
+ } else {
+ scanSize = typ.PtrBytes
+ }
+ c.scanAlloc += scanSize
}
- c.scanAlloc += scanSize
}
// Ensure that the stores above that initialize x to
// type-safe memory and set the heap bits occur before
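A worked example of the old scanSize computation being replaced here, using a hypothetical type chosen for illustration: allocating a 4-element array of a 32-byte type whose pointers all lie in its first 16 bytes gives dataSize = 128, typ.Size_ = 32, and typ.PtrBytes = 16, so scanSize = dataSize - typ.Size_ + typ.PtrBytes = 128 - 32 + 16 = 112. The GC scans through the last pointer of the last element but skips that element's trailing 16 scalar bytes. Under allocheaders, heapSetType returns the equivalent quantity instead.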
@@ -1176,7 +1217,12 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
// This may be racing with GC so do it atomically if there can be
// a race marking the bit.
if gcphase != _GCoff {
- gcmarknewobject(span, uintptr(x), size)
+ // Pass the full size of the allocation as the number of bytes
+ // marked.
+ //
+ // If goexperiment.AllocHeaders, "size" may not include the
+ // allocation header, so use span.elemsize unconditionally.
+ gcmarknewobject(span, uintptr(x), span.elemsize)
}
if raceenabled {
racemalloc(x, size)
@@ -1215,6 +1261,9 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
if !noscan {
throw("delayed zeroing on data that may contain pointers")
}
+ if goexperiment.AllocHeaders && header != nil {
+ throw("unexpected malloc header in delayed zeroing of large object")
+ }
memclrNoHeapPointersChunked(size, x) // This is a possible preemption point: see #47302
}
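Since the implementation is gated behind a GOEXPERIMENT, it can be exercised without any code changes, for example:

# Build a program with allocation headers enabled.
GOEXPERIMENT=allocheaders go build ./...

# Run the runtime tests under the experiment.
GOEXPERIMENT=allocheaders go test runtime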