author    Michael Anthony Knyszek <mknyszek@google.com>  2022-09-23 16:32:34 +0000
committer Michael Knyszek <mknyszek@google.com>  2023-04-19 14:30:00 +0000
commit    8fa9e3beee8b0e6baa7333740996181268b60a3a (patch)
tree      988a2f3ce70d3b76062f433dd1e98024ccb2dd81 /src/runtime/mpagealloc.go
parent    1f9d80e33165dfb169d1ee82ca0021484951d3bb (diff)
download  go-8fa9e3beee8b0e6baa7333740996181268b60a3a.tar.xz
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not,
using heuristics based on density. The motivation behind this change is
twofold:

1. In default Linux configurations, khugepaged can recoalesce hugepages
   even after the scavenger breaks them up, resulting in significant
   overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
   heuristics appear to have bit-rotted and result in haphazard
   hugepage management. Unlucky (but otherwise fairly dense) regions of
   memory end up not backed by huge pages, while sparse regions end up
   accidentally marked MADV_HUGEPAGE and are not later broken up by the
   scavenger, because it already got the memory it needed from denser
   sections (this is more likely to happen with small heaps that go
   idle).

In this change, the runtime uses a new policy:

1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
   cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
   scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.

This policy is intended to back memory that is a good candidate for
huge pages (high occupancy) with huge pages, and to give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also by occupancy in the near
future. It's generally true that by the end of a GC cycle the heap gets
quite dense (from the perspective of the page allocator).

Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging, in order
to break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger
avoids releasing memory in dense page chunks. All this together means
the scavenger will now generally release memory on a ~1 GC cycle delay.

Notably, this has implications for scavenging to maintain the memory
limit and for the runtime/debug.FreeOSMemory API. This change makes it
so that in these cases all memory is visible to the scavenger
regardless of sparseness, and delays the page allocator in re-marking
this memory with MADV_NOHUGEPAGE for around 1 GC cycle to mitigate
churn.

The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.

Fixes #55328.

Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
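[Editor's note: the density test and one-cycle hysteresis described
above can be illustrated with a small, self-contained Go sketch. This
is not the runtime's actual scavengeIndex implementation (which tracks
occupancy with atomics under runtime locks, and appears in the diff
below as p.scav.index); chunkState, denseThreshold, and endGCCycle are
hypothetical names. Only the 96% threshold, the 4 MiB / 512-page chunk
size, and the one-cycle delay are taken from the commit message.]

    package main

    import "fmt"

    const (
            pallocChunkPages = 512 // 4 MiB chunk / 8 KiB pages, as in the runtime
            // A chunk counts as "dense" once 96% of its pages are in use.
            denseThreshold = pallocChunkPages * 96 / 100
    )

    // chunkState tracks, per 4 MiB chunk, whether the chunk became
    // dense at any point during the current GC cycle.
    type chunkState struct {
            inUse     int  // pages currently allocated in this chunk
            everDense bool // became dense at some point this GC cycle
    }

    // alloc records npages newly allocated in the chunk and updates
    // the density mark (step 2 of the policy).
    func (c *chunkState) alloc(npages int) {
            c.inUse += npages
            if c.inUse >= denseThreshold {
                    c.everDense = true
            }
    }

    // free records npages released back to the page allocator.
    func (c *chunkState) free(npages int) {
            c.inUse -= npages
    }

    // endGCCycle implements the hysteresis: a chunk that was dense at
    // some point this cycle stays hidden from the scavenger for one
    // more cycle; otherwise it becomes scavengable, and the scavenger
    // will mark it MADV_NOHUGEPAGE just before releasing its pages.
    func (c *chunkState) endGCCycle() bool {
            visible := !c.everDense
            c.everDense = c.inUse >= denseThreshold // reset for next cycle
            return visible
    }

    func main() {
            var c chunkState
            c.alloc(500)                // 500/512 ~ 97.6% occupancy: dense
            c.free(400)                 // heap shrinks mid-cycle
            fmt.Println(c.endGCCycle()) // false: hysteresis keeps it hidden
            fmt.Println(c.endGCCycle()) // true: not dense for a full cycle
    }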
Diffstat (limited to 'src/runtime/mpagealloc.go')
-rw-r--r--  src/runtime/mpagealloc.go  38
1 file changed, 26 insertions, 12 deletions
diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go
index 4f35cafc24..da1b14e5a4 100644
--- a/src/runtime/mpagealloc.go
+++ b/src/runtime/mpagealloc.go
@@ -257,11 +257,9 @@ type pageAlloc struct {
// known by the page allocator to be currently in-use (passed
// to grow).
//
- // This field is currently unused on 32-bit architectures but
- // is harmless to track. We care much more about having a
- // contiguous heap in these cases and take additional measures
- // to ensure that, so in nearly all cases this should have just
- // 1 element.
+ // We care much more about having a contiguous heap in these cases
+ // and take additional measures to ensure that, so in nearly all
+ // cases this should have just 1 element.
//
// All access is protected by the mheapLock.
inUse addrRanges
@@ -300,7 +298,7 @@ type pageAlloc struct {
test bool
}
-func (p *pageAlloc) init(mheapLock *mutex, sysStat *sysMemStat) {
+func (p *pageAlloc) init(mheapLock *mutex, sysStat *sysMemStat, test bool) {
if levelLogPages[0] > logMaxPackedValue {
// We can't represent 1<<levelLogPages[0] pages, the maximum number
// of pages we need to represent at the root level, in a summary, which
@@ -315,13 +313,17 @@ func (p *pageAlloc) init(mheapLock *mutex, sysStat *sysMemStat) {
p.inUse.init(sysStat)
// System-dependent initialization.
- p.sysInit()
+ p.sysInit(test)
// Start with the searchAddr in a state indicating there's no free memory.
p.searchAddr = maxSearchAddr()
// Set the mheapLock.
p.mheapLock = mheapLock
+
+ // Set if we're in a test.
+ p.test = test
+ p.scav.index.test = test
}
// tryChunkOf returns the bitmap data for the given chunk.
@@ -415,6 +417,11 @@ func (p *pageAlloc) grow(base, size uintptr) {
// we need to ensure this newly-free memory is visible in the
// summaries.
p.update(base, size/pageSize, true, false)
+
+ // Mark all new memory as huge page eligible.
+ if !p.test {
+ sysHugePage(unsafe.Pointer(base), size)
+ }
}
// enableChunkHugePages enables huge pages for the chunk bitmap mappings (disabled by default).
@@ -568,19 +575,23 @@ func (p *pageAlloc) allocRange(base, npages uintptr) uintptr {
chunk := p.chunkOf(sc)
scav += chunk.scavenged.popcntRange(si, ei+1-si)
chunk.allocRange(si, ei+1-si)
+ p.scav.index.alloc(sc, ei+1-si)
} else {
// The range crosses at least one chunk boundary.
chunk := p.chunkOf(sc)
scav += chunk.scavenged.popcntRange(si, pallocChunkPages-si)
chunk.allocRange(si, pallocChunkPages-si)
+ p.scav.index.alloc(sc, pallocChunkPages-si)
for c := sc + 1; c < ec; c++ {
chunk := p.chunkOf(c)
scav += chunk.scavenged.popcntRange(0, pallocChunkPages)
chunk.allocAll()
+ p.scav.index.alloc(c, pallocChunkPages)
}
chunk = p.chunkOf(ec)
scav += chunk.scavenged.popcntRange(0, ei+1)
chunk.allocRange(0, ei+1)
+ p.scav.index.alloc(ec, ei+1)
}
p.update(base, npages, true, true)
return uintptr(scav) * pageSize
@@ -914,7 +925,7 @@ Found:
// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
-func (p *pageAlloc) free(base, npages uintptr, scavenged bool) {
+func (p *pageAlloc) free(base, npages uintptr) {
assertLockHeld(p.mheapLock)
// If we're freeing pages below the p.searchAddr, update searchAddr.
@@ -922,14 +933,13 @@ func (p *pageAlloc) free(base, npages uintptr, scavenged bool) {
p.searchAddr = b
}
limit := base + npages*pageSize - 1
- if !scavenged {
- p.scav.index.mark(base, limit+1)
- }
if npages == 1 {
// Fast path: we're clearing a single bit, and we know exactly
// where it is, so mark it directly.
i := chunkIndex(base)
- p.chunkOf(i).free1(chunkPageIndex(base))
+ pi := chunkPageIndex(base)
+ p.chunkOf(i).free1(pi)
+ p.scav.index.free(i, pi, 1)
} else {
// Slow path: we're clearing more bits so we may need to iterate.
sc, ec := chunkIndex(base), chunkIndex(limit)
@@ -938,13 +948,17 @@ func (p *pageAlloc) free(base, npages uintptr, scavenged bool) {
if sc == ec {
// The range doesn't cross any chunk boundaries.
p.chunkOf(sc).free(si, ei+1-si)
+ p.scav.index.free(sc, si, ei+1-si)
} else {
// The range crosses at least one chunk boundary.
p.chunkOf(sc).free(si, pallocChunkPages-si)
+ p.scav.index.free(sc, si, pallocChunkPages-si)
for c := sc + 1; c < ec; c++ {
p.chunkOf(c).freeAll()
+ p.scav.index.free(c, 0, pallocChunkPages)
}
p.chunkOf(ec).free(0, ei+1)
+ p.scav.index.free(ec, 0, ei+1)
}
}
p.update(base, npages, true, false)
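
[Editor's note: for readers unfamiliar with the madvise flags this
patch depends on, here is a minimal, Linux-only user-space sketch of
the kernel interface behind sysHugePage (called in the grow hunk above)
and the scavenger's MADV_NOHUGEPAGE step. This is not runtime code: the
runtime issues the equivalent madvise syscalls directly, while this
standalone program assumes golang.org/x/sys/unix.]

    package main

    import (
            "log"

            "golang.org/x/sys/unix"
    )

    func main() {
            const size = 4 << 20 // one 4 MiB page-allocator chunk

            // Reserve an anonymous mapping, standing in for heap memory
            // the runtime obtains from mmap.
            mem, err := unix.Mmap(-1, 0, size,
                    unix.PROT_READ|unix.PROT_WRITE,
                    unix.MAP_ANON|unix.MAP_PRIVATE)
            if err != nil {
                    log.Fatalf("mmap: %v", err)
            }
            defer unix.Munmap(mem)

            // Step 1 of the policy: new memory is marked eligible for
            // huge pages, so the kernel may back it with a huge page.
            if err := unix.Madvise(mem, unix.MADV_HUGEPAGE); err != nil {
                    log.Fatalf("madvise(MADV_HUGEPAGE): %v", err)
            }

            // Step 4: just before scavenging a sparse chunk, forbid huge
            // pages so khugepaged does not recoalesce the pages the
            // scavenger is about to free.
            if err := unix.Madvise(mem, unix.MADV_NOHUGEPAGE); err != nil {
                    log.Fatalf("madvise(MADV_NOHUGEPAGE): %v", err)
            }

            // The actual release of physical memory then uses a flag
            // such as MADV_DONTNEED.
            if err := unix.Madvise(mem, unix.MADV_DONTNEED); err != nil {
                    log.Fatalf("madvise(MADV_DONTNEED): %v", err)
            }
    }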