From 42019613df2d9b6ad39e8ccf80861e75666025a0 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Thu, 23 Jul 2020 21:02:05 +0000
Subject: runtime: make distributed/local malloc stats the source-of-truth

This change makes it so that various local malloc stats (excluding
heap_scan and local_tinyallocs) are no longer written first to mheap
fields but are instead accessed directly from each mcache.

This change is part of a move toward having stats be distributed, and
cleaning up some old code related to the stats.

Note that because there's no central source-of-truth, when an mcache
dies, it must donate its stats to another mcache. It's always safe to
donate to the mcache for the 0th P, so do that.

Change-Id: I2556093dbc27357cb9621c9b97671f3c00aa1173
Reviewed-on: https://go-review.googlesource.com/c/go/+/246964
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/mcache.go | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

(limited to 'src/runtime/mcache.go')

diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 7a7d33ccae..5baa7b3da8 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -41,7 +41,13 @@ type mcache struct {
 
 	stackcache [_NumStackOrders]stackfreelist
 
-	// Local allocator stats, flushed during GC.
+	// Allocator stats (source-of-truth).
+	// Only the P that owns this mcache may write to these
+	// variables, so it's safe for that P to read non-atomically.
+	//
+	// When read with stats from other mcaches and with the world
+	// stopped, the result will accurately reflect the state of the
+	// application.
 	local_largefree  uintptr                  // bytes freed for large objects (>maxsmallsize)
 	local_nlargefree uintptr                  // number of frees for large objects (>maxsmallsize)
 	local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
@@ -97,7 +103,13 @@ func allocmcache() *mcache {
 	return c
 }
 
-func freemcache(c *mcache) {
+// freemcache releases resources associated with this
+// mcache and puts the object onto a free list.
+//
+// In some cases there is no way to simply release
+// resources, such as statistics, so donate them to
+// a different mcache (the recipient).
+func freemcache(c *mcache, recipient *mcache) {
 	systemstack(func() {
 		c.releaseAll()
 		stackcache_clear(c)
@@ -109,11 +121,26 @@ func freemcache(c *mcache) {
 
 		lock(&mheap_.lock)
 		purgecachedstats(c)
+		// Donate anything else that's left.
+		c.donate(recipient)
 		mheap_.cachealloc.free(unsafe.Pointer(c))
 		unlock(&mheap_.lock)
 	})
 }
 
+// donate flushes data and resources which have no global
+// pool to another mcache.
+func (c *mcache) donate(d *mcache) {
+	d.local_largefree += c.local_largefree
+	c.local_largefree = 0
+	d.local_nlargefree += c.local_nlargefree
+	c.local_nlargefree = 0
+	for i := range c.local_nsmallfree {
+		d.local_nsmallfree[i] += c.local_nsmallfree[i]
+		c.local_nsmallfree[i] = 0
+	}
+}
+
 // refill acquires a new span of span class spc for c. This span will
 // have at least one free object. The current span in c must be full.
 //
-- 
cgit v1.3


From e63716bc76d3264f669843434bc365a78f2141d2 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Thu, 23 Jul 2020 21:10:29 +0000
Subject: runtime: make nlargealloc and largealloc mcache fields

This change makes nlargealloc and largealloc into mcache fields just
like nlargefree and largefree. These local fields become the new
source-of-truth. This change also moves the accounting for these fields
out of allocSpan (which is an inappropriate place for it -- this
accounting generally happens much closer to the point of allocation) and
into largeAlloc. This move is partially possible now that we can call
gcController.revise at that point.

Furthermore, this change moves largeAlloc into mcache.go and makes it a
method of mcache. While there's a little bit of a mismatch here because
largeAlloc barely interacts with the mcache, it helps solidify the
mcache as the first allocation layer and provides a clear place to
aggregate and manage statistics.

Change-Id: I37b5e648710733bb4c04430b71e96700e438587a
Reviewed-on: https://go-review.googlesource.com/c/go/+/246965
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/malloc.go | 33 +------------------------------
 src/runtime/mcache.go | 54 ++++++++++++++++++++++++++++++++++++++++++++++++---
 src/runtime/mheap.go  | 18 +----------------
 src/runtime/mstats.go |  4 ++--
 4 files changed, 55 insertions(+), 54 deletions(-)

(limited to 'src/runtime/mcache.go')

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index b19d1f2671..ec601ccb39 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -1082,9 +1082,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 		}
 	} else {
 		shouldhelpgc = true
-		systemstack(func() {
-			span = largeAlloc(size, needzero, noscan)
-		})
+		span = c.largeAlloc(size, needzero, noscan)
 		span.freeindex = 1
 		span.allocCount = 1
 		x = unsafe.Pointer(span.base())
@@ -1179,35 +1177,6 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 	return x
 }
 
-func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
-	// print("largeAlloc size=", size, "\n")
-
-	if size+_PageSize < size {
-		throw("out of memory")
-	}
-	npages := size >> _PageShift
-	if size&_PageMask != 0 {
-		npages++
-	}
-
-	// Deduct credit for this span allocation and sweep if
-	// necessary. mHeap_Alloc will also sweep npages, so this only
-	// pays the debt down to npage pages.
-	deductSweepCredit(npages*_PageSize, npages)
-
-	spc := makeSpanClass(0, noscan)
-	s := mheap_.alloc(npages, spc, needzero)
-	if s == nil {
-		throw("out of memory")
-	}
-	// Put the large span in the mcentral swept list so that it's
-	// visible to the background sweeper.
-	mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
-	s.limit = s.base() + size
-	heapBitsForAddr(s.base()).initSpan(s)
-	return s
-}
-
 // implementation of new builtin
 // compiler (both frontend and SSA backend) knows the signature
 // of this function
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 5baa7b3da8..3657c0b86a 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -10,6 +10,7 @@ import (
 )
 
 // Per-thread (in Go, per-P) cache for small objects.
+// This includes a small object cache and local allocation stats.
 // No locking needed because it is per-thread (per-P).
 //
 // mcaches are allocated from non-GC'd memory, so any heap pointers
@@ -48,9 +49,11 @@ type mcache struct {
 	// When read with stats from other mcaches and with the world
 	// stopped, the result will accurately reflect the state of the
 	// application.
-	local_largefree  uintptr                  // bytes freed for large objects (>maxsmallsize)
-	local_nlargefree uintptr                  // number of frees for large objects (>maxsmallsize)
-	local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
+	local_largealloc  uintptr                  // bytes allocated for large objects
+	local_nlargealloc uintptr                  // number of large object allocations
+	local_largefree   uintptr                  // bytes freed for large objects (>maxsmallsize)
+	local_nlargefree  uintptr                  // number of frees for large objects (>maxsmallsize)
+	local_nsmallfree  [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
 
 	// flushGen indicates the sweepgen during which this mcache
 	// was last flushed. If flushGen != mheap_.sweepgen, the spans
@@ -131,6 +134,10 @@ func freemcache(c *mcache, recipient *mcache) {
 // donate flushes data and resources which have no global
 // pool to another mcache.
 func (c *mcache) donate(d *mcache) {
+	d.local_largealloc += c.local_largealloc
+	c.local_largealloc = 0
+	d.local_nlargealloc += c.local_nlargealloc
+	c.local_nlargealloc = 0
 	d.local_largefree += c.local_largefree
 	c.local_largefree = 0
 	d.local_nlargefree += c.local_nlargefree
@@ -178,6 +185,47 @@ func (c *mcache) refill(spc spanClass) {
 	c.alloc[spc] = s
 }
 
+// largeAlloc allocates a span for a large object.
+func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
+	if size+_PageSize < size {
+		throw("out of memory")
+	}
+	npages := size >> _PageShift
+	if size&_PageMask != 0 {
+		npages++
+	}
+
+	// Deduct credit for this span allocation and sweep if
+	// necessary. mHeap_Alloc will also sweep npages, so this only
+	// pays the debt down to npage pages.
+	deductSweepCredit(npages*_PageSize, npages)
+
+	spc := makeSpanClass(0, noscan)
+	s := mheap_.alloc(npages, spc, needzero)
+	if s == nil {
+		throw("out of memory")
+	}
+	c.local_largealloc += npages * pageSize
+	c.local_nlargealloc++
+
+	// Update heap_live and revise pacing if needed.
+	atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
+	if trace.enabled {
+		// Trace that a heap alloc occurred because heap_live changed.
+		traceHeapAlloc()
+	}
+	if gcBlackenEnabled != 0 {
+		gcController.revise()
+	}
+
+	// Put the large span in the mcentral swept list so that it's
+	// visible to the background sweeper.
+	mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
+	s.limit = s.base() + size
+	heapBitsForAddr(s.base()).initSpan(s)
+	return s
+}
+
 func (c *mcache) releaseAll() {
 	for i := range c.alloc {
 		s := c.alloc[i]
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 1b41b204ab..5635dc6784 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -128,10 +128,6 @@ type mheap struct {
 	// This is accessed atomically.
 	reclaimCredit uintptr
 
-	// Malloc stats.
-	largealloc  uint64 // bytes allocated for large objects
-	nlargealloc uint64 // number of large object allocations
-
 	// arenas is the heap arena map. It points to the metadata for
 	// the heap for every arena frame of the entire usable virtual
 	// address space.
@@ -1170,14 +1166,7 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS
 		memstats.tinyallocs += uint64(c.local_tinyallocs)
 		c.local_tinyallocs = 0
 
-		// Do some additional accounting if it's a large allocation.
-		if spanclass.sizeclass() == 0 {
-			mheap_.largealloc += uint64(npages * pageSize)
-			mheap_.nlargealloc++
-			atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
-		}
-
-		// Either heap_live or heap_scan could have been updated.
+		// heap_scan was been updated.
 		if gcBlackenEnabled != 0 {
 			gcController.revise()
 		}
@@ -1277,11 +1266,6 @@ HaveSpan:
 
 		// Update related page sweeper stats.
 		atomic.Xadd64(&h.pagesInUse, int64(npages))
-
-		if trace.enabled {
-			// Trace that a heap alloc occurred.
-			traceHeapAlloc()
-		}
 	}
 
 	// Make sure the newly allocated span will be observed
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index d81d2ebe81..d9acb361d5 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -578,6 +578,8 @@ func updatememstats() {
 			continue
 		}
 		// Collect large allocation stats.
+		memstats.nmalloc += uint64(c.local_nlargealloc)
+		totalAlloc += uint64(c.local_largealloc)
 		totalFree += uint64(c.local_largefree)
 		memstats.nfree += uint64(c.local_nlargefree)
 
@@ -589,8 +591,6 @@ func updatememstats() {
 		}
 	}
 	// Collect remaining large allocation stats.
-	memstats.nmalloc += mheap_.nlargealloc
-	totalAlloc += mheap_.largealloc
 
 	totalFree += smallFree
 
-- 
cgit v1.3


From a5088e76f108f6470d2a9b3ac56a58ddb9376e4f Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Thu, 23 Jul 2020 22:07:44 +0000
Subject: runtime: remove mcentral.nmalloc and add mcache.local_nsmallalloc

This change removes mcentral.nmalloc and adds mcache.local_nsmallalloc
which fulfills the same role but may be accessed non-atomically. It also
moves responsibility for updating heap_live and local_nsmallalloc into
mcache functions.

As a result of this change, mcache is now the sole source-of-truth for
malloc stats. It is also solely responsible for updating heap_live and
performing the various operations required as a result of updating
heap_live. The overall improvement here is in code organization:
previously malloc stats were fairly scattered, and now they have one
single home, and nearly all the required manipulations exist in a single
file.

Change-Id: I7e93fa297c1debf17e3f2a0d68aeed28a9c6af00
Reviewed-on: https://go-review.googlesource.com/c/go/+/246966
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/mcache.go   | 34 ++++++++++++++++++++++++++++++++++
 src/runtime/mcentral.go | 41 +----------------------------------------
 src/runtime/mstats.go   | 18 ++++++------------
 3 files changed, 41 insertions(+), 52 deletions(-)

(limited to 'src/runtime/mcache.go')

diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 3657c0b86a..4d2ba6dff0 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -51,6 +51,7 @@ type mcache struct {
 	// application.
 	local_largealloc  uintptr                  // bytes allocated for large objects
 	local_nlargealloc uintptr                  // number of large object allocations
+	local_nsmallalloc [_NumSizeClasses]uintptr // number of allocs for small objects
 	local_largefree   uintptr                  // bytes freed for large objects (>maxsmallsize)
 	local_nlargefree  uintptr                  // number of frees for large objects (>maxsmallsize)
 	local_nsmallfree  [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
@@ -138,6 +139,10 @@ func (c *mcache) donate(d *mcache) {
 	c.local_largealloc = 0
 	d.local_nlargealloc += c.local_nlargealloc
 	c.local_nlargealloc = 0
+	for i := range c.local_nsmallalloc {
+		d.local_nsmallalloc[i] += c.local_nsmallalloc[i]
+		c.local_nsmallalloc[i] = 0
+	}
 	d.local_largefree += c.local_largefree
 	c.local_largefree = 0
 	d.local_nlargefree += c.local_nlargefree
@@ -182,6 +187,20 @@ func (c *mcache) refill(spc spanClass) {
 	// sweeping in the next sweep phase.
 	s.sweepgen = mheap_.sweepgen + 3
 
+	// Assume all objects from this span will be allocated in the
+	// mcache. If it gets uncached, we'll adjust this.
+	c.local_nsmallalloc[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount)
+	usedBytes := uintptr(s.allocCount) * s.elemsize
+	atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes))
+	if trace.enabled {
+		// heap_live changed.
+		traceHeapAlloc()
+	}
+	if gcBlackenEnabled != 0 {
+		// heap_live changed.
+		gcController.revise()
+	}
+
 	c.alloc[spc] = s
 }
 
@@ -227,9 +246,24 @@ func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
 }
 
 func (c *mcache) releaseAll() {
+	sg := mheap_.sweepgen
 	for i := range c.alloc {
 		s := c.alloc[i]
 		if s != &emptymspan {
+			// Adjust nsmallalloc in case the span wasn't fully allocated.
+			n := uintptr(s.nelems) - uintptr(s.allocCount)
+			c.local_nsmallalloc[spanClass(i).sizeclass()] -= n
+			if s.sweepgen != sg+1 {
+				// refill conservatively counted unallocated slots in heap_live.
+				// Undo this.
+				//
+				// If this span was cached before sweep, then
+				// heap_live was totally recomputed since
+				// caching this span, so we don't do this for
+				// stale spans.
+				atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
+			}
+			// Release the span to the mcentral.
 			mheap_.central[i].mcentral.uncacheSpan(s)
 			c.alloc[i] = &emptymspan
 		}
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index ed49e01677..97fe92c2ab 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -44,11 +44,6 @@ type mcentral struct {
 	// encounter swept spans, and these should be ignored.
 	partial [2]spanSet // list of spans with a free object
 	full    [2]spanSet // list of spans with no free objects
-
-	// nmalloc is the cumulative count of objects allocated from
-	// this mcentral, assuming all spans in mcaches are
-	// fully-allocated. Written atomically, read under STW.
-	nmalloc uint64
 }
 
 // Initialize a single central free list.
@@ -178,19 +173,6 @@ havespan:
 	if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
 		throw("span has no free objects")
 	}
-	// Assume all objects from this span will be allocated in the
-	// mcache. If it gets uncached, we'll adjust this.
-	atomic.Xadd64(&c.nmalloc, int64(n))
-	usedBytes := uintptr(s.allocCount) * s.elemsize
-	atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
-	if trace.enabled {
-		// heap_live changed.
-		traceHeapAlloc()
-	}
-	if gcBlackenEnabled != 0 {
-		// heap_live changed.
-		gcController.revise()
-	}
 	freeByteBase := s.freeindex &^ (64 - 1)
 	whichByte := freeByteBase / 8
 	// Init alloc bits cache.
@@ -228,27 +210,6 @@ func (c *mcentral) uncacheSpan(s *mspan) {
 		// Indicate that s is no longer cached.
 		atomic.Store(&s.sweepgen, sg)
 	}
-	n := int(s.nelems) - int(s.allocCount)
-
-	// Fix up statistics.
-	if n > 0 {
-		// cacheSpan updated alloc assuming all objects on s
-		// were going to be allocated. Adjust for any that
-		// weren't. We must do this before potentially
-		// sweeping the span.
-		atomic.Xadd64(&c.nmalloc, -int64(n))
-
-		if !stale {
-			// (*mcentral).cacheSpan conservatively counted
-			// unallocated slots in heap_live. Undo this.
-			//
-			// If this span was cached before sweep, then
-			// heap_live was totally recomputed since
-			// caching this span, so we don't do this for
-			// stale spans.
-			atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
-		}
-	}
 
 	// Put the span in the appropriate place.
 	if stale {
@@ -256,7 +217,7 @@ func (c *mcentral) uncacheSpan(s *mspan) {
 		// the right list.
 		s.sweep(false)
 	} else {
-		if n > 0 {
+		if int(s.nelems)-int(s.allocCount) > 0 {
 			// Put it back on the partial swept list.
 			c.partialSwept(sg).push(s)
 		} else {
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index d9acb361d5..44cf17c85b 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -561,17 +561,6 @@ func updatememstats() {
 	// Collect allocation stats. This is safe and consistent
 	// because the world is stopped.
 	var smallFree, totalAlloc, totalFree uint64
-	// Collect per-spanclass stats.
-	for spc := range mheap_.central {
-		// The mcaches are now empty, so mcentral stats are
-		// up-to-date.
-		c := &mheap_.central[spc].mcentral
-		memstats.nmalloc += c.nmalloc
-		i := spanClass(spc).sizeclass()
-		memstats.by_size[i].nmalloc += c.nmalloc
-		totalAlloc += c.nmalloc * uint64(class_to_size[i])
-	}
-
 	for _, p := range allp {
 		c := p.mcache
 		if c == nil {
@@ -585,12 +574,17 @@ func updatememstats() {
 
 		// Collect per-sizeclass stats.
 		for i := 0; i < _NumSizeClasses; i++ {
+			// Malloc stats.
+			memstats.nmalloc += uint64(c.local_nsmallalloc[i])
+			memstats.by_size[i].nmalloc += uint64(c.local_nsmallalloc[i])
+			totalAlloc += uint64(c.local_nsmallalloc[i]) * uint64(class_to_size[i])
+
+			// Free stats.
 			memstats.nfree += uint64(c.local_nsmallfree[i])
 			memstats.by_size[i].nfree += uint64(c.local_nsmallfree[i])
 			smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i])
 		}
 	}
-	// Collect remaining large allocation stats.
 
 	totalFree += smallFree
 
-- 
cgit v1.3


From cca3d1e5533cb40beb9ef55bbc332b733adcc6ba Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Thu, 23 Jul 2020 22:16:46 +0000
Subject: runtime: don't flush local_tinyallocs

This change makes local_tinyallocs work like the rest of the malloc
stats and doesn't flush local_tinyallocs, instead making that the
source-of-truth.

Change-Id: I3e6cb5f1b3d086e432ce7d456895511a48e3617a
Reviewed-on: https://go-review.googlesource.com/c/go/+/246967
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/export_test.go | 7 +++++--
 src/runtime/mcache.go      | 8 +++++---
 src/runtime/mheap.go       | 4 ----
 src/runtime/mstats.go      | 6 ++++--
 4 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'src/runtime/mcache.go')

diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index d5a90ca65b..d71b180f76 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -339,7 +339,7 @@ func ReadMemStatsSlow() (base, slow MemStats) {
 
 		// Add in frees. readmemstats_m flushed the cached stats, so
 		// these are up-to-date.
-		var largeFree, smallFree uint64
+		var tinyAllocs, largeFree, smallFree uint64
 		for _, p := range allp {
 			c := p.mcache
 			if c == nil {
@@ -349,6 +349,9 @@ func ReadMemStatsSlow() (base, slow MemStats) {
 			largeFree += uint64(c.local_largefree)
 			slow.Frees += uint64(c.local_nlargefree)
 
+			// Collect tiny allocation stats.
+			tinyAllocs += uint64(c.local_tinyallocs)
+
 			// Collect per-sizeclass stats.
 			for i := 0; i < _NumSizeClasses; i++ {
 				slow.Frees += uint64(c.local_nsmallfree[i])
@@ -357,7 +360,7 @@ func ReadMemStatsSlow() (base, slow MemStats) {
 				smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i])
 			}
 		}
-		slow.Frees += memstats.tinyallocs
+		slow.Frees += tinyAllocs
 		slow.Mallocs += slow.Frees
 
 		slow.TotalAlloc = slow.Alloc + largeFree + smallFree
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 4d2ba6dff0..fe603116a2 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -32,9 +32,8 @@ type mcache struct {
 	// tiny is a heap pointer. Since mcache is in non-GC'd memory,
 	// we handle it by clearing it in releaseAll during mark
 	// termination.
-	tiny             uintptr
-	tinyoffset       uintptr
-	local_tinyallocs uintptr // number of tiny allocs not counted in other stats
+	tiny       uintptr
+	tinyoffset uintptr
 
 	// The rest is not accessed on every malloc.
 
@@ -49,6 +48,7 @@ type mcache struct {
 	// When read with stats from other mcaches and with the world
 	// stopped, the result will accurately reflect the state of the
 	// application.
+	local_tinyallocs  uintptr                  // number of tiny allocs not counted in other stats
 	local_largealloc  uintptr                  // bytes allocated for large objects
 	local_nlargealloc uintptr                  // number of large object allocations
 	local_nsmallalloc [_NumSizeClasses]uintptr // number of allocs for small objects
@@ -151,6 +151,8 @@ func (c *mcache) donate(d *mcache) {
 		d.local_nsmallfree[i] += c.local_nsmallfree[i]
 		c.local_nsmallfree[i] = 0
 	}
+	d.local_tinyallocs += c.local_tinyallocs
+	c.local_tinyallocs = 0
 }
 
 // refill acquires a new span of span class spc for c. This span will
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 5635dc6784..47f86ee38c 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -1163,8 +1163,6 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS
 		}
 		atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan))
 		c.local_scan = 0
-		memstats.tinyallocs += uint64(c.local_tinyallocs)
-		c.local_tinyallocs = 0
 
 		// heap_scan was been updated.
 		if gcBlackenEnabled != 0 {
@@ -1358,8 +1356,6 @@ func (h *mheap) freeSpan(s *mspan) {
 		lock(&h.lock)
 		atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan))
 		c.local_scan = 0
-		memstats.tinyallocs += uint64(c.local_tinyallocs)
-		c.local_tinyallocs = 0
 		if msanenabled {
 			// Tell msan that this entire span is no longer in use.
 			base := unsafe.Pointer(s.base())
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 44cf17c85b..341906fced 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -550,6 +550,7 @@ func updatememstats() {
 	memstats.total_alloc = 0
 	memstats.nmalloc = 0
 	memstats.nfree = 0
+	memstats.tinyallocs = 0
 	for i := 0; i < len(memstats.by_size); i++ {
 		memstats.by_size[i].nmalloc = 0
 		memstats.by_size[i].nfree = 0
@@ -572,6 +573,9 @@ func updatememstats() {
 		totalFree += uint64(c.local_largefree)
 		memstats.nfree += uint64(c.local_nlargefree)
 
+		// Collect tiny allocation stats.
+		memstats.tinyallocs += uint64(c.local_tinyallocs)
+
 		// Collect per-sizeclass stats.
 		for i := 0; i < _NumSizeClasses; i++ {
 			// Malloc stats.
@@ -644,8 +648,6 @@ func purgecachedstats(c *mcache) {
 	// Protected by heap lock.
 	atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan))
 	c.local_scan = 0
-	memstats.tinyallocs += uint64(c.local_tinyallocs)
-	c.local_tinyallocs = 0
 }
 
 // Atomically increases a given *system* memory stat. We are counting on this
-- 
cgit v1.3


From d677899e903c4741920846f1af2c14c56f6e710e Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Thu, 23 Jul 2020 22:36:58 +0000
Subject: runtime: flush local_scan directly and more often

Now that local_scan is the last mcache-based statistic that is flushed
by purgecachedstats, and heap_scan and gcController.revise may be
interacted with concurrently, we don't need to flush heap_scan at
arbitrary locations where the heap is locked, and we don't need
purgecachedstats and cachestats anymore. Instead, we can flush
local_scan at the same time we update heap_live in refill, so the two
updates may share the same revise call.

Clean up unused functions, remove code that would cause the heap to get
locked in the allocSpan when it didn't need to (other than to flush
local_scan), and flush local_scan explicitly in a few important places.
Notably we need to flush local_scan whenever we flush the other stats,
but it doesn't need to be donated anywhere, so have releaseAll do the
flushing. Also, we need to flush local_scan before we set heap_scan at
the end of a GC, which was previously handled by cachestats. Just do so
explicitly -- it's not much code and it becomes a lot more clear why we
need to do so.

Change-Id: I35ac081784df7744d515479896a41d530653692d
Reviewed-on: https://go-review.googlesource.com/c/go/+/246968
Run-TryBot: Michael Knyszek <mknyszek@google.com>
Trust: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/mcache.go | 22 ++++++++++++++++++++--
 src/runtime/mgc.go    | 14 ++++++++++++--
 src/runtime/mheap.go  | 49 +++----------------------------------------------
 src/runtime/mstats.go | 25 -------------------------
 4 files changed, 35 insertions(+), 75 deletions(-)

(limited to 'src/runtime/mcache.go')

diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index fe603116a2..b8e388cc4f 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -124,7 +124,6 @@ func freemcache(c *mcache, recipient *mcache) {
 		// gcworkbuffree(c.gcworkbuf)
 
 		lock(&mheap_.lock)
-		purgecachedstats(c)
 		// Donate anything else that's left.
 		c.donate(recipient)
 		mheap_.cachealloc.free(unsafe.Pointer(c))
@@ -135,6 +134,8 @@ func freemcache(c *mcache, recipient *mcache) {
 // donate flushes data and resources which have no global
 // pool to another mcache.
 func (c *mcache) donate(d *mcache) {
+	// local_scan is handled separately because it's not
+	// like these stats -- it's used for GC pacing.
 	d.local_largealloc += c.local_largealloc
 	c.local_largealloc = 0
 	d.local_nlargealloc += c.local_nlargealloc
@@ -192,14 +193,22 @@ func (c *mcache) refill(spc spanClass) {
 	// Assume all objects from this span will be allocated in the
 	// mcache. If it gets uncached, we'll adjust this.
 	c.local_nsmallalloc[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount)
+
+	// Update heap_live with the same assumption.
 	usedBytes := uintptr(s.allocCount) * s.elemsize
 	atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes))
+
+	// While we're here, flush local_scan, since we have to call
+	// revise anyway.
+	atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan))
+	c.local_scan = 0
+
 	if trace.enabled {
 		// heap_live changed.
 		traceHeapAlloc()
 	}
 	if gcBlackenEnabled != 0 {
-		// heap_live changed.
+		// heap_live and heap_scan changed.
 		gcController.revise()
 	}
 
@@ -248,6 +257,10 @@ func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
 }
 
 func (c *mcache) releaseAll() {
+	// Take this opportunity to flush local_scan.
+	atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan))
+	c.local_scan = 0
+
 	sg := mheap_.sweepgen
 	for i := range c.alloc {
 		s := c.alloc[i]
@@ -273,6 +286,11 @@ func (c *mcache) releaseAll() {
 	// Clear tinyalloc pool.
 	c.tiny = 0
 	c.tinyoffset = 0
+
+	// Updated heap_scan and possible heap_live.
+	if gcBlackenEnabled != 0 {
+		gcController.revise()
+	}
 }
 
 // prepareForSweep flushes c if the system has entered a new sweep phase
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index c54f893689..55554c117c 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -2083,11 +2083,21 @@ func gcMark(start_time int64) {
 		gcw.dispose()
 	}
 
-	cachestats()
-
 	// Update the marked heap stat.
 	memstats.heap_marked = work.bytesMarked
 
+	// Flush local_scan from each mcache since we're about to modify
+	// heap_scan directly. If we were to flush this later, then local_scan
+	// might have incorrect information.
+	for _, p := range allp {
+		c := p.mcache
+		if c == nil {
+			continue
+		}
+		memstats.heap_scan += uint64(c.local_scan)
+		c.local_scan = 0
+	}
+
 	// Update other GC heap size stats. This must happen after
 	// cachestats (which flushes local statistics to these) and
 	// flushallmcaches (which modifies heap_live).
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 47f86ee38c..40fd58b0ef 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -1102,23 +1102,11 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS
 		base, scav = c.alloc(npages)
 		if base != 0 {
 			s = h.tryAllocMSpan()
-
-			if s != nil && gcBlackenEnabled == 0 && (manual || spanclass.sizeclass() != 0) {
+			if s != nil {
 				goto HaveSpan
 			}
-			// We're either running duing GC, failed to acquire a mspan,
-			// or the allocation is for a large object. This means we
-			// have to lock the heap and do a bunch of extra work,
-			// so go down the HaveBaseLocked path.
-			//
-			// We must do this during GC to avoid skew with heap_scan
-			// since we flush mcache stats whenever we lock.
-			//
-			// TODO(mknyszek): It would be nice to not have to
-			// lock the heap if it's a large allocation, but
-			// it's fine for now. The critical section here is
-			// short and large object allocations are relatively
-			// infrequent.
+			// We have a base but no mspan, so we need
+			// to lock the heap.
 		}
 	}
 
@@ -1145,30 +1133,6 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS
 		// one now that we have the heap lock.
 		s = h.allocMSpanLocked()
 	}
-	if !manual {
-		// This is a heap span, so we should do some additional accounting
-		// which may only be done with the heap locked.
-
-		// Transfer stats from mcache to global.
-		var c *mcache
-		if gp.m.p != 0 {
-			c = gp.m.p.ptr().mcache
-		} else {
-			// This case occurs while bootstrapping.
-			// See the similar code in mallocgc.
-			c = mcache0
-			if c == nil {
-				throw("mheap.allocSpan called with no P")
-			}
-		}
-		atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan))
-		c.local_scan = 0
-
-		// heap_scan was been updated.
-		if gcBlackenEnabled != 0 {
-			gcController.revise()
-		}
-	}
 	unlock(&h.lock)
 
 HaveSpan:
@@ -1352,20 +1316,13 @@ func (h *mheap) grow(npage uintptr) bool {
 // Free the span back into the heap.
 func (h *mheap) freeSpan(s *mspan) {
 	systemstack(func() {
-		c := getg().m.p.ptr().mcache
 		lock(&h.lock)
-		atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan))
-		c.local_scan = 0
 		if msanenabled {
 			// Tell msan that this entire span is no longer in use.
 			base := unsafe.Pointer(s.base())
 			bytes := s.npages << _PageShift
 			msanfree(base, bytes)
 		}
-		if gcBlackenEnabled != 0 {
-			// heap_scan changed.
-			gcController.revise()
-		}
 		h.freeSpanLocked(s, true, true)
 		unlock(&h.lock)
 	})
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 341906fced..5eeb173640 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -556,9 +556,6 @@ func updatememstats() {
 		memstats.by_size[i].nfree = 0
 	}
 
-	// Aggregate local stats.
-	cachestats()
-
 	// Collect allocation stats. This is safe and consistent
 	// because the world is stopped.
 	var smallFree, totalAlloc, totalFree uint64
@@ -602,21 +599,6 @@ func updatememstats() {
 	memstats.heap_objects = memstats.nmalloc - memstats.nfree
 }
 
-// cachestats flushes all mcache stats.
-//
-// The world must be stopped.
-//
-//go:nowritebarrier
-func cachestats() {
-	for _, p := range allp {
-		c := p.mcache
-		if c == nil {
-			continue
-		}
-		purgecachedstats(c)
-	}
-}
-
 // flushmcache flushes the mcache of allp[i].
 //
 // The world must be stopped.
@@ -643,13 +625,6 @@ func flushallmcaches() {
 	}
 }
 
-//go:nosplit
-func purgecachedstats(c *mcache) {
-	// Protected by heap lock.
-	atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan))
-	c.local_scan = 0
-}
-
 // Atomically increases a given *system* memory stat. We are counting on this
 // stat never overflowing a uintptr, so this function must only be used for
 // system memory stats.
-- 
cgit v1.3


From c8638498008f9874dc5a48734418e0fbea08cee9 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Fri, 24 Jul 2020 19:58:31 +0000
Subject: runtime: rename mcache fields to match Go style

This change renames a bunch of malloc statistics stored in the mcache
that are all named with the "local_" prefix. It also renames largeAlloc
to allocLarge to prevent a naming conflict, and next_sample because it
would be the last mcache field with the old C naming style.

Change-Id: I29695cb83b397a435ede7e9ad5c3c9be72767ea3
Reviewed-on: https://go-review.googlesource.com/c/go/+/246969
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/export_test.go      | 14 ++++----
 src/runtime/malloc.go           | 12 +++----
 src/runtime/mcache.go           | 78 ++++++++++++++++++++---------------------
 src/runtime/mgc.go              |  8 ++---
 src/runtime/mgcsweep.go         |  6 ++--
 src/runtime/mstats.go           | 22 ++++++------
 src/runtime/pprof/mprof_test.go |  2 +-
 7 files changed, 71 insertions(+), 71 deletions(-)

(limited to 'src/runtime/mcache.go')

diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index d71b180f76..47cbc286f6 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -346,18 +346,18 @@ func ReadMemStatsSlow() (base, slow MemStats) {
 				continue
 			}
 			// Collect large allocation stats.
-			largeFree += uint64(c.local_largefree)
-			slow.Frees += uint64(c.local_nlargefree)
+			largeFree += uint64(c.largeFree)
+			slow.Frees += uint64(c.largeFreeCount)
 
 			// Collect tiny allocation stats.
-			tinyAllocs += uint64(c.local_tinyallocs)
+			tinyAllocs += uint64(c.tinyAllocCount)
 
 			// Collect per-sizeclass stats.
 			for i := 0; i < _NumSizeClasses; i++ {
-				slow.Frees += uint64(c.local_nsmallfree[i])
-				bySize[i].Frees += uint64(c.local_nsmallfree[i])
-				bySize[i].Mallocs += uint64(c.local_nsmallfree[i])
-				smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i])
+				slow.Frees += uint64(c.smallFreeCount[i])
+				bySize[i].Frees += uint64(c.smallFreeCount[i])
+				bySize[i].Mallocs += uint64(c.smallFreeCount[i])
+				smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i])
 			}
 		}
 		slow.Frees += tinyAllocs
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index ec601ccb39..0f48d7f68e 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -1040,7 +1040,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 				// The object fits into existing tiny block.
 				x = unsafe.Pointer(c.tiny + off)
 				c.tinyoffset = off + size
-				c.local_tinyallocs++
+				c.tinyAllocCount++
 				mp.mallocing = 0
 				releasem(mp)
 				return x
@@ -1082,7 +1082,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 		}
 	} else {
 		shouldhelpgc = true
-		span = c.largeAlloc(size, needzero, noscan)
+		span = c.allocLarge(size, needzero, noscan)
 		span.freeindex = 1
 		span.allocCount = 1
 		x = unsafe.Pointer(span.base())
@@ -1111,7 +1111,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 		} else {
 			scanSize = typ.ptrdata
 		}
-		c.local_scan += scanSize
+		c.scanAlloc += scanSize
 	}
 
 	// Ensure that the stores above that initialize x to
@@ -1153,8 +1153,8 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 	}
 
 	if rate := MemProfileRate; rate > 0 {
-		if rate != 1 && size < c.next_sample {
-			c.next_sample -= size
+		if rate != 1 && size < c.nextSample {
+			c.nextSample -= size
 		} else {
 			mp := acquirem()
 			profilealloc(mp, x, size)
@@ -1221,7 +1221,7 @@ func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
 			throw("profilealloc called with no P")
 		}
 	}
-	c.next_sample = nextSample()
+	c.nextSample = nextSample()
 	mProf_Malloc(x, size)
 }
 
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index b8e388cc4f..c3e0e5e1f7 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -20,8 +20,8 @@ import (
 type mcache struct {
 	// The following members are accessed on every malloc,
 	// so they are grouped here for better caching.
-	next_sample uintptr // trigger heap sample after allocating this many bytes
-	local_scan  uintptr // bytes of scannable heap allocated
+	nextSample uintptr // trigger heap sample after allocating this many bytes
+	scanAlloc  uintptr // bytes of scannable heap allocated
 
 	// Allocator cache for tiny objects w/o pointers.
 	// See "Tiny allocator" comment in malloc.go.
@@ -48,13 +48,13 @@ type mcache struct {
 	// When read with stats from other mcaches and with the world
 	// stopped, the result will accurately reflect the state of the
 	// application.
-	local_tinyallocs  uintptr                  // number of tiny allocs not counted in other stats
-	local_largealloc  uintptr                  // bytes allocated for large objects
-	local_nlargealloc uintptr                  // number of large object allocations
-	local_nsmallalloc [_NumSizeClasses]uintptr // number of allocs for small objects
-	local_largefree   uintptr                  // bytes freed for large objects (>maxsmallsize)
-	local_nlargefree  uintptr                  // number of frees for large objects (>maxsmallsize)
-	local_nsmallfree  [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
+	tinyAllocCount  uintptr                  // number of tiny allocs not counted in other stats
+	largeAlloc      uintptr                  // bytes allocated for large objects
+	largeAllocCount uintptr                  // number of large object allocations
+	smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects
+	largeFree       uintptr                  // bytes freed for large objects (>maxSmallSize)
+	largeFreeCount  uintptr                  // number of frees for large objects (>maxSmallSize)
+	smallFreeCount  [_NumSizeClasses]uintptr // number of frees for small objects (<=maxSmallSize)
 
 	// flushGen indicates the sweepgen during which this mcache
 	// was last flushed. If flushGen != mheap_.sweepgen, the spans
@@ -103,7 +103,7 @@ func allocmcache() *mcache {
 	for i := range c.alloc {
 		c.alloc[i] = &emptymspan
 	}
-	c.next_sample = nextSample()
+	c.nextSample = nextSample()
 	return c
 }
 
@@ -134,26 +134,26 @@ func freemcache(c *mcache, recipient *mcache) {
 // donate flushes data and resources which have no global
 // pool to another mcache.
 func (c *mcache) donate(d *mcache) {
-	// local_scan is handled separately because it's not
+	// scanAlloc is handled separately because it's not
 	// like these stats -- it's used for GC pacing.
-	d.local_largealloc += c.local_largealloc
-	c.local_largealloc = 0
-	d.local_nlargealloc += c.local_nlargealloc
-	c.local_nlargealloc = 0
-	for i := range c.local_nsmallalloc {
-		d.local_nsmallalloc[i] += c.local_nsmallalloc[i]
-		c.local_nsmallalloc[i] = 0
+	d.largeAlloc += c.largeAlloc
+	c.largeAlloc = 0
+	d.largeAllocCount += c.largeAllocCount
+	c.largeAllocCount = 0
+	for i := range c.smallAllocCount {
+		d.smallAllocCount[i] += c.smallAllocCount[i]
+		c.smallAllocCount[i] = 0
 	}
-	d.local_largefree += c.local_largefree
-	c.local_largefree = 0
-	d.local_nlargefree += c.local_nlargefree
-	c.local_nlargefree = 0
-	for i := range c.local_nsmallfree {
-		d.local_nsmallfree[i] += c.local_nsmallfree[i]
-		c.local_nsmallfree[i] = 0
+	d.largeFree += c.largeFree
+	c.largeFree = 0
+	d.largeFreeCount += c.largeFreeCount
+	c.largeFreeCount = 0
+	for i := range c.smallFreeCount {
+		d.smallFreeCount[i] += c.smallFreeCount[i]
+		c.smallFreeCount[i] = 0
 	}
-	d.local_tinyallocs += c.local_tinyallocs
-	c.local_tinyallocs = 0
+	d.tinyAllocCount += c.tinyAllocCount
+	c.tinyAllocCount = 0
 }
 
 // refill acquires a new span of span class spc for c. This span will
@@ -192,16 +192,16 @@ func (c *mcache) refill(spc spanClass) {
 
 	// Assume all objects from this span will be allocated in the
 	// mcache. If it gets uncached, we'll adjust this.
-	c.local_nsmallalloc[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount)
+	c.smallAllocCount[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount)
 
 	// Update heap_live with the same assumption.
 	usedBytes := uintptr(s.allocCount) * s.elemsize
 	atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes))
 
-	// While we're here, flush local_scan, since we have to call
+	// While we're here, flush scanAlloc, since we have to call
 	// revise anyway.
-	atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan))
-	c.local_scan = 0
+	atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
+	c.scanAlloc = 0
 
 	if trace.enabled {
 		// heap_live changed.
@@ -215,8 +215,8 @@ func (c *mcache) refill(spc spanClass) {
 	c.alloc[spc] = s
 }
 
-// largeAlloc allocates a span for a large object.
-func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
+// allocLarge allocates a span for a large object.
+func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
 	if size+_PageSize < size {
 		throw("out of memory")
 	}
@@ -235,8 +235,8 @@ func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
 	if s == nil {
 		throw("out of memory")
 	}
-	c.local_largealloc += npages * pageSize
-	c.local_nlargealloc++
+	c.largeAlloc += npages * pageSize
+	c.largeAllocCount++
 
 	// Update heap_live and revise pacing if needed.
 	atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
@@ -257,9 +257,9 @@ func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
 }
 
 func (c *mcache) releaseAll() {
-	// Take this opportunity to flush local_scan.
-	atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan))
-	c.local_scan = 0
+	// Take this opportunity to flush scanAlloc.
+	atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
+	c.scanAlloc = 0
 
 	sg := mheap_.sweepgen
 	for i := range c.alloc {
@@ -267,7 +267,7 @@ func (c *mcache) releaseAll() {
 		if s != &emptymspan {
 			// Adjust nsmallalloc in case the span wasn't fully allocated.
 			n := uintptr(s.nelems) - uintptr(s.allocCount)
-			c.local_nsmallalloc[spanClass(i).sizeclass()] -= n
+			c.smallAllocCount[spanClass(i).sizeclass()] -= n
 			if s.sweepgen != sg+1 {
 				// refill conservatively counted unallocated slots in heap_live.
 				// Undo this.
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 55554c117c..540c376f1c 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -2086,16 +2086,16 @@ func gcMark(start_time int64) {
 	// Update the marked heap stat.
 	memstats.heap_marked = work.bytesMarked
 
-	// Flush local_scan from each mcache since we're about to modify
-	// heap_scan directly. If we were to flush this later, then local_scan
+	// Flush scanAlloc from each mcache since we're about to modify
+	// heap_scan directly. If we were to flush this later, then scanAlloc
 	// might have incorrect information.
 	for _, p := range allp {
 		c := p.mcache
 		if c == nil {
 			continue
 		}
-		memstats.heap_scan += uint64(c.local_scan)
-		c.local_scan = 0
+		memstats.heap_scan += uint64(c.scanAlloc)
+		c.scanAlloc = 0
 	}
 
 	// Update other GC heap size stats. This must happen after
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index 6b8c56ce35..7103b08455 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -503,7 +503,7 @@ func (s *mspan) sweep(preserve bool) bool {
 			// wasn't totally filled, but then swept, still has all of its
 			// free slots zeroed.
 			s.needzero = 1
-			c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed)
+			c.smallFreeCount[spc.sizeclass()] += uintptr(nfreed)
 		}
 		if !preserve {
 			// The caller may not have removed this span from whatever
@@ -548,8 +548,8 @@ func (s *mspan) sweep(preserve bool) bool {
 			} else {
 				mheap_.freeSpan(s)
 			}
-			c.local_nlargefree++
-			c.local_largefree += size
+			c.largeFreeCount++
+			c.largeFree += size
 			return true
 		}
 
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 5eeb173640..64687c24e5 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -565,25 +565,25 @@ func updatememstats() {
 			continue
 		}
 		// Collect large allocation stats.
-		memstats.nmalloc += uint64(c.local_nlargealloc)
-		totalAlloc += uint64(c.local_largealloc)
-		totalFree += uint64(c.local_largefree)
-		memstats.nfree += uint64(c.local_nlargefree)
+		memstats.nmalloc += uint64(c.largeAllocCount)
+		totalAlloc += uint64(c.largeAlloc)
+		totalFree += uint64(c.largeFree)
+		memstats.nfree += uint64(c.largeFreeCount)
 
 		// Collect tiny allocation stats.
-		memstats.tinyallocs += uint64(c.local_tinyallocs)
+		memstats.tinyallocs += uint64(c.tinyAllocCount)
 
 		// Collect per-sizeclass stats.
 		for i := 0; i < _NumSizeClasses; i++ {
 			// Malloc stats.
-			memstats.nmalloc += uint64(c.local_nsmallalloc[i])
-			memstats.by_size[i].nmalloc += uint64(c.local_nsmallalloc[i])
-			totalAlloc += uint64(c.local_nsmallalloc[i]) * uint64(class_to_size[i])
+			memstats.nmalloc += uint64(c.smallAllocCount[i])
+			memstats.by_size[i].nmalloc += uint64(c.smallAllocCount[i])
+			totalAlloc += uint64(c.smallAllocCount[i]) * uint64(class_to_size[i])
 
 			// Free stats.
-			memstats.nfree += uint64(c.local_nsmallfree[i])
-			memstats.by_size[i].nfree += uint64(c.local_nsmallfree[i])
-			smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i])
+			memstats.nfree += uint64(c.smallFreeCount[i])
+			memstats.by_size[i].nfree += uint64(c.smallFreeCount[i])
+			smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i])
 		}
 	}
 
diff --git a/src/runtime/pprof/mprof_test.go b/src/runtime/pprof/mprof_test.go
index f253f07def..c11a45fd69 100644
--- a/src/runtime/pprof/mprof_test.go
+++ b/src/runtime/pprof/mprof_test.go
@@ -70,7 +70,7 @@ func TestMemoryProfiler(t *testing.T) {
 		runtime.MemProfileRate = oldRate
 	}()
 
-	// Allocate a meg to ensure that mcache.next_sample is updated to 1.
+	// Allocate a meg to ensure that mcache.nextSample is updated to 1.
 	for i := 0; i < 1024; i++ {
 		memSink = make([]byte, 1024)
 	}
-- 
cgit v1.3


From c02134abb01e019683daf051029d66b15dd11213 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Mon, 3 Aug 2020 20:08:25 +0000
Subject: runtime: add helper for getting an mcache in allocation contexts

This change adds a function getMCache which returns the current P's
mcache if it's available, and otherwise tries to get mcache0 if we're
bootstrapping. This function will come in handy as we need to replicate
this behavior in multiple places in future changes.

Change-Id: I536073d6f6dc6c6390269e613ead9f8bcb6e7f98
Reviewed-on: https://go-review.googlesource.com/c/go/+/246976
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/malloc.go | 25 ++-----------------------
 src/runtime/mcache.go | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+), 23 deletions(-)

(limited to 'src/runtime/mcache.go')

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index ee22bad58c..6383c34817 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -972,19 +972,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 
 	shouldhelpgc := false
 	dataSize := size
-	var c *mcache
-	if mp.p != 0 {
-		c = mp.p.ptr().mcache
-	} else {
-		// We will be called without a P while bootstrapping,
-		// in which case we use mcache0, which is set in mallocinit.
-		// mcache0 is cleared when bootstrapping is complete,
-		// by procresize.
-		c = mcache0
-		if c == nil {
-			throw("malloc called with no P")
-		}
-	}
+	c := getMCache()
 	var span *mspan
 	var x unsafe.Pointer
 	noscan := typ == nil || typ.ptrdata == 0
@@ -1212,16 +1200,7 @@ func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer {
 }
 
 func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
-	var c *mcache
-	if mp.p != 0 {
-		c = mp.p.ptr().mcache
-	} else {
-		c = mcache0
-		if c == nil {
-			throw("profilealloc called with no P")
-		}
-	}
-	c.nextSample = nextSample()
+	getMCache().nextSample = nextSample()
 	mProf_Malloc(x, size)
 }
 
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index c3e0e5e1f7..5564e4a47d 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -131,6 +131,29 @@ func freemcache(c *mcache, recipient *mcache) {
 	})
 }
 
+// getMCache is a convenience function which tries to obtain an mcache.
+//
+// Must be running with a P when called (so the caller must be in a
+// non-preemptible state) or must be called during bootstrapping.
+func getMCache() *mcache {
+	// Grab the mcache, since that's where stats live.
+	pp := getg().m.p.ptr()
+	var c *mcache
+	if pp == nil {
+		// We will be called without a P while bootstrapping,
+		// in which case we use mcache0, which is set in mallocinit.
+		// mcache0 is cleared when bootstrapping is complete,
+		// by procresize.
+		c = mcache0
+		if c == nil {
+			throw("getMCache called with no P or outside bootstrapping")
+		}
+	} else {
+		c = pp.mcache
+	}
+	return c
+}
+
 // donate flushes data and resources which have no global
 // pool to another mcache.
 func (c *mcache) donate(d *mcache) {
-- 
cgit v1.3


From fe7ff71185cf30f9bdee3e8d8897e8b6069ad02e Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Mon, 3 Aug 2020 20:11:04 +0000
Subject: runtime: add consistent heap statistics

This change adds a global set of heap statistics which are similar
to existing memory statistics. The purpose of these new statistics
is to be able to read them and get a consistent result without stopping
the world. The goal is to eventually replace as many of the existing
memstats statistics with the sharded ones as possible.

The consistent memory statistics use a tailor-made synchronization
mechanism to allow writers (allocators) to proceed with minimal
synchronization by using a sequence counter and a global generation
counter to determine which set of statistics to update. Readers
increment the global generation counter to effectively grab a snapshot
of the statistics, and then iterate over all Ps using the sequence
counter to ensure that they may safely read the snapshotted statistics.
To keep statistics fresh, the reader also has a responsibility to merge
sets of statistics.

These consistent statistics are computed, but otherwise unused for now.
Upcoming changes will integrate them with the rest of the codebase and
will begin to phase out existing statistics.

Change-Id: I637a11f2439e2049d7dccb8650c5d82500733ca5
Reviewed-on: https://go-review.googlesource.com/c/go/+/247037
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/mcache.go      |   4 +
 src/runtime/mgcscavenge.go |  11 ++-
 src/runtime/mheap.go       |  34 +++++++++
 src/runtime/mstats.go      | 184 ++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 230 insertions(+), 3 deletions(-)

(limited to 'src/runtime/mcache.go')

diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 5564e4a47d..e27a1c9ec0 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -61,6 +61,10 @@ type mcache struct {
 	// in this mcache are stale and need to the flushed so they
 	// can be swept. This is done in acquirep.
 	flushGen uint32
+
+	// statsSeq is a counter indicating whether this P is currently
+	// writing any stats. Its value is even when not, odd when it is.
+	statsSeq uint32
 }
 
 // A gclink is a node in a linked list of blocks, like mlink,
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index 8b1a0be353..5843ada981 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -711,7 +711,16 @@ func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr
 
 	// Update global accounting only when not in test, otherwise
 	// the runtime's accounting will be wrong.
-	atomic.Xadd64(&memstats.heap_released, int64(npages)*pageSize)
+	nbytes := int64(npages) * pageSize
+	atomic.Xadd64(&memstats.heap_released, nbytes)
+
+	// Update consistent accounting too.
+	c := getMCache()
+	stats := memstats.heapStats.acquire(c)
+	atomic.Xaddint64(&stats.committed, -nbytes)
+	atomic.Xaddint64(&stats.released, nbytes)
+	memstats.heapStats.release(c)
+
 	return addr
 }
 
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 87d2fd495b..d17b6fa284 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -1239,6 +1239,22 @@ HaveSpan:
 		// Manually managed memory doesn't count toward heap_sys.
 		memstats.heap_sys.add(-int64(nbytes))
 	}
+	// Update consistent stats.
+	c := getMCache()
+	stats := memstats.heapStats.acquire(c)
+	atomic.Xaddint64(&stats.committed, int64(scav))
+	atomic.Xaddint64(&stats.released, -int64(scav))
+	switch typ {
+	case spanAllocHeap:
+		atomic.Xaddint64(&stats.inHeap, int64(nbytes))
+	case spanAllocStack:
+		atomic.Xaddint64(&stats.inStacks, int64(nbytes))
+	case spanAllocPtrScalarBits:
+		atomic.Xaddint64(&stats.inPtrScalarBits, int64(nbytes))
+	case spanAllocWorkBuf:
+		atomic.Xaddint64(&stats.inWorkBufs, int64(nbytes))
+	}
+	memstats.heapStats.release(c)
 
 	// Publish the span in various locations.
 
@@ -1316,6 +1332,10 @@ func (h *mheap) grow(npage uintptr) bool {
 		// size which is always > physPageSize, so its safe to
 		// just add directly to heap_released.
 		atomic.Xadd64(&memstats.heap_released, int64(asize))
+		c := getMCache()
+		stats := memstats.heapStats.acquire(c)
+		atomic.Xaddint64(&stats.released, int64(asize))
+		memstats.heapStats.release(c)
 
 		// Recalculate nBase.
 		// We know this won't overflow, because sysAlloc returned
@@ -1415,6 +1435,20 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) {
 		// Manually managed memory doesn't count toward heap_sys, so add it back.
 		memstats.heap_sys.add(int64(nbytes))
 	}
+	// Update consistent stats.
+	c := getMCache()
+	stats := memstats.heapStats.acquire(c)
+	switch typ {
+	case spanAllocHeap:
+		atomic.Xaddint64(&stats.inHeap, -int64(nbytes))
+	case spanAllocStack:
+		atomic.Xaddint64(&stats.inStacks, -int64(nbytes))
+	case spanAllocPtrScalarBits:
+		atomic.Xaddint64(&stats.inPtrScalarBits, -int64(nbytes))
+	case spanAllocWorkBuf:
+		atomic.Xaddint64(&stats.inWorkBufs, -int64(nbytes))
+	}
+	memstats.heapStats.release(c)
 
 	// Mark the space as free.
 	h.pages.free(s.base(), s.npages)
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index a6e38d1c1b..76546c0f0c 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -148,6 +148,9 @@ type mstats struct {
 	// unlike heap_live, heap_marked does not change until the
 	// next mark termination.
 	heap_marked uint64
+
+	// heapStats is a set of statistics
+	heapStats consistentHeapStats
 }
 
 var memstats mstats
@@ -426,10 +429,20 @@ type MemStats struct {
 }
 
 func init() {
-	if unsafe.Offsetof(memstats.heap_live)%8 != 0 {
-		println(unsafe.Offsetof(memstats.heap_live))
+	if offset := unsafe.Offsetof(memstats.heap_live); offset%8 != 0 {
+		println(offset)
 		throw("memstats.heap_live not aligned to 8 bytes")
 	}
+	if offset := unsafe.Offsetof(memstats.heapStats); offset%8 != 0 {
+		println(offset)
+		throw("memstats.heapStats not aligned to 8 bytes")
+	}
+	// Ensure the size of heapStatsDelta causes adjacent fields/slots (e.g.
+	// [3]heapStatsDelta) to be 8-byte aligned.
+	if size := unsafe.Sizeof(heapStatsDelta{}); size%8 != 0 {
+		println(size)
+		throw("heapStatsDelta not a multiple of 8 bytes in size")
+	}
 }
 
 // ReadMemStats populates m with memory allocator statistics.
@@ -687,3 +700,170 @@ func (s *sysMemStat) add(n int64) {
 		throw("sysMemStat overflow")
 	}
 }
+
+// heapStatsDelta contains deltas of various runtime memory statistics
+// that need to be updated together in order for them to be kept
+// consistent with one another.
+type heapStatsDelta struct {
+	committed       int64 // byte delta of memory committed
+	released        int64 // byte delta of released memory generated
+	inHeap          int64 // byte delta of memory placed in the heap
+	inStacks        int64 // byte delta of memory reserved for stacks
+	inWorkBufs      int64 // byte delta of memory reserved for work bufs
+	inPtrScalarBits int64 // byte delta of memory reserved for unrolled GC prog bits
+}
+
+// merge adds in the deltas from b into a.
+func (a *heapStatsDelta) merge(b *heapStatsDelta) {
+	a.committed += b.committed
+	a.released += b.released
+	a.inHeap += b.inHeap
+	a.inStacks += b.inStacks
+	a.inWorkBufs += b.inWorkBufs
+	a.inPtrScalarBits += b.inPtrScalarBits
+}
+
+// consistentHeapStats represents a set of various memory statistics
+// whose updates must be viewed completely to get a consistent
+// state of the world.
+//
+// To write updates to memory stats use the acquire and release
+// methods. To obtain a consistent global snapshot of these statistics,
+// use read.
+type consistentHeapStats struct {
+	// stats is a ring buffer of heapStatsDelta values.
+	// Writers always atomically update the delta at index gen.
+	//
+	// Readers operate by rotating gen (0 -> 1 -> 2 -> 0 -> ...)
+	// and synchronizing with writers by observing each mcache's
+	// statsSeq field. If the reader observes a P (to which the
+	// mcache is bound) not writing, it can be sure that it will
+	// pick up the new gen value the next time it writes.
+	// The reader then takes responsibility by clearing space
+	// in the ring buffer for the next reader to rotate gen to
+	// that space (i.e. it merges in values from index (gen-2) mod 3
+	// to index (gen-1) mod 3, then clears the former).
+	//
+	// Note that this means only one reader can be reading at a time.
+	// There is no way for readers to synchronize.
+	//
+	// This process is why we need ring buffer of size 3 instead
+	// of 2: one is for the writers, one contains the most recent
+	// data, and the last one is clear so writers can begin writing
+	// to it the moment gen is updated.
+	stats [3]heapStatsDelta
+
+	// gen represents the current index into which writers
+	// are writing, and can take on the value of 0, 1, or 2.
+	// This value is updated atomically.
+	gen uint32
+}
+
+// acquire returns a heapStatsDelta to be updated. In effect,
+// it acquires the shard for writing. release must be called
+// as soon as the relevant deltas are updated. c must be
+// a valid mcache not being used by any other thread.
+//
+// The returned heapStatsDelta must be updated atomically.
+//
+// Note however, that this is unsafe to call concurrently
+// with other writers and there must be only one writer
+// at a time.
+func (m *consistentHeapStats) acquire(c *mcache) *heapStatsDelta {
+	seq := atomic.Xadd(&c.statsSeq, 1)
+	if seq%2 == 0 {
+		// Should have been incremented to odd.
+		print("runtime: seq=", seq, "\n")
+		throw("bad sequence number")
+	}
+	gen := atomic.Load(&m.gen) % 3
+	return &m.stats[gen]
+}
+
+// release indicates that the writer is done modifying
+// the delta. The value returned by the corresponding
+// acquire must no longer be accessed or modified after
+// release is called.
+//
+// The mcache passed here must be the same as the one
+// passed to acquire.
+func (m *consistentHeapStats) release(c *mcache) {
+	seq := atomic.Xadd(&c.statsSeq, 1)
+	if seq%2 != 0 {
+		// Should have been incremented to even.
+		print("runtime: seq=", seq, "\n")
+		throw("bad sequence number")
+	}
+}
+
+// unsafeRead aggregates the delta for this shard into out.
+//
+// Unsafe because it does so without any synchronization. The
+// only safe time to call this is if the world is stopped or
+// we're freezing the world or going down anyway (and we just
+// want _some_ estimate).
+func (m *consistentHeapStats) unsafeRead(out *heapStatsDelta) {
+	for i := range m.stats {
+		out.merge(&m.stats[i])
+	}
+}
+
+// unsafeClear clears the shard.
+//
+// Unsafe because the world must be stopped and values should
+// be donated elsewhere before clearing.
+func (m *consistentHeapStats) unsafeClear() {
+	for i := range m.stats {
+		m.stats[i] = heapStatsDelta{}
+	}
+}
+
+// read takes a globally consistent snapshot of m
+// and puts the aggregated value in out. Even though out is a
+// heapStatsDelta, the resulting values should be complete and
+// valid statistic values.
+//
+// Not safe to call concurrently.
+func (m *consistentHeapStats) read(out *heapStatsDelta) {
+	// Getting preempted after this point is not safe because
+	// we read allp. We need to make sure a STW can't happen
+	// so it doesn't change out from under us.
+	mp := acquirem()
+
+	// Rotate gen, effectively taking a snapshot of the state of
+	// these statistics at the point of the exchange by moving
+	// writers to the next set of deltas.
+	//
+	// This exchange is safe to do because we won't race
+	// with anyone else trying to update this value.
+	currGen := atomic.Load(&m.gen)
+	atomic.Xchg(&m.gen, (currGen+1)%3)
+	prevGen := currGen - 1
+	if currGen == 0 {
+		prevGen = 2
+	}
+	for _, p := range allp {
+		c := p.mcache
+		if c == nil {
+			continue
+		}
+		// Spin until there are no more writers.
+		for atomic.Load(&c.statsSeq)%2 != 0 {
+		}
+	}
+
+	// At this point we've observed that each sequence
+	// number is even, so any future writers will observe
+	// the new gen value. That means it's safe to read from
+	// the other deltas in the stats buffer.
+
+	// Perform our responsibilities and free up
+	// stats[prevGen] for the next time we want to take
+	// a snapshot.
+	m.stats[currGen].merge(&m.stats[prevGen])
+	m.stats[prevGen] = heapStatsDelta{}
+
+	// Finally, copy out the complete delta.
+	*out = m.stats[currGen]
+	releasem(mp)
+}
-- 
cgit v1.3


From 79781e8dd382ac34e502ed6a088dff6860a08c05 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Tue, 4 Aug 2020 17:29:03 +0000
Subject: runtime: move malloc stats into consistentHeapStats

This change moves the mcache-local malloc stats into the
consistentHeapStats structure so the malloc stats can be managed
consistently with the memory stats. The one exception here is
tinyAllocs for which moving that into the global stats would incur
several atomic writes on the fast path. Microbenchmarks for just one CPU
core have shown a 50% loss in throughput. Since tiny allocation counnt
isn't exposed anyway and is always blindly added to both allocs and
frees, let that stay inconsistent and flush the tiny allocation count
every so often.

Change-Id: I2a4b75f209c0e659b9c0db081a3287bf227c10ca
Reviewed-on: https://go-review.googlesource.com/c/go/+/247039
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/export_test.go | 37 ++++++++--------------
 src/runtime/malloc.go      |  2 +-
 src/runtime/mcache.go      | 70 ++++++++++++++---------------------------
 src/runtime/mgcsweep.go    | 10 ++++--
 src/runtime/mstats.go      | 78 ++++++++++++++++++++++++++--------------------
 src/runtime/proc.go        |  2 +-
 6 files changed, 90 insertions(+), 109 deletions(-)

(limited to 'src/runtime/mcache.go')

diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index cb753ee819..ff901fd7be 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -337,33 +337,22 @@ func ReadMemStatsSlow() (base, slow MemStats) {
 			}
 		}
 
-		// Add in frees. readmemstats_m flushed the cached stats, so
-		// these are up-to-date.
-		var tinyAllocs, largeFree, smallFree uint64
-		for _, p := range allp {
-			c := p.mcache
-			if c == nil {
-				continue
-			}
-			// Collect large allocation stats.
-			largeFree += uint64(c.largeFree)
-			slow.Frees += uint64(c.largeFreeCount)
-
-			// Collect tiny allocation stats.
-			tinyAllocs += uint64(c.tinyAllocCount)
-
-			// Collect per-sizeclass stats.
-			for i := 0; i < _NumSizeClasses; i++ {
-				slow.Frees += uint64(c.smallFreeCount[i])
-				bySize[i].Frees += uint64(c.smallFreeCount[i])
-				bySize[i].Mallocs += uint64(c.smallFreeCount[i])
-				smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i])
-			}
+		// Add in frees by just reading the stats for those directly.
+		var m heapStatsDelta
+		memstats.heapStats.unsafeRead(&m)
+
+		// Collect per-sizeclass free stats.
+		var smallFree uint64
+		for i := 0; i < _NumSizeClasses; i++ {
+			slow.Frees += uint64(m.smallFreeCount[i])
+			bySize[i].Frees += uint64(m.smallFreeCount[i])
+			bySize[i].Mallocs += uint64(m.smallFreeCount[i])
+			smallFree += uint64(m.smallFreeCount[i]) * uint64(class_to_size[i])
 		}
-		slow.Frees += tinyAllocs
+		slow.Frees += memstats.tinyallocs + uint64(m.largeFreeCount)
 		slow.Mallocs += slow.Frees
 
-		slow.TotalAlloc = slow.Alloc + largeFree + smallFree
+		slow.TotalAlloc = slow.Alloc + uint64(m.largeFree) + smallFree
 
 		for i := range slow.BySize {
 			slow.BySize[i].Mallocs = bySize[i].Mallocs
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 6383c34817..d0b8c668c3 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -1028,7 +1028,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 				// The object fits into existing tiny block.
 				x = unsafe.Pointer(c.tiny + off)
 				c.tinyoffset = off + size
-				c.tinyAllocCount++
+				c.tinyAllocs++
 				mp.mallocing = 0
 				releasem(mp)
 				return x
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index e27a1c9ec0..c9342a41c9 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -32,8 +32,12 @@ type mcache struct {
 	// tiny is a heap pointer. Since mcache is in non-GC'd memory,
 	// we handle it by clearing it in releaseAll during mark
 	// termination.
+	//
+	// tinyAllocs is the number of tiny allocations performed
+	// by the P that owns this mcache.
 	tiny       uintptr
 	tinyoffset uintptr
+	tinyAllocs uintptr
 
 	// The rest is not accessed on every malloc.
 
@@ -41,21 +45,6 @@ type mcache struct {
 
 	stackcache [_NumStackOrders]stackfreelist
 
-	// Allocator stats (source-of-truth).
-	// Only the P that owns this mcache may write to these
-	// variables, so it's safe for that P to read non-atomically.
-	//
-	// When read with stats from other mcaches and with the world
-	// stopped, the result will accurately reflect the state of the
-	// application.
-	tinyAllocCount  uintptr                  // number of tiny allocs not counted in other stats
-	largeAlloc      uintptr                  // bytes allocated for large objects
-	largeAllocCount uintptr                  // number of large object allocations
-	smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects
-	largeFree       uintptr                  // bytes freed for large objects (>maxSmallSize)
-	largeFreeCount  uintptr                  // number of frees for large objects (>maxSmallSize)
-	smallFreeCount  [_NumSizeClasses]uintptr // number of frees for small objects (<=maxSmallSize)
-
 	// flushGen indicates the sweepgen during which this mcache
 	// was last flushed. If flushGen != mheap_.sweepgen, the spans
 	// in this mcache are stale and need to the flushed so they
@@ -117,7 +106,7 @@ func allocmcache() *mcache {
 // In some cases there is no way to simply release
 // resources, such as statistics, so donate them to
 // a different mcache (the recipient).
-func freemcache(c *mcache, recipient *mcache) {
+func freemcache(c *mcache) {
 	systemstack(func() {
 		c.releaseAll()
 		stackcache_clear(c)
@@ -128,8 +117,6 @@ func freemcache(c *mcache, recipient *mcache) {
 		// gcworkbuffree(c.gcworkbuf)
 
 		lock(&mheap_.lock)
-		// Donate anything else that's left.
-		c.donate(recipient)
 		mheap_.cachealloc.free(unsafe.Pointer(c))
 		unlock(&mheap_.lock)
 	})
@@ -158,31 +145,6 @@ func getMCache() *mcache {
 	return c
 }
 
-// donate flushes data and resources which have no global
-// pool to another mcache.
-func (c *mcache) donate(d *mcache) {
-	// scanAlloc is handled separately because it's not
-	// like these stats -- it's used for GC pacing.
-	d.largeAlloc += c.largeAlloc
-	c.largeAlloc = 0
-	d.largeAllocCount += c.largeAllocCount
-	c.largeAllocCount = 0
-	for i := range c.smallAllocCount {
-		d.smallAllocCount[i] += c.smallAllocCount[i]
-		c.smallAllocCount[i] = 0
-	}
-	d.largeFree += c.largeFree
-	c.largeFree = 0
-	d.largeFreeCount += c.largeFreeCount
-	c.largeFreeCount = 0
-	for i := range c.smallFreeCount {
-		d.smallFreeCount[i] += c.smallFreeCount[i]
-		c.smallFreeCount[i] = 0
-	}
-	d.tinyAllocCount += c.tinyAllocCount
-	c.tinyAllocCount = 0
-}
-
 // refill acquires a new span of span class spc for c. This span will
 // have at least one free object. The current span in c must be full.
 //
@@ -219,12 +181,20 @@ func (c *mcache) refill(spc spanClass) {
 
 	// Assume all objects from this span will be allocated in the
 	// mcache. If it gets uncached, we'll adjust this.
-	c.smallAllocCount[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount)
+	stats := memstats.heapStats.acquire(c)
+	atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], uintptr(s.nelems)-uintptr(s.allocCount))
+	memstats.heapStats.release(c)
 
 	// Update heap_live with the same assumption.
 	usedBytes := uintptr(s.allocCount) * s.elemsize
 	atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes))
 
+	// Flush tinyAllocs.
+	if spc == tinySpanClass {
+		atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs))
+		c.tinyAllocs = 0
+	}
+
 	// While we're here, flush scanAlloc, since we have to call
 	// revise anyway.
 	atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
@@ -262,8 +232,10 @@ func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
 	if s == nil {
 		throw("out of memory")
 	}
-	c.largeAlloc += npages * pageSize
-	c.largeAllocCount++
+	stats := memstats.heapStats.acquire(c)
+	atomic.Xadduintptr(&stats.largeAlloc, npages*pageSize)
+	atomic.Xadduintptr(&stats.largeAllocCount, 1)
+	memstats.heapStats.release(c)
 
 	// Update heap_live and revise pacing if needed.
 	atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
@@ -294,7 +266,9 @@ func (c *mcache) releaseAll() {
 		if s != &emptymspan {
 			// Adjust nsmallalloc in case the span wasn't fully allocated.
 			n := uintptr(s.nelems) - uintptr(s.allocCount)
-			c.smallAllocCount[spanClass(i).sizeclass()] -= n
+			stats := memstats.heapStats.acquire(c)
+			atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], -n)
+			memstats.heapStats.release(c)
 			if s.sweepgen != sg+1 {
 				// refill conservatively counted unallocated slots in heap_live.
 				// Undo this.
@@ -313,6 +287,8 @@ func (c *mcache) releaseAll() {
 	// Clear tinyalloc pool.
 	c.tiny = 0
 	c.tinyoffset = 0
+	atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs))
+	c.tinyAllocs = 0
 
 	// Updated heap_scan and possible heap_live.
 	if gcBlackenEnabled != 0 {
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index 7103b08455..9b77ce635c 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -503,7 +503,9 @@ func (s *mspan) sweep(preserve bool) bool {
 			// wasn't totally filled, but then swept, still has all of its
 			// free slots zeroed.
 			s.needzero = 1
-			c.smallFreeCount[spc.sizeclass()] += uintptr(nfreed)
+			stats := memstats.heapStats.acquire(c)
+			atomic.Xadduintptr(&stats.smallFreeCount[spc.sizeclass()], uintptr(nfreed))
+			memstats.heapStats.release(c)
 		}
 		if !preserve {
 			// The caller may not have removed this span from whatever
@@ -548,8 +550,10 @@ func (s *mspan) sweep(preserve bool) bool {
 			} else {
 				mheap_.freeSpan(s)
 			}
-			c.largeFreeCount++
-			c.largeFree += size
+			stats := memstats.heapStats.acquire(c)
+			atomic.Xadduintptr(&stats.largeFreeCount, 1)
+			atomic.Xadduintptr(&stats.largeFree, size)
+			memstats.heapStats.release(c)
 			return true
 		}
 
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 4363eff1e0..a8eca85fe6 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -612,48 +612,36 @@ func updatememstats() {
 	memstats.total_alloc = 0
 	memstats.nmalloc = 0
 	memstats.nfree = 0
-	memstats.tinyallocs = 0
 	for i := 0; i < len(memstats.by_size); i++ {
 		memstats.by_size[i].nmalloc = 0
 		memstats.by_size[i].nfree = 0
 	}
-
-	// Collect allocation stats. This is safe and consistent
-	// because the world is stopped.
-	var smallFree, totalAlloc, totalFree uint64
-	for _, p := range allp {
-		c := p.mcache
-		if c == nil {
-			continue
-		}
-		// Collect large allocation stats.
-		memstats.nmalloc += uint64(c.largeAllocCount)
-		totalAlloc += uint64(c.largeAlloc)
-		totalFree += uint64(c.largeFree)
-		memstats.nfree += uint64(c.largeFreeCount)
-
-		// Collect tiny allocation stats.
-		memstats.tinyallocs += uint64(c.tinyAllocCount)
-
-		// Collect per-sizeclass stats.
-		for i := 0; i < _NumSizeClasses; i++ {
-			// Malloc stats.
-			memstats.nmalloc += uint64(c.smallAllocCount[i])
-			memstats.by_size[i].nmalloc += uint64(c.smallAllocCount[i])
-			totalAlloc += uint64(c.smallAllocCount[i]) * uint64(class_to_size[i])
-
-			// Free stats.
-			memstats.nfree += uint64(c.smallFreeCount[i])
-			memstats.by_size[i].nfree += uint64(c.smallFreeCount[i])
-			smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i])
-		}
-	}
 	// Collect consistent stats, which are the source-of-truth in the some cases.
 	var consStats heapStatsDelta
 	memstats.heapStats.unsafeRead(&consStats)
 
-	totalFree += smallFree
+	// Collect large allocation stats.
+	totalAlloc := uint64(consStats.largeAlloc)
+	memstats.nmalloc += uint64(consStats.largeAllocCount)
+	totalFree := uint64(consStats.largeFree)
+	memstats.nfree += uint64(consStats.largeFreeCount)
+
+	// Collect per-sizeclass stats.
+	for i := 0; i < _NumSizeClasses; i++ {
+		// Malloc stats.
+		a := uint64(consStats.smallAllocCount[i])
+		totalAlloc += a * uint64(class_to_size[i])
+		memstats.nmalloc += a
+		memstats.by_size[i].nmalloc = a
+
+		// Free stats.
+		f := uint64(consStats.smallFreeCount[i])
+		totalFree += f * uint64(class_to_size[i])
+		memstats.nfree += f
+		memstats.by_size[i].nfree = f
+	}
 
+	// Account for tiny allocations.
 	memstats.nfree += memstats.tinyallocs
 	memstats.nmalloc += memstats.tinyallocs
 
@@ -752,12 +740,25 @@ func (s *sysMemStat) add(n int64) {
 // that need to be updated together in order for them to be kept
 // consistent with one another.
 type heapStatsDelta struct {
+	// Memory stats.
 	committed       int64 // byte delta of memory committed
 	released        int64 // byte delta of released memory generated
 	inHeap          int64 // byte delta of memory placed in the heap
 	inStacks        int64 // byte delta of memory reserved for stacks
 	inWorkBufs      int64 // byte delta of memory reserved for work bufs
 	inPtrScalarBits int64 // byte delta of memory reserved for unrolled GC prog bits
+
+	// Allocator stats.
+	largeAlloc      uintptr                  // bytes allocated for large objects
+	largeAllocCount uintptr                  // number of large object allocations
+	smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects
+	largeFree       uintptr                  // bytes freed for large objects (>maxSmallSize)
+	largeFreeCount  uintptr                  // number of frees for large objects (>maxSmallSize)
+	smallFreeCount  [_NumSizeClasses]uintptr // number of frees for small objects (<=maxSmallSize)
+
+	// Add a uint32 to ensure this struct is a multiple of 8 bytes in size.
+	// Only necessary on 32-bit platforms.
+	// _ [(sys.PtrSize / 4) % 2]uint32
 }
 
 // merge adds in the deltas from b into a.
@@ -768,6 +769,17 @@ func (a *heapStatsDelta) merge(b *heapStatsDelta) {
 	a.inStacks += b.inStacks
 	a.inWorkBufs += b.inWorkBufs
 	a.inPtrScalarBits += b.inPtrScalarBits
+
+	a.largeAlloc += b.largeAlloc
+	a.largeAllocCount += b.largeAllocCount
+	for i := range b.smallAllocCount {
+		a.smallAllocCount[i] += b.smallAllocCount[i]
+	}
+	a.largeFree += b.largeFree
+	a.largeFreeCount += b.largeFreeCount
+	for i := range b.smallFreeCount {
+		a.smallFreeCount[i] += b.smallFreeCount[i]
+	}
 }
 
 // consistentHeapStats represents a set of various memory statistics
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 4f4cff38aa..ebecc92745 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -4550,7 +4550,7 @@ func (pp *p) destroy() {
 		pp.mspancache.len = 0
 		pp.pcache.flush(&mheap_.pages)
 	})
-	freemcache(pp.mcache, allp[0].mcache)
+	freemcache(pp.mcache)
 	pp.mcache = nil
 	gfpurge(pp)
 	traceProcFree(pp)
-- 
cgit v1.3


From ac766e37182f36cd0a3247e44a4143d2d2132e42 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Mon, 2 Nov 2020 16:58:38 +0000
Subject: runtime: make getMCache inlineable

This change moves the responsibility of throwing if an mcache is not
available to the caller, because the inlining cost of throw is set very
high in the compiler. Even if it was reduced down to the cost of a usual
function call, it would still be too expensive, so just move it out.

This choice also makes sense in the context of #42339 since we're going
to have to handle the case where we don't have an mcache to update stats
in a few contexts anyhow.

Also, add getMCache to the list of functions that should be inlined to
prevent future regressions.

getMCache is called on the allocation fast path and because its not
inlined actually causes a significant regression (~10%) in some
microbenchmarks.

Fixes #42305.

Change-Id: I64ac5e4f26b730bd4435ea1069a4a50f55411ced
Reviewed-on: https://go-review.googlesource.com/c/go/+/267157
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
---
 src/cmd/compile/internal/gc/inl_test.go |  1 +
 src/runtime/malloc.go                   |  9 ++++++++-
 src/runtime/mcache.go                   |  7 ++-----
 src/runtime/mgcscavenge.go              |  3 +++
 src/runtime/mheap.go                    | 12 ++++++++++++
 5 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'src/runtime/mcache.go')

diff --git a/src/cmd/compile/internal/gc/inl_test.go b/src/cmd/compile/internal/gc/inl_test.go
index afa6b98315..02735e50fb 100644
--- a/src/cmd/compile/internal/gc/inl_test.go
+++ b/src/cmd/compile/internal/gc/inl_test.go
@@ -51,6 +51,7 @@ func TestIntendedInlining(t *testing.T) {
 			"funcPC",
 			"getArgInfoFast",
 			"getm",
+			"getMCache",
 			"isDirectIface",
 			"itabHashFunc",
 			"noescape",
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 4b798d129c..551acd0796 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -975,6 +975,9 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 	shouldhelpgc := false
 	dataSize := size
 	c := getMCache()
+	if c == nil {
+		throw("mallocgc called without a P or outside bootstrapping")
+	}
 	var span *mspan
 	var x unsafe.Pointer
 	noscan := typ == nil || typ.ptrdata == 0
@@ -1202,7 +1205,11 @@ func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer {
 }
 
 func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
-	getMCache().nextSample = nextSample()
+	c := getMCache()
+	if c == nil {
+		throw("profilealloc called without a P or outside bootstrapping")
+	}
+	c.nextSample = nextSample()
 	mProf_Malloc(x, size)
 }
 
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index c9342a41c9..847a5dedf3 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -124,8 +124,8 @@ func freemcache(c *mcache) {
 
 // getMCache is a convenience function which tries to obtain an mcache.
 //
-// Must be running with a P when called (so the caller must be in a
-// non-preemptible state) or must be called during bootstrapping.
+// Returns nil if we're not bootstrapping or we don't have a P. The caller's
+// P must not change, so we must be in a non-preemptible state.
 func getMCache() *mcache {
 	// Grab the mcache, since that's where stats live.
 	pp := getg().m.p.ptr()
@@ -136,9 +136,6 @@ func getMCache() *mcache {
 		// mcache0 is cleared when bootstrapping is complete,
 		// by procresize.
 		c = mcache0
-		if c == nil {
-			throw("getMCache called with no P or outside bootstrapping")
-		}
 	} else {
 		c = pp.mcache
 	}
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index a242577bd9..ab4e28a60b 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -734,6 +734,9 @@ func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr
 
 	// Update consistent accounting too.
 	c := getMCache()
+	if c == nil {
+		throw("scavengeRangeLocked called without a P or outside bootstrapping")
+	}
 	stats := memstats.heapStats.acquire(c)
 	atomic.Xaddint64(&stats.committed, -nbytes)
 	atomic.Xaddint64(&stats.released, nbytes)
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 66a59cb999..6b29f34a82 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -1247,6 +1247,10 @@ HaveSpan:
 	}
 	// Update consistent stats.
 	c := getMCache()
+	if c == nil {
+		// TODO(mknyszek): Remove this and handle this case to fix #42339.
+		throw("allocSpan called without P or outside bootstrapping")
+	}
 	stats := memstats.heapStats.acquire(c)
 	atomic.Xaddint64(&stats.committed, int64(scav))
 	atomic.Xaddint64(&stats.released, -int64(scav))
@@ -1341,6 +1345,10 @@ func (h *mheap) grow(npage uintptr) bool {
 		// just add directly to heap_released.
 		atomic.Xadd64(&memstats.heap_released, int64(asize))
 		c := getMCache()
+		if c == nil {
+			// TODO(mknyszek): Remove this and handle this case to fix #42339.
+			throw("grow called without P or outside bootstrapping")
+		}
 		stats := memstats.heapStats.acquire(c)
 		atomic.Xaddint64(&stats.released, int64(asize))
 		memstats.heapStats.release(c)
@@ -1440,6 +1448,10 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) {
 	}
 	// Update consistent stats.
 	c := getMCache()
+	if c == nil {
+		// TODO(mknyszek): Remove this and handle this case to fix #42339.
+		throw("freeSpanLocked called without P or outside bootstrapping")
+	}
 	stats := memstats.heapStats.acquire(c)
 	switch typ {
 	case spanAllocHeap:
-- 
cgit v1.3


From 39a5ee52b9b41b1e4f4cf821c78ef5b7be68d181 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Mon, 2 Nov 2020 19:03:16 +0000
Subject: runtime: decouple consistent stats from mcache and allow P-less
 update

This change modifies the consistent stats implementation to keep the
per-P sequence counter on each P instead of each mcache. A valid mcache
is not available everywhere that we want to call e.g. allocSpan, as per
issue #42339. By decoupling these two, we can add a mechanism to allow
contexts without a P to update stats consistently.

In this CL, we achieve that with a mutex. In practice, it will be very
rare for an M to update these stats without a P. Furthermore, the stats
reader also only needs to hold the mutex across the update to "gen"
since once that changes, writers are free to continue updating the new
stats generation. Contention could thus only arise between writers
without a P, and as mentioned earlier, those should be rare.

A nice side-effect of this change is that the consistent stats acquire
and release API becomes simpler.

Fixes #42339.

Change-Id: Ied74ab256f69abd54b550394c8ad7c4c40a5fe34
Reviewed-on: https://go-review.googlesource.com/c/go/+/267158
Run-TryBot: Michael Knyszek <mknyszek@google.com>
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/mcache.go      | 16 +++-----
 src/runtime/mgcscavenge.go |  8 +---
 src/runtime/mgcsweep.go    | 10 ++---
 src/runtime/mheap.go       | 27 +++----------
 src/runtime/mstats.go      | 95 +++++++++++++++++++++++++++++-----------------
 src/runtime/proc.go        |  4 ++
 src/runtime/runtime2.go    |  8 +++-
 7 files changed, 88 insertions(+), 80 deletions(-)

(limited to 'src/runtime/mcache.go')

diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 847a5dedf3..bb7475b6f3 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -50,10 +50,6 @@ type mcache struct {
 	// in this mcache are stale and need to the flushed so they
 	// can be swept. This is done in acquirep.
 	flushGen uint32
-
-	// statsSeq is a counter indicating whether this P is currently
-	// writing any stats. Its value is even when not, odd when it is.
-	statsSeq uint32
 }
 
 // A gclink is a node in a linked list of blocks, like mlink,
@@ -178,9 +174,9 @@ func (c *mcache) refill(spc spanClass) {
 
 	// Assume all objects from this span will be allocated in the
 	// mcache. If it gets uncached, we'll adjust this.
-	stats := memstats.heapStats.acquire(c)
+	stats := memstats.heapStats.acquire()
 	atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], uintptr(s.nelems)-uintptr(s.allocCount))
-	memstats.heapStats.release(c)
+	memstats.heapStats.release()
 
 	// Update heap_live with the same assumption.
 	usedBytes := uintptr(s.allocCount) * s.elemsize
@@ -229,10 +225,10 @@ func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
 	if s == nil {
 		throw("out of memory")
 	}
-	stats := memstats.heapStats.acquire(c)
+	stats := memstats.heapStats.acquire()
 	atomic.Xadduintptr(&stats.largeAlloc, npages*pageSize)
 	atomic.Xadduintptr(&stats.largeAllocCount, 1)
-	memstats.heapStats.release(c)
+	memstats.heapStats.release()
 
 	// Update heap_live and revise pacing if needed.
 	atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
@@ -263,9 +259,9 @@ func (c *mcache) releaseAll() {
 		if s != &emptymspan {
 			// Adjust nsmallalloc in case the span wasn't fully allocated.
 			n := uintptr(s.nelems) - uintptr(s.allocCount)
-			stats := memstats.heapStats.acquire(c)
+			stats := memstats.heapStats.acquire()
 			atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], -n)
-			memstats.heapStats.release(c)
+			memstats.heapStats.release()
 			if s.sweepgen != sg+1 {
 				// refill conservatively counted unallocated slots in heap_live.
 				// Undo this.
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index ab4e28a60b..38f09309dc 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -733,14 +733,10 @@ func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr
 	atomic.Xadd64(&memstats.heap_released, nbytes)
 
 	// Update consistent accounting too.
-	c := getMCache()
-	if c == nil {
-		throw("scavengeRangeLocked called without a P or outside bootstrapping")
-	}
-	stats := memstats.heapStats.acquire(c)
+	stats := memstats.heapStats.acquire()
 	atomic.Xaddint64(&stats.committed, -nbytes)
 	atomic.Xaddint64(&stats.released, nbytes)
-	memstats.heapStats.release(c)
+	memstats.heapStats.release()
 
 	return addr
 }
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index 8391435630..76bc4246e5 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -339,8 +339,6 @@ func (s *mspan) sweep(preserve bool) bool {
 	spc := s.spanclass
 	size := s.elemsize
 
-	c := _g_.m.p.ptr().mcache
-
 	// The allocBits indicate which unmarked objects don't need to be
 	// processed since they were free at the end of the last GC cycle
 	// and were not allocated since then.
@@ -505,9 +503,9 @@ func (s *mspan) sweep(preserve bool) bool {
 			// wasn't totally filled, but then swept, still has all of its
 			// free slots zeroed.
 			s.needzero = 1
-			stats := memstats.heapStats.acquire(c)
+			stats := memstats.heapStats.acquire()
 			atomic.Xadduintptr(&stats.smallFreeCount[spc.sizeclass()], uintptr(nfreed))
-			memstats.heapStats.release(c)
+			memstats.heapStats.release()
 		}
 		if !preserve {
 			// The caller may not have removed this span from whatever
@@ -552,10 +550,10 @@ func (s *mspan) sweep(preserve bool) bool {
 			} else {
 				mheap_.freeSpan(s)
 			}
-			stats := memstats.heapStats.acquire(c)
+			stats := memstats.heapStats.acquire()
 			atomic.Xadduintptr(&stats.largeFreeCount, 1)
 			atomic.Xadduintptr(&stats.largeFree, size)
-			memstats.heapStats.release(c)
+			memstats.heapStats.release()
 			return true
 		}
 
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 6b29f34a82..b8429eee94 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -1246,12 +1246,7 @@ HaveSpan:
 		memstats.heap_sys.add(-int64(nbytes))
 	}
 	// Update consistent stats.
-	c := getMCache()
-	if c == nil {
-		// TODO(mknyszek): Remove this and handle this case to fix #42339.
-		throw("allocSpan called without P or outside bootstrapping")
-	}
-	stats := memstats.heapStats.acquire(c)
+	stats := memstats.heapStats.acquire()
 	atomic.Xaddint64(&stats.committed, int64(scav))
 	atomic.Xaddint64(&stats.released, -int64(scav))
 	switch typ {
@@ -1264,7 +1259,7 @@ HaveSpan:
 	case spanAllocWorkBuf:
 		atomic.Xaddint64(&stats.inWorkBufs, int64(nbytes))
 	}
-	memstats.heapStats.release(c)
+	memstats.heapStats.release()
 
 	// Publish the span in various locations.
 
@@ -1344,14 +1339,9 @@ func (h *mheap) grow(npage uintptr) bool {
 		// size which is always > physPageSize, so its safe to
 		// just add directly to heap_released.
 		atomic.Xadd64(&memstats.heap_released, int64(asize))
-		c := getMCache()
-		if c == nil {
-			// TODO(mknyszek): Remove this and handle this case to fix #42339.
-			throw("grow called without P or outside bootstrapping")
-		}
-		stats := memstats.heapStats.acquire(c)
+		stats := memstats.heapStats.acquire()
 		atomic.Xaddint64(&stats.released, int64(asize))
-		memstats.heapStats.release(c)
+		memstats.heapStats.release()
 
 		// Recalculate nBase.
 		// We know this won't overflow, because sysAlloc returned
@@ -1447,12 +1437,7 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) {
 		memstats.heap_sys.add(int64(nbytes))
 	}
 	// Update consistent stats.
-	c := getMCache()
-	if c == nil {
-		// TODO(mknyszek): Remove this and handle this case to fix #42339.
-		throw("freeSpanLocked called without P or outside bootstrapping")
-	}
-	stats := memstats.heapStats.acquire(c)
+	stats := memstats.heapStats.acquire()
 	switch typ {
 	case spanAllocHeap:
 		atomic.Xaddint64(&stats.inHeap, -int64(nbytes))
@@ -1463,7 +1448,7 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) {
 	case spanAllocWorkBuf:
 		atomic.Xaddint64(&stats.inWorkBufs, -int64(nbytes))
 	}
-	memstats.heapStats.release(c)
+	memstats.heapStats.release()
 
 	// Mark the space as free.
 	h.pages.free(s.base(), s.npages)
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 3829355d7b..6defaedabe 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -158,7 +158,7 @@ type mstats struct {
 	// heapStats is a set of statistics
 	heapStats consistentHeapStats
 
-	_ uint32 // ensure gcPauseDist is aligned
+	// _ uint32 // ensure gcPauseDist is aligned
 
 	// gcPauseDist represents the distribution of all GC-related
 	// application pauses in the runtime.
@@ -818,10 +818,11 @@ type consistentHeapStats struct {
 	// Writers always atomically update the delta at index gen.
 	//
 	// Readers operate by rotating gen (0 -> 1 -> 2 -> 0 -> ...)
-	// and synchronizing with writers by observing each mcache's
-	// statsSeq field. If the reader observes a P (to which the
-	// mcache is bound) not writing, it can be sure that it will
-	// pick up the new gen value the next time it writes.
+	// and synchronizing with writers by observing each P's
+	// statsSeq field. If the reader observes a P not writing,
+	// it can be sure that it will pick up the new gen value the
+	// next time it writes.
+	//
 	// The reader then takes responsibility by clearing space
 	// in the ring buffer for the next reader to rotate gen to
 	// that space (i.e. it merges in values from index (gen-2) mod 3
@@ -830,7 +831,7 @@ type consistentHeapStats struct {
 	// Note that this means only one reader can be reading at a time.
 	// There is no way for readers to synchronize.
 	//
-	// This process is why we need ring buffer of size 3 instead
+	// This process is why we need a ring buffer of size 3 instead
 	// of 2: one is for the writers, one contains the most recent
 	// data, and the last one is clear so writers can begin writing
 	// to it the moment gen is updated.
@@ -840,24 +841,34 @@ type consistentHeapStats struct {
 	// are writing, and can take on the value of 0, 1, or 2.
 	// This value is updated atomically.
 	gen uint32
+
+	// noPLock is intended to provide mutual exclusion for updating
+	// stats when no P is available. It does not block other writers
+	// with a P, only other writers without a P and the reader. Because
+	// stats are usually updated when a P is available, contention on
+	// this lock should be minimal.
+	noPLock mutex
 }
 
 // acquire returns a heapStatsDelta to be updated. In effect,
 // it acquires the shard for writing. release must be called
-// as soon as the relevant deltas are updated. c must be
-// a valid mcache not being used by any other thread.
+// as soon as the relevant deltas are updated.
 //
 // The returned heapStatsDelta must be updated atomically.
 //
-// Note however, that this is unsafe to call concurrently
-// with other writers and there must be only one writer
-// at a time.
-func (m *consistentHeapStats) acquire(c *mcache) *heapStatsDelta {
-	seq := atomic.Xadd(&c.statsSeq, 1)
-	if seq%2 == 0 {
-		// Should have been incremented to odd.
-		print("runtime: seq=", seq, "\n")
-		throw("bad sequence number")
+// The caller's P must not change between acquire and
+// release. This also means that the caller should not
+// acquire a P or release its P in between.
+func (m *consistentHeapStats) acquire() *heapStatsDelta {
+	if pp := getg().m.p.ptr(); pp != nil {
+		seq := atomic.Xadd(&pp.statsSeq, 1)
+		if seq%2 == 0 {
+			// Should have been incremented to odd.
+			print("runtime: seq=", seq, "\n")
+			throw("bad sequence number")
+		}
+	} else {
+		lock(&m.noPLock)
 	}
 	gen := atomic.Load(&m.gen) % 3
 	return &m.stats[gen]
@@ -868,14 +879,19 @@ func (m *consistentHeapStats) acquire(c *mcache) *heapStatsDelta {
 // acquire must no longer be accessed or modified after
 // release is called.
 //
-// The mcache passed here must be the same as the one
-// passed to acquire.
-func (m *consistentHeapStats) release(c *mcache) {
-	seq := atomic.Xadd(&c.statsSeq, 1)
-	if seq%2 != 0 {
-		// Should have been incremented to even.
-		print("runtime: seq=", seq, "\n")
-		throw("bad sequence number")
+// The caller's P must not change between acquire and
+// release. This also means that the caller should not
+// acquire a P or release its P in between.
+func (m *consistentHeapStats) release() {
+	if pp := getg().m.p.ptr(); pp != nil {
+		seq := atomic.Xadd(&pp.statsSeq, 1)
+		if seq%2 != 0 {
+			// Should have been incremented to even.
+			print("runtime: seq=", seq, "\n")
+			throw("bad sequence number")
+		}
+	} else {
+		unlock(&m.noPLock)
 	}
 }
 
@@ -916,25 +932,33 @@ func (m *consistentHeapStats) read(out *heapStatsDelta) {
 	// so it doesn't change out from under us.
 	mp := acquirem()
 
+	// Get the current generation. We can be confident that this
+	// will not change since read is serialized and is the only
+	// one that modifies currGen.
+	currGen := atomic.Load(&m.gen)
+	prevGen := currGen - 1
+	if currGen == 0 {
+		prevGen = 2
+	}
+
+	// Prevent writers without a P from writing while we update gen.
+	lock(&m.noPLock)
+
 	// Rotate gen, effectively taking a snapshot of the state of
 	// these statistics at the point of the exchange by moving
 	// writers to the next set of deltas.
 	//
 	// This exchange is safe to do because we won't race
 	// with anyone else trying to update this value.
-	currGen := atomic.Load(&m.gen)
 	atomic.Xchg(&m.gen, (currGen+1)%3)
-	prevGen := currGen - 1
-	if currGen == 0 {
-		prevGen = 2
-	}
+
+	// Allow P-less writers to continue. They'll be writing to the
+	// next generation now.
+	unlock(&m.noPLock)
+
 	for _, p := range allp {
-		c := p.mcache
-		if c == nil {
-			continue
-		}
 		// Spin until there are no more writers.
-		for atomic.Load(&c.statsSeq)%2 != 0 {
+		for atomic.Load(&p.statsSeq)%2 != 0 {
 		}
 	}
 
@@ -951,5 +975,6 @@ func (m *consistentHeapStats) read(out *heapStatsDelta) {
 
 	// Finally, copy out the complete delta.
 	*out = m.stats[currGen]
+
 	releasem(mp)
 }
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 79529ac7ec..87949a2694 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -577,6 +577,10 @@ func schedinit() {
 	lockInit(&trace.lock, lockRankTrace)
 	lockInit(&cpuprof.lock, lockRankCpuprof)
 	lockInit(&trace.stackTab.lock, lockRankTraceStackTab)
+	// Enforce that this lock is always a leaf lock.
+	// All of this lock's critical sections should be
+	// extremely short.
+	lockInit(&memstats.heapStats.noPLock, lockRankLeafRank)
 
 	// raceinit must be the first call to race detector.
 	// In particular, it must be done before mallocinit below calls racemapshadow.
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 82fedd804b..c9376827da 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -654,8 +654,8 @@ type p struct {
 	timerModifiedEarliest uint64
 
 	// Per-P GC state
-	gcAssistTime         int64    // Nanoseconds in assistAlloc
-	gcFractionalMarkTime int64    // Nanoseconds in fractional mark worker (atomic)
+	gcAssistTime         int64 // Nanoseconds in assistAlloc
+	gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker (atomic)
 
 	// gcMarkWorkerMode is the mode for the next mark worker to run in.
 	// That is, this is used to communicate with the worker goroutine
@@ -679,6 +679,10 @@ type p struct {
 
 	runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point
 
+	// statsSeq is a counter indicating whether this P is currently
+	// writing any stats. Its value is even when not, odd when it is.
+	statsSeq uint32
+
 	// Lock for timers. We normally access the timers while running
 	// on this P, but the scheduler can also do it from a different P.
 	timersLock mutex
-- 
cgit v1.3