From a413908dd064de6e3ea5b8d95d707a532bd3f4c8 Mon Sep 17 00:00:00 2001
From: Cherry Zhang <cherryyz@google.com>
Date: Wed, 16 Sep 2020 16:59:58 -0400
Subject: all: add GOOS=ios

Introduce GOOS=ios for iOS systems. GOOS=ios matches "darwin"
build tag, like GOOS=android matches "linux" and GOOS=illumos
matches "solaris". Only ios/arm64 is supported (ios/amd64 is
not).

GOOS=ios and GOOS=darwin remain essentially the same at this
point. They will diverge at later time, to differentiate macOS
and iOS.

Uses of GOOS=="darwin" are changed to (GOOS=="darwin" || GOOS=="ios"),
except if it clearly means macOS (e.g. GOOS=="darwin" && GOARCH=="amd64"),
it remains GOOS=="darwin".

Updates #38485.

Change-Id: I4faacdc1008f42434599efb3c3ad90763a83b67c
Reviewed-on: https://go-review.googlesource.com/c/go/+/254740
Trust: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
---
 src/runtime/malloc.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/runtime/malloc.go')
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index e46327f9ce..4fa14996c2 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -207,7 +207,7 @@ const (
 	// arenaBaseOffset to offset into the top 4 GiB.
 	//
 	// WebAssembly currently has a limit of 4GB linear memory.
-	heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosDarwin*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*sys.GoosDarwin*sys.GoarchArm64
+	heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-(sys.GoosDarwin+sys.GoosIos)*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*(sys.GoosDarwin+sys.GoosIos)*sys.GoarchArm64
 
 	// maxAlloc is the maximum size of an allocation. On 64-bit,
 	// it's theoretically possible to allocate 1<<heapAddrBits bytes. On
@@ -521,7 +521,7 @@ func mallocinit() {
 		for i := 0x7f; i >= 0; i-- {
 			var p uintptr
 			switch {
-			case GOARCH == "arm64" && GOOS == "darwin":
+			case GOARCH == "arm64" && (GOOS == "darwin" || GOOS == "ios"):
 				p = uintptr(i)<<40 | uintptrMask&(0x0013<<28)
 			case GOARCH == "arm64":
 				p = uintptr(i)<<40 | uintptrMask&(0x0040<<32)
-- 
cgit v1.3


From 5756b3560141d0c09c4a27d2025f5438f49f59f2 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Thu, 10 Sep 2020 21:20:46 +0000
Subject: runtime: align 12-byte objects to 8 bytes on 32-bit systems

Currently on 32-bit systems 8-byte fields in a struct have an alignment
of 4 bytes, which means that atomic instructions may fault. This issue
is tracked in #36606.

Our current workaround is to allocate memory and put any such atomically
accessed fields at the beginning of the object. This workaround fails
because the tiny allocator might not align the object right. This case
specifically only happens with 12-byte objects because a type's size is
rounded up to its alignment. So if e.g. we have a type like:

type obj struct {
    a uint64
    b byte
}

then its size will be 12 bytes, because "a" will require a 4 byte
alignment. This argument may be extended to all objects of size 9-15
bytes.

So, make this workaround work by specifically aligning such objects to 8
bytes on 32-bit systems. This change leaves a TODO to remove the code
once #36606 gets resolved. It also adds a test which will presumably no
longer be necessary (the compiler should enforce the right alignment)
when it gets resolved as well.

Fixes #37262.

Change-Id: I3a34e5b014b3c37ed2e5e75e62d71d8640aa42bc
Reviewed-on: https://go-review.googlesource.com/c/go/+/254057
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Michael Knyszek <mknyszek@google.com>
---
 src/runtime/malloc.go      |  8 +++++++
 src/runtime/malloc_test.go | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

(limited to 'src/runtime/malloc.go')

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 4fa14996c2..c71f856f09 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -1016,6 +1016,14 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 			// Align tiny pointer for required (conservative) alignment.
 			if size&7 == 0 {
 				off = alignUp(off, 8)
+			} else if sys.PtrSize == 4 && size == 12 {
+				// Conservatively align 12-byte objects to 8 bytes on 32-bit
+				// systems so that objects whose first field is a 64-bit
+				// value is aligned to 8 bytes and does not cause a fault on
+				// atomic access. See issue 37262.
+				// TODO(mknyszek): Remove this workaround if/when issue 36606
+				// is resolved.
+				off = alignUp(off, 8)
 			} else if size&3 == 0 {
 				off = alignUp(off, 4)
 			} else if size&1 == 0 {
diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go
index 5c97f548fd..4ba94d0494 100644
--- a/src/runtime/malloc_test.go
+++ b/src/runtime/malloc_test.go
@@ -12,8 +12,10 @@ import (
 	"os"
 	"os/exec"
 	"reflect"
+	"runtime"
 	. "runtime"
 	"strings"
+	"sync/atomic"
 	"testing"
 	"time"
 	"unsafe"
@@ -168,6 +170,61 @@ func TestTinyAlloc(t *testing.T) {
 	}
 }
 
+var (
+	tinyByteSink   *byte
+	tinyUint32Sink *uint32
+	tinyObj12Sink  *obj12
+)
+
+type obj12 struct {
+	a uint64
+	b uint32
+}
+
+func TestTinyAllocIssue37262(t *testing.T) {
+	// Try to cause an alignment access fault
+	// by atomically accessing the first 64-bit
+	// value of a tiny-allocated object.
+	// See issue 37262 for details.
+
+	// GC twice, once to reach a stable heap state
+	// and again to make sure we finish the sweep phase.
+	runtime.GC()
+	runtime.GC()
+
+	// Make 1-byte allocations until we get a fresh tiny slot.
+	aligned := false
+	for i := 0; i < 16; i++ {
+		tinyByteSink = new(byte)
+		if uintptr(unsafe.Pointer(tinyByteSink))&0xf == 0xf {
+			aligned = true
+			break
+		}
+	}
+	if !aligned {
+		t.Fatal("unable to get a fresh tiny slot")
+	}
+
+	// Create a 4-byte object so that the current
+	// tiny slot is partially filled.
+	tinyUint32Sink = new(uint32)
+
+	// Create a 12-byte object, which fits into the
+	// tiny slot. If it actually gets place there,
+	// then the field "a" will be improperly aligned
+	// for atomic access on 32-bit architectures.
+	// This won't be true if issue 36606 gets resolved.
+	tinyObj12Sink = new(obj12)
+
+	// Try to atomically access "x.a".
+	atomic.StoreUint64(&tinyObj12Sink.a, 10)
+
+	// Clear the sinks.
+	tinyByteSink = nil
+	tinyUint32Sink = nil
+	tinyObj12Sink = nil
+}
+
 func TestPageCacheLeak(t *testing.T) {
 	defer GOMAXPROCS(GOMAXPROCS(1))
 	leaked := PageCachePagesLeaked()
-- 
cgit v1.3


From a739306ca7d9ea3a98acca59b853fe889f04c28c Mon Sep 17 00:00:00 2001
From: Cherry Zhang <cherryyz@google.com>
Date: Thu, 17 Sep 2020 10:53:10 -0400
Subject: runtime: enable more address bits on macOS/ARM64

Apparently macOS/ARM64 has 47-bit addresses, instead of 33-bit as
on ios/ARM64. Enable more address bits.

Updates #38485.

Change-Id: I8aa64ba22a3933e3d9c4fffd17d902b5f31c30e3
Reviewed-on: https://go-review.googlesource.com/c/go/+/256918
Trust: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
---
 src/runtime/malloc.go           | 8 ++++----
 src/runtime/mpagealloc_32bit.go | 4 ++--
 src/runtime/mpagealloc_64bit.go | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'src/runtime/malloc.go')

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index c71f856f09..f7e9b7c4b4 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -198,7 +198,7 @@ const (
 	// mips32 only has access to the low 2GB of virtual memory, so
 	// we further limit it to 31 bits.
 	//
-	// On darwin/arm64, although 64-bit pointers are presumably
+	// On ios/arm64, although 64-bit pointers are presumably
 	// available, pointers are truncated to 33 bits. Furthermore,
 	// only the top 4 GiB of the address space are actually available
 	// to the application, but we allow the whole 33 bits anyway for
@@ -207,7 +207,7 @@ const (
 	// arenaBaseOffset to offset into the top 4 GiB.
 	//
 	// WebAssembly currently has a limit of 4GB linear memory.
-	heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-(sys.GoosDarwin+sys.GoosIos)*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*(sys.GoosDarwin+sys.GoosIos)*sys.GoarchArm64
+	heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosIos*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*sys.GoosIos*sys.GoarchArm64
 
 	// maxAlloc is the maximum size of an allocation. On 64-bit,
 	// it's theoretically possible to allocate 1<<heapAddrBits bytes. On
@@ -514,14 +514,14 @@ func mallocinit() {
 		// However, on arm64, we ignore all this advice above and slam the
 		// allocation at 0x40 << 32 because when using 4k pages with 3-level
 		// translation buffers, the user address space is limited to 39 bits
-		// On darwin/arm64, the address space is even smaller.
+		// On ios/arm64, the address space is even smaller.
 		//
 		// On AIX, mmaps starts at 0x0A00000000000000 for 64-bit.
 		// processes.
 		for i := 0x7f; i >= 0; i-- {
 			var p uintptr
 			switch {
-			case GOARCH == "arm64" && (GOOS == "darwin" || GOOS == "ios"):
+			case GOARCH == "arm64" && GOOS == "ios":
 				p = uintptr(i)<<40 | uintptrMask&(0x0013<<28)
 			case GOARCH == "arm64":
 				p = uintptr(i)<<40 | uintptrMask&(0x0040<<32)
diff --git a/src/runtime/mpagealloc_32bit.go b/src/runtime/mpagealloc_32bit.go
index 6658a900ac..90f1e54d6c 100644
--- a/src/runtime/mpagealloc_32bit.go
+++ b/src/runtime/mpagealloc_32bit.go
@@ -2,14 +2,14 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build 386 arm mips mipsle wasm darwin,arm64
+// +build 386 arm mips mipsle wasm ios,arm64
 
 // wasm is a treated as a 32-bit architecture for the purposes of the page
 // allocator, even though it has 64-bit pointers. This is because any wasm
 // pointer always has its top 32 bits as zero, so the effective heap address
 // space is only 2^32 bytes in size (see heapAddrBits).
 
-// darwin/arm64 is treated as a 32-bit architecture for the purposes of the
+// ios/arm64 is treated as a 32-bit architecture for the purposes of the
 // page allocator, even though it has 64-bit pointers and a 33-bit address
 // space (see heapAddrBits). The 33 bit address space cannot be rounded up
 // to 64 bits because there are too many summary levels to fit in just 33
diff --git a/src/runtime/mpagealloc_64bit.go b/src/runtime/mpagealloc_64bit.go
index 831626e4b2..a1691ba802 100644
--- a/src/runtime/mpagealloc_64bit.go
+++ b/src/runtime/mpagealloc_64bit.go
@@ -2,9 +2,9 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64 !darwin,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x
+// +build amd64 !ios,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x
 
-// See mpagealloc_32bit.go for why darwin/arm64 is excluded here.
+// See mpagealloc_32bit.go for why ios/arm64 is excluded here.
 
 package runtime
 
-- 
cgit v1.3


From 7c58ef732efd9bf0d0882bb95371ce1909924a75 Mon Sep 17 00:00:00 2001
From: Martin Möhrmann <moehrmann@google.com>
Date: Mon, 14 Sep 2020 16:55:34 +0200
Subject: runtime: implement GODEBUG=inittrace=1 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Setting inittrace=1 causes the runtime to emit a single line to standard error for
each package with init work, summarizing the execution time and memory allocation.

The emitted debug information for init functions can be used to find bottlenecks
or regressions in Go startup performance.

Packages with no init function work (user defined or compiler generated) are omitted.

Tracing plugin inits is not supported as they can execute concurrently. This would
make the implementation of tracing more complex while adding support for a very rare
use case. Plugin inits can be traced separately by testing a main package importing
the plugins package imports explicitly.

$ GODEBUG=inittrace=1 go test
init internal/bytealg @0.008 ms, 0 ms clock, 0 bytes, 0 allocs
init runtime @0.059 ms, 0.026 ms clock, 0 bytes, 0 allocs
init math @0.19 ms, 0.001 ms clock, 0 bytes, 0 allocs
init errors @0.22 ms, 0.004 ms clock, 0 bytes, 0 allocs
init strconv @0.24 ms, 0.002 ms clock, 32 bytes, 2 allocs
init sync @0.28 ms, 0.003 ms clock, 16 bytes, 1 allocs
init unicode @0.44 ms, 0.11 ms clock, 23328 bytes, 24 allocs
...

Inspired by stapelberg@google.com who instrumented doInit
in a prototype to measure init times with GDB.

Fixes #41378

Change-Id: Ic37c6a0cfc95488de9e737f5e346b8dbb39174e1
Reviewed-on: https://go-review.googlesource.com/c/go/+/254659
Trust: Martin Möhrmann <moehrmann@google.com>
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
---
 doc/diagnostics.html    |  2 ++
 src/runtime/extern.go   | 13 ++++++++++
 src/runtime/malloc.go   | 56 ++++++++++++++++++++++++---------------
 src/runtime/proc.go     | 69 ++++++++++++++++++++++++++++++++++++++++++++-----
 src/runtime/runtime1.go | 13 ++++++++--
 src/runtime/symtab.go   | 16 ++++++++++++
 6 files changed, 140 insertions(+), 29 deletions(-)

(limited to 'src/runtime/malloc.go')

diff --git a/doc/diagnostics.html b/doc/diagnostics.html
index 478611c15c..f9368886c4 100644
--- a/doc/diagnostics.html
+++ b/doc/diagnostics.html
@@ -454,6 +454,8 @@ environmental variable is set accordingly.</p>
 <li>GODEBUG=gctrace=1 prints garbage collector events at
 each collection, summarizing the amount of memory collected
 and the length of the pause.</li>
+<li>GODEBUG=inittrace=1 prints a summary of execution time and memory allocation
+information for completed package initilization work.</li>
 <li>GODEBUG=schedtrace=X prints scheduling events every X milliseconds.</li>
 </ul>
 
diff --git a/src/runtime/extern.go b/src/runtime/extern.go
index 7316503ed2..b75507b8f8 100644
--- a/src/runtime/extern.go
+++ b/src/runtime/extern.go
@@ -78,6 +78,19 @@ It is a comma-separated list of name=val pairs setting these named variables:
 	If the line ends with "(forced)", this GC was forced by a
 	runtime.GC() call.
 
+	inittrace: setting inittrace=1 causes the runtime to emit a single line to standard
+	error for each package with init work, summarizing the execution time and memory
+	allocation. No information is printed for inits executed as part of plugin loading
+	and for packages without both user defined and compiler generated init work.
+	The format of this line is subject to change. Currently, it is:
+		init # @#ms, # ms clock, # bytes, # allocs
+	where the fields are as follows:
+		init #      the package name
+		@# ms       time in milliseconds when the init started since program start
+		# clock     wall-clock time for package initialization work
+		# bytes     memory allocated on the heap
+		# allocs    number of heap allocations
+
 	madvdontneed: setting madvdontneed=1 will use MADV_DONTNEED
 	instead of MADV_FREE on Linux when returning memory to the
 	kernel. This is less efficient, but causes RSS numbers to drop
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index f7e9b7c4b4..b19d1f2671 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -909,27 +909,34 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 		return unsafe.Pointer(&zerobase)
 	}
 
-	if debug.sbrk != 0 {
-		align := uintptr(16)
-		if typ != nil {
-			// TODO(austin): This should be just
-			//   align = uintptr(typ.align)
-			// but that's only 4 on 32-bit platforms,
-			// even if there's a uint64 field in typ (see #599).
-			// This causes 64-bit atomic accesses to panic.
-			// Hence, we use stricter alignment that matches
-			// the normal allocator better.
-			if size&7 == 0 {
-				align = 8
-			} else if size&3 == 0 {
-				align = 4
-			} else if size&1 == 0 {
-				align = 2
-			} else {
-				align = 1
+	if debug.malloc {
+		if debug.sbrk != 0 {
+			align := uintptr(16)
+			if typ != nil {
+				// TODO(austin): This should be just
+				//   align = uintptr(typ.align)
+				// but that's only 4 on 32-bit platforms,
+				// even if there's a uint64 field in typ (see #599).
+				// This causes 64-bit atomic accesses to panic.
+				// Hence, we use stricter alignment that matches
+				// the normal allocator better.
+				if size&7 == 0 {
+					align = 8
+				} else if size&3 == 0 {
+					align = 4
+				} else if size&1 == 0 {
+					align = 2
+				} else {
+					align = 1
+				}
 			}
+			return persistentalloc(size, align, &memstats.other_sys)
+		}
+
+		if inittrace.active && inittrace.id == getg().goid {
+			// Init functions are executed sequentially in a single Go routine.
+			inittrace.allocs += 1
 		}
-		return persistentalloc(size, align, &memstats.other_sys)
 	}
 
 	// assistG is the G to charge for this allocation, or nil if
@@ -1136,8 +1143,15 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 	mp.mallocing = 0
 	releasem(mp)
 
-	if debug.allocfreetrace != 0 {
-		tracealloc(x, size, typ)
+	if debug.malloc {
+		if debug.allocfreetrace != 0 {
+			tracealloc(x, size, typ)
+		}
+
+		if inittrace.active && inittrace.id == getg().goid {
+			// Init functions are executed sequentially in a single Go routine.
+			inittrace.bytes += uint64(size)
+		}
 	}
 
 	if rate := MemProfileRate; rate > 0 {
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index a1e2ed0680..4872480314 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -154,11 +154,20 @@ func main() {
 		throw("runtime.main not on m0")
 	}
 
-	doInit(&runtime_inittask) // must be before defer
-	if nanotime() == 0 {
+	// Record when the world started.
+	// Must be before doInit for tracing init.
+	runtimeInitTime = nanotime()
+	if runtimeInitTime == 0 {
 		throw("nanotime returning zero")
 	}
 
+	if debug.inittrace != 0 {
+		inittrace.id = getg().goid
+		inittrace.active = true
+	}
+
+	doInit(&runtime_inittask) // Must be before defer.
+
 	// Defer unlock so that runtime.Goexit during init does the unlock too.
 	needUnlock := true
 	defer func() {
@@ -167,9 +176,6 @@ func main() {
 		}
 	}()
 
-	// Record when the world started.
-	runtimeInitTime = nanotime()
-
 	gcenable()
 
 	main_init_done = make(chan bool)
@@ -196,6 +202,10 @@ func main() {
 
 	doInit(&main_inittask)
 
+	// Disable init tracing after main init done to avoid overhead
+	// of collecting statistics in malloc and newproc
+	inittrace.active = false
+
 	close(main_init_done)
 
 	needUnlock = false
@@ -5665,6 +5675,17 @@ type initTask struct {
 	// followed by nfns pcs, one per init function to run
 }
 
+// inittrace stores statistics for init functions which are
+// updated by malloc and newproc when active is true.
+var inittrace tracestat
+
+type tracestat struct {
+	active bool   // init tracing activation status
+	id     int64  // init go routine id
+	allocs uint64 // heap allocations
+	bytes  uint64 // heap allocated bytes
+}
+
 func doInit(t *initTask) {
 	switch t.state {
 	case 2: // fully initialized
@@ -5673,16 +5694,52 @@ func doInit(t *initTask) {
 		throw("recursive call during initialization - linker skew")
 	default: // not initialized yet
 		t.state = 1 // initialization in progress
+
 		for i := uintptr(0); i < t.ndeps; i++ {
 			p := add(unsafe.Pointer(t), (3+i)*sys.PtrSize)
 			t2 := *(**initTask)(p)
 			doInit(t2)
 		}
+
+		if t.nfns == 0 {
+			t.state = 2 // initialization done
+			return
+		}
+
+		var (
+			start  int64
+			before tracestat
+		)
+
+		if inittrace.active {
+			start = nanotime()
+			// Load stats non-atomically since tracinit is updated only by this init go routine.
+			before = inittrace
+		}
+
+		firstFunc := add(unsafe.Pointer(t), (3+t.ndeps)*sys.PtrSize)
 		for i := uintptr(0); i < t.nfns; i++ {
-			p := add(unsafe.Pointer(t), (3+t.ndeps+i)*sys.PtrSize)
+			p := add(firstFunc, i*sys.PtrSize)
 			f := *(*func())(unsafe.Pointer(&p))
 			f()
 		}
+
+		if inittrace.active {
+			end := nanotime()
+			// Load stats non-atomically since tracinit is updated only by this init go routine.
+			after := inittrace
+
+			pkg := funcpkgpath(findfunc(funcPC(firstFunc)))
+
+			var sbuf [24]byte
+			print("init ", pkg, " @")
+			print(string(fmtNSAsMS(sbuf[:], uint64(start-runtimeInitTime))), " ms, ")
+			print(string(fmtNSAsMS(sbuf[:], uint64(end-start))), " ms clock, ")
+			print(string(itoa(sbuf[:], after.bytes-before.bytes)), " bytes, ")
+			print(string(itoa(sbuf[:], after.allocs-before.allocs)), " allocs")
+			print("\n")
+		}
+
 		t.state = 2 // initialization done
 	}
 }
diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go
index 7c893aa25c..0f182ac58e 100644
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@@ -300,7 +300,6 @@ type dbgVar struct {
 // existing int var for that value, which may
 // already have an initial value.
 var debug struct {
-	allocfreetrace     int32
 	cgocheck           int32
 	clobberfree        int32
 	efence             int32
@@ -311,13 +310,20 @@ var debug struct {
 	gctrace            int32
 	invalidptr         int32
 	madvdontneed       int32 // for Linux; issue 28466
-	sbrk               int32
 	scavenge           int32
 	scavtrace          int32
 	scheddetail        int32
 	schedtrace         int32
 	tracebackancestors int32
 	asyncpreemptoff    int32
+
+	// debug.malloc is used as a combined debug check
+	// in the malloc function and should be set
+	// if any of the below debug options is != 0.
+	malloc         bool
+	allocfreetrace int32
+	inittrace      int32
+	sbrk           int32
 }
 
 var dbgvars = []dbgVar{
@@ -339,6 +345,7 @@ var dbgvars = []dbgVar{
 	{"schedtrace", &debug.schedtrace},
 	{"tracebackancestors", &debug.tracebackancestors},
 	{"asyncpreemptoff", &debug.asyncpreemptoff},
+	{"inittrace", &debug.inittrace},
 }
 
 func parsedebugvars() {
@@ -378,6 +385,8 @@ func parsedebugvars() {
 		}
 	}
 
+	debug.malloc = (debug.allocfreetrace | debug.inittrace | debug.sbrk) != 0
+
 	setTraceback(gogetenv("GOTRACEBACK"))
 	traceback_env = traceback_cache
 }
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index a14f5c13d9..84637376bf 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -844,6 +844,22 @@ func funcname(f funcInfo) string {
 	return gostringnocopy(cfuncname(f))
 }
 
+func funcpkgpath(f funcInfo) string {
+	name := funcname(f)
+	i := len(name) - 1
+	for ; i > 0; i-- {
+		if name[i] == '/' {
+			break
+		}
+	}
+	for ; i < len(name); i++ {
+		if name[i] == '.' {
+			break
+		}
+	}
+	return name[:i]
+}
+
 func cfuncnameFromNameoff(f funcInfo, nameoff int32) *byte {
 	if !f.valid() {
 		return nil
-- 
cgit v1.3


From e63716bc76d3264f669843434bc365a78f2141d2 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Thu, 23 Jul 2020 21:10:29 +0000
Subject: runtime: make nlargealloc and largealloc mcache fields

This change makes nlargealloc and largealloc into mcache fields just
like nlargefree and largefree. These local fields become the new
source-of-truth. This change also moves the accounting for these fields
out of allocSpan (which is an inappropriate place for it -- this
accounting generally happens much closer to the point of allocation) and
into largeAlloc. This move is partially possible now that we can call
gcController.revise at that point.

Furthermore, this change moves largeAlloc into mcache.go and makes it a
method of mcache. While there's a little bit of a mismatch here because
largeAlloc barely interacts with the mcache, it helps solidify the
mcache as the first allocation layer and provides a clear place to
aggregate and manage statistics.

Change-Id: I37b5e648710733bb4c04430b71e96700e438587a
Reviewed-on: https://go-review.googlesource.com/c/go/+/246965
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/malloc.go | 33 +------------------------------
 src/runtime/mcache.go | 54 ++++++++++++++++++++++++++++++++++++++++++++++++---
 src/runtime/mheap.go  | 18 +----------------
 src/runtime/mstats.go |  4 ++--
 4 files changed, 55 insertions(+), 54 deletions(-)

(limited to 'src/runtime/malloc.go')

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index b19d1f2671..ec601ccb39 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -1082,9 +1082,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 		}
 	} else {
 		shouldhelpgc = true
-		systemstack(func() {
-			span = largeAlloc(size, needzero, noscan)
-		})
+		span = c.largeAlloc(size, needzero, noscan)
 		span.freeindex = 1
 		span.allocCount = 1
 		x = unsafe.Pointer(span.base())
@@ -1179,35 +1177,6 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 	return x
 }
 
-func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
-	// print("largeAlloc size=", size, "\n")
-
-	if size+_PageSize < size {
-		throw("out of memory")
-	}
-	npages := size >> _PageShift
-	if size&_PageMask != 0 {
-		npages++
-	}
-
-	// Deduct credit for this span allocation and sweep if
-	// necessary. mHeap_Alloc will also sweep npages, so this only
-	// pays the debt down to npage pages.
-	deductSweepCredit(npages*_PageSize, npages)
-
-	spc := makeSpanClass(0, noscan)
-	s := mheap_.alloc(npages, spc, needzero)
-	if s == nil {
-		throw("out of memory")
-	}
-	// Put the large span in the mcentral swept list so that it's
-	// visible to the background sweeper.
-	mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
-	s.limit = s.base() + size
-	heapBitsForAddr(s.base()).initSpan(s)
-	return s
-}
-
 // implementation of new builtin
 // compiler (both frontend and SSA backend) knows the signature
 // of this function
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 5baa7b3da8..3657c0b86a 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -10,6 +10,7 @@ import (
 )
 
 // Per-thread (in Go, per-P) cache for small objects.
+// This includes a small object cache and local allocation stats.
 // No locking needed because it is per-thread (per-P).
 //
 // mcaches are allocated from non-GC'd memory, so any heap pointers
@@ -48,9 +49,11 @@ type mcache struct {
 	// When read with stats from other mcaches and with the world
 	// stopped, the result will accurately reflect the state of the
 	// application.
-	local_largefree  uintptr                  // bytes freed for large objects (>maxsmallsize)
-	local_nlargefree uintptr                  // number of frees for large objects (>maxsmallsize)
-	local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
+	local_largealloc  uintptr                  // bytes allocated for large objects
+	local_nlargealloc uintptr                  // number of large object allocations
+	local_largefree   uintptr                  // bytes freed for large objects (>maxsmallsize)
+	local_nlargefree  uintptr                  // number of frees for large objects (>maxsmallsize)
+	local_nsmallfree  [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
 
 	// flushGen indicates the sweepgen during which this mcache
 	// was last flushed. If flushGen != mheap_.sweepgen, the spans
@@ -131,6 +134,10 @@ func freemcache(c *mcache, recipient *mcache) {
 // donate flushes data and resources which have no global
 // pool to another mcache.
 func (c *mcache) donate(d *mcache) {
+	d.local_largealloc += c.local_largealloc
+	c.local_largealloc = 0
+	d.local_nlargealloc += c.local_nlargealloc
+	c.local_nlargealloc = 0
 	d.local_largefree += c.local_largefree
 	c.local_largefree = 0
 	d.local_nlargefree += c.local_nlargefree
@@ -178,6 +185,47 @@ func (c *mcache) refill(spc spanClass) {
 	c.alloc[spc] = s
 }
 
+// largeAlloc allocates a span for a large object.
+func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
+	if size+_PageSize < size {
+		throw("out of memory")
+	}
+	npages := size >> _PageShift
+	if size&_PageMask != 0 {
+		npages++
+	}
+
+	// Deduct credit for this span allocation and sweep if
+	// necessary. mHeap_Alloc will also sweep npages, so this only
+	// pays the debt down to npage pages.
+	deductSweepCredit(npages*_PageSize, npages)
+
+	spc := makeSpanClass(0, noscan)
+	s := mheap_.alloc(npages, spc, needzero)
+	if s == nil {
+		throw("out of memory")
+	}
+	c.local_largealloc += npages * pageSize
+	c.local_nlargealloc++
+
+	// Update heap_live and revise pacing if needed.
+	atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
+	if trace.enabled {
+		// Trace that a heap alloc occurred because heap_live changed.
+		traceHeapAlloc()
+	}
+	if gcBlackenEnabled != 0 {
+		gcController.revise()
+	}
+
+	// Put the large span in the mcentral swept list so that it's
+	// visible to the background sweeper.
+	mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
+	s.limit = s.base() + size
+	heapBitsForAddr(s.base()).initSpan(s)
+	return s
+}
+
 func (c *mcache) releaseAll() {
 	for i := range c.alloc {
 		s := c.alloc[i]
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 1b41b204ab..5635dc6784 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -128,10 +128,6 @@ type mheap struct {
 	// This is accessed atomically.
 	reclaimCredit uintptr
 
-	// Malloc stats.
-	largealloc  uint64 // bytes allocated for large objects
-	nlargealloc uint64 // number of large object allocations
-
 	// arenas is the heap arena map. It points to the metadata for
 	// the heap for every arena frame of the entire usable virtual
 	// address space.
@@ -1170,14 +1166,7 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS
 		memstats.tinyallocs += uint64(c.local_tinyallocs)
 		c.local_tinyallocs = 0
 
-		// Do some additional accounting if it's a large allocation.
-		if spanclass.sizeclass() == 0 {
-			mheap_.largealloc += uint64(npages * pageSize)
-			mheap_.nlargealloc++
-			atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
-		}
-
-		// Either heap_live or heap_scan could have been updated.
+		// heap_scan was been updated.
 		if gcBlackenEnabled != 0 {
 			gcController.revise()
 		}
@@ -1277,11 +1266,6 @@ HaveSpan:
 
 		// Update related page sweeper stats.
 		atomic.Xadd64(&h.pagesInUse, int64(npages))
-
-		if trace.enabled {
-			// Trace that a heap alloc occurred.
-			traceHeapAlloc()
-		}
 	}
 
 	// Make sure the newly allocated span will be observed
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index d81d2ebe81..d9acb361d5 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -578,6 +578,8 @@ func updatememstats() {
 			continue
 		}
 		// Collect large allocation stats.
+		memstats.nmalloc += uint64(c.local_nlargealloc)
+		totalAlloc += uint64(c.local_largealloc)
 		totalFree += uint64(c.local_largefree)
 		memstats.nfree += uint64(c.local_nlargefree)
 
@@ -589,8 +591,6 @@ func updatememstats() {
 		}
 	}
 	// Collect remaining large allocation stats.
-	memstats.nmalloc += mheap_.nlargealloc
-	totalAlloc += mheap_.largealloc
 
 	totalFree += smallFree
 
-- 
cgit v1.3


From c8638498008f9874dc5a48734418e0fbea08cee9 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Fri, 24 Jul 2020 19:58:31 +0000
Subject: runtime: rename mcache fields to match Go style

This change renames a bunch of malloc statistics stored in the mcache
that are all named with the "local_" prefix. It also renames largeAlloc
to allocLarge to prevent a naming conflict, and next_sample because it
would be the last mcache field with the old C naming style.

Change-Id: I29695cb83b397a435ede7e9ad5c3c9be72767ea3
Reviewed-on: https://go-review.googlesource.com/c/go/+/246969
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/export_test.go      | 14 ++++----
 src/runtime/malloc.go           | 12 +++----
 src/runtime/mcache.go           | 78 ++++++++++++++++++++---------------------
 src/runtime/mgc.go              |  8 ++---
 src/runtime/mgcsweep.go         |  6 ++--
 src/runtime/mstats.go           | 22 ++++++------
 src/runtime/pprof/mprof_test.go |  2 +-
 7 files changed, 71 insertions(+), 71 deletions(-)

(limited to 'src/runtime/malloc.go')

diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index d71b180f76..47cbc286f6 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -346,18 +346,18 @@ func ReadMemStatsSlow() (base, slow MemStats) {
 				continue
 			}
 			// Collect large allocation stats.
-			largeFree += uint64(c.local_largefree)
-			slow.Frees += uint64(c.local_nlargefree)
+			largeFree += uint64(c.largeFree)
+			slow.Frees += uint64(c.largeFreeCount)
 
 			// Collect tiny allocation stats.
-			tinyAllocs += uint64(c.local_tinyallocs)
+			tinyAllocs += uint64(c.tinyAllocCount)
 
 			// Collect per-sizeclass stats.
 			for i := 0; i < _NumSizeClasses; i++ {
-				slow.Frees += uint64(c.local_nsmallfree[i])
-				bySize[i].Frees += uint64(c.local_nsmallfree[i])
-				bySize[i].Mallocs += uint64(c.local_nsmallfree[i])
-				smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i])
+				slow.Frees += uint64(c.smallFreeCount[i])
+				bySize[i].Frees += uint64(c.smallFreeCount[i])
+				bySize[i].Mallocs += uint64(c.smallFreeCount[i])
+				smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i])
 			}
 		}
 		slow.Frees += tinyAllocs
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index ec601ccb39..0f48d7f68e 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -1040,7 +1040,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 				// The object fits into existing tiny block.
 				x = unsafe.Pointer(c.tiny + off)
 				c.tinyoffset = off + size
-				c.local_tinyallocs++
+				c.tinyAllocCount++
 				mp.mallocing = 0
 				releasem(mp)
 				return x
@@ -1082,7 +1082,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 		}
 	} else {
 		shouldhelpgc = true
-		span = c.largeAlloc(size, needzero, noscan)
+		span = c.allocLarge(size, needzero, noscan)
 		span.freeindex = 1
 		span.allocCount = 1
 		x = unsafe.Pointer(span.base())
@@ -1111,7 +1111,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 		} else {
 			scanSize = typ.ptrdata
 		}
-		c.local_scan += scanSize
+		c.scanAlloc += scanSize
 	}
 
 	// Ensure that the stores above that initialize x to
@@ -1153,8 +1153,8 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 	}
 
 	if rate := MemProfileRate; rate > 0 {
-		if rate != 1 && size < c.next_sample {
-			c.next_sample -= size
+		if rate != 1 && size < c.nextSample {
+			c.nextSample -= size
 		} else {
 			mp := acquirem()
 			profilealloc(mp, x, size)
@@ -1221,7 +1221,7 @@ func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
 			throw("profilealloc called with no P")
 		}
 	}
-	c.next_sample = nextSample()
+	c.nextSample = nextSample()
 	mProf_Malloc(x, size)
 }
 
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index b8e388cc4f..c3e0e5e1f7 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -20,8 +20,8 @@ import (
 type mcache struct {
 	// The following members are accessed on every malloc,
 	// so they are grouped here for better caching.
-	next_sample uintptr // trigger heap sample after allocating this many bytes
-	local_scan  uintptr // bytes of scannable heap allocated
+	nextSample uintptr // trigger heap sample after allocating this many bytes
+	scanAlloc  uintptr // bytes of scannable heap allocated
 
 	// Allocator cache for tiny objects w/o pointers.
 	// See "Tiny allocator" comment in malloc.go.
@@ -48,13 +48,13 @@ type mcache struct {
 	// When read with stats from other mcaches and with the world
 	// stopped, the result will accurately reflect the state of the
 	// application.
-	local_tinyallocs  uintptr                  // number of tiny allocs not counted in other stats
-	local_largealloc  uintptr                  // bytes allocated for large objects
-	local_nlargealloc uintptr                  // number of large object allocations
-	local_nsmallalloc [_NumSizeClasses]uintptr // number of allocs for small objects
-	local_largefree   uintptr                  // bytes freed for large objects (>maxsmallsize)
-	local_nlargefree  uintptr                  // number of frees for large objects (>maxsmallsize)
-	local_nsmallfree  [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
+	tinyAllocCount  uintptr                  // number of tiny allocs not counted in other stats
+	largeAlloc      uintptr                  // bytes allocated for large objects
+	largeAllocCount uintptr                  // number of large object allocations
+	smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects
+	largeFree       uintptr                  // bytes freed for large objects (>maxSmallSize)
+	largeFreeCount  uintptr                  // number of frees for large objects (>maxSmallSize)
+	smallFreeCount  [_NumSizeClasses]uintptr // number of frees for small objects (<=maxSmallSize)
 
 	// flushGen indicates the sweepgen during which this mcache
 	// was last flushed. If flushGen != mheap_.sweepgen, the spans
@@ -103,7 +103,7 @@ func allocmcache() *mcache {
 	for i := range c.alloc {
 		c.alloc[i] = &emptymspan
 	}
-	c.next_sample = nextSample()
+	c.nextSample = nextSample()
 	return c
 }
 
@@ -134,26 +134,26 @@ func freemcache(c *mcache, recipient *mcache) {
 // donate flushes data and resources which have no global
 // pool to another mcache.
 func (c *mcache) donate(d *mcache) {
-	// local_scan is handled separately because it's not
+	// scanAlloc is handled separately because it's not
 	// like these stats -- it's used for GC pacing.
-	d.local_largealloc += c.local_largealloc
-	c.local_largealloc = 0
-	d.local_nlargealloc += c.local_nlargealloc
-	c.local_nlargealloc = 0
-	for i := range c.local_nsmallalloc {
-		d.local_nsmallalloc[i] += c.local_nsmallalloc[i]
-		c.local_nsmallalloc[i] = 0
+	d.largeAlloc += c.largeAlloc
+	c.largeAlloc = 0
+	d.largeAllocCount += c.largeAllocCount
+	c.largeAllocCount = 0
+	for i := range c.smallAllocCount {
+		d.smallAllocCount[i] += c.smallAllocCount[i]
+		c.smallAllocCount[i] = 0
 	}
-	d.local_largefree += c.local_largefree
-	c.local_largefree = 0
-	d.local_nlargefree += c.local_nlargefree
-	c.local_nlargefree = 0
-	for i := range c.local_nsmallfree {
-		d.local_nsmallfree[i] += c.local_nsmallfree[i]
-		c.local_nsmallfree[i] = 0
+	d.largeFree += c.largeFree
+	c.largeFree = 0
+	d.largeFreeCount += c.largeFreeCount
+	c.largeFreeCount = 0
+	for i := range c.smallFreeCount {
+		d.smallFreeCount[i] += c.smallFreeCount[i]
+		c.smallFreeCount[i] = 0
 	}
-	d.local_tinyallocs += c.local_tinyallocs
-	c.local_tinyallocs = 0
+	d.tinyAllocCount += c.tinyAllocCount
+	c.tinyAllocCount = 0
 }
 
 // refill acquires a new span of span class spc for c. This span will
@@ -192,16 +192,16 @@ func (c *mcache) refill(spc spanClass) {
 
 	// Assume all objects from this span will be allocated in the
 	// mcache. If it gets uncached, we'll adjust this.
-	c.local_nsmallalloc[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount)
+	c.smallAllocCount[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount)
 
 	// Update heap_live with the same assumption.
 	usedBytes := uintptr(s.allocCount) * s.elemsize
 	atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes))
 
-	// While we're here, flush local_scan, since we have to call
+	// While we're here, flush scanAlloc, since we have to call
 	// revise anyway.
-	atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan))
-	c.local_scan = 0
+	atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
+	c.scanAlloc = 0
 
 	if trace.enabled {
 		// heap_live changed.
@@ -215,8 +215,8 @@ func (c *mcache) refill(spc spanClass) {
 	c.alloc[spc] = s
 }
 
-// largeAlloc allocates a span for a large object.
-func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
+// allocLarge allocates a span for a large object.
+func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
 	if size+_PageSize < size {
 		throw("out of memory")
 	}
@@ -235,8 +235,8 @@ func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
 	if s == nil {
 		throw("out of memory")
 	}
-	c.local_largealloc += npages * pageSize
-	c.local_nlargealloc++
+	c.largeAlloc += npages * pageSize
+	c.largeAllocCount++
 
 	// Update heap_live and revise pacing if needed.
 	atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
@@ -257,9 +257,9 @@ func (c *mcache) largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
 }
 
 func (c *mcache) releaseAll() {
-	// Take this opportunity to flush local_scan.
-	atomic.Xadd64(&memstats.heap_scan, int64(c.local_scan))
-	c.local_scan = 0
+	// Take this opportunity to flush scanAlloc.
+	atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
+	c.scanAlloc = 0
 
 	sg := mheap_.sweepgen
 	for i := range c.alloc {
@@ -267,7 +267,7 @@ func (c *mcache) releaseAll() {
 		if s != &emptymspan {
 			// Adjust nsmallalloc in case the span wasn't fully allocated.
 			n := uintptr(s.nelems) - uintptr(s.allocCount)
-			c.local_nsmallalloc[spanClass(i).sizeclass()] -= n
+			c.smallAllocCount[spanClass(i).sizeclass()] -= n
 			if s.sweepgen != sg+1 {
 				// refill conservatively counted unallocated slots in heap_live.
 				// Undo this.
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 55554c117c..540c376f1c 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -2086,16 +2086,16 @@ func gcMark(start_time int64) {
 	// Update the marked heap stat.
 	memstats.heap_marked = work.bytesMarked
 
-	// Flush local_scan from each mcache since we're about to modify
-	// heap_scan directly. If we were to flush this later, then local_scan
+	// Flush scanAlloc from each mcache since we're about to modify
+	// heap_scan directly. If we were to flush this later, then scanAlloc
 	// might have incorrect information.
 	for _, p := range allp {
 		c := p.mcache
 		if c == nil {
 			continue
 		}
-		memstats.heap_scan += uint64(c.local_scan)
-		c.local_scan = 0
+		memstats.heap_scan += uint64(c.scanAlloc)
+		c.scanAlloc = 0
 	}
 
 	// Update other GC heap size stats. This must happen after
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index 6b8c56ce35..7103b08455 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -503,7 +503,7 @@ func (s *mspan) sweep(preserve bool) bool {
 			// wasn't totally filled, but then swept, still has all of its
 			// free slots zeroed.
 			s.needzero = 1
-			c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed)
+			c.smallFreeCount[spc.sizeclass()] += uintptr(nfreed)
 		}
 		if !preserve {
 			// The caller may not have removed this span from whatever
@@ -548,8 +548,8 @@ func (s *mspan) sweep(preserve bool) bool {
 			} else {
 				mheap_.freeSpan(s)
 			}
-			c.local_nlargefree++
-			c.local_largefree += size
+			c.largeFreeCount++
+			c.largeFree += size
 			return true
 		}
 
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 5eeb173640..64687c24e5 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -565,25 +565,25 @@ func updatememstats() {
 			continue
 		}
 		// Collect large allocation stats.
-		memstats.nmalloc += uint64(c.local_nlargealloc)
-		totalAlloc += uint64(c.local_largealloc)
-		totalFree += uint64(c.local_largefree)
-		memstats.nfree += uint64(c.local_nlargefree)
+		memstats.nmalloc += uint64(c.largeAllocCount)
+		totalAlloc += uint64(c.largeAlloc)
+		totalFree += uint64(c.largeFree)
+		memstats.nfree += uint64(c.largeFreeCount)
 
 		// Collect tiny allocation stats.
-		memstats.tinyallocs += uint64(c.local_tinyallocs)
+		memstats.tinyallocs += uint64(c.tinyAllocCount)
 
 		// Collect per-sizeclass stats.
 		for i := 0; i < _NumSizeClasses; i++ {
 			// Malloc stats.
-			memstats.nmalloc += uint64(c.local_nsmallalloc[i])
-			memstats.by_size[i].nmalloc += uint64(c.local_nsmallalloc[i])
-			totalAlloc += uint64(c.local_nsmallalloc[i]) * uint64(class_to_size[i])
+			memstats.nmalloc += uint64(c.smallAllocCount[i])
+			memstats.by_size[i].nmalloc += uint64(c.smallAllocCount[i])
+			totalAlloc += uint64(c.smallAllocCount[i]) * uint64(class_to_size[i])
 
 			// Free stats.
-			memstats.nfree += uint64(c.local_nsmallfree[i])
-			memstats.by_size[i].nfree += uint64(c.local_nsmallfree[i])
-			smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i])
+			memstats.nfree += uint64(c.smallFreeCount[i])
+			memstats.by_size[i].nfree += uint64(c.smallFreeCount[i])
+			smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i])
 		}
 	}
 
diff --git a/src/runtime/pprof/mprof_test.go b/src/runtime/pprof/mprof_test.go
index f253f07def..c11a45fd69 100644
--- a/src/runtime/pprof/mprof_test.go
+++ b/src/runtime/pprof/mprof_test.go
@@ -70,7 +70,7 @@ func TestMemoryProfiler(t *testing.T) {
 		runtime.MemProfileRate = oldRate
 	}()
 
-	// Allocate a meg to ensure that mcache.next_sample is updated to 1.
+	// Allocate a meg to ensure that mcache.nextSample is updated to 1.
 	for i := 0; i < 1024; i++ {
 		memSink = make([]byte, 1024)
 	}
-- 
cgit v1.3


From 8ebc58452af3a586a3da1f68725bc83c78d4b073 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Wed, 29 Jul 2020 20:25:05 +0000
Subject: runtime: delineate which memstats are system stats with a type

This change modifies the type of several mstats fields to be a new type:
sysMemStat. This type has the same structure as the fields used to have.

The purpose of this change is to make it very clear which stats may be
used in various functions for accounting (usually the platform-specific
sys* functions, but there are others). Currently there's an implicit
understanding that the *uint64 value passed to these functions is some
kind of statistic whose value is atomically managed. This understanding
isn't inherently problematic, but we're about to change how some stats
(which currently use mSysStatInc and mSysStatDec) work, so we want to
make it very clear what the various requirements are around "sysStat".

This change also removes mSysStatInc and mSysStatDec in favor of a
method on sysMemStat. Note that those two functions were originally
written the way they were because atomic 64-bit adds required a valid G
on ARM, but this hasn't been the case for a very long time (since
golang.org/cl/14204, but even before then it wasn't clear if mutexes
required a valid G anymore). Today we implement 64-bit adds on ARM with
a spinlock table.

Change-Id: I4e9b37cf14afc2ae20cf736e874eb0064af086d7
Reviewed-on: https://go-review.googlesource.com/c/go/+/246971
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/export_test.go |  4 +--
 src/runtime/heapdump.go    | 14 ++++----
 src/runtime/malloc.go      | 10 +++---
 src/runtime/mem_aix.go     | 12 +++----
 src/runtime/mem_bsd.go     | 12 +++----
 src/runtime/mem_darwin.go  | 12 +++----
 src/runtime/mem_js.go      | 10 +++---
 src/runtime/mem_linux.go   | 12 +++----
 src/runtime/mem_plan9.go   | 12 +++----
 src/runtime/mem_windows.go | 12 +++----
 src/runtime/mfixalloc.go   |  4 +--
 src/runtime/mgcscavenge.go |  4 +--
 src/runtime/mheap.go       | 28 ++++++++--------
 src/runtime/mpagealloc.go  |  4 +--
 src/runtime/mranges.go     |  4 +--
 src/runtime/mstats.go      | 82 +++++++++++++++++-----------------------------
 src/runtime/os_darwin.go   |  3 +-
 17 files changed, 109 insertions(+), 130 deletions(-)

(limited to 'src/runtime/malloc.go')

diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index 47cbc286f6..cb753ee819 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -820,7 +820,7 @@ type AddrRanges struct {
 // Add.
 func NewAddrRanges() AddrRanges {
 	r := addrRanges{}
-	r.init(new(uint64))
+	r.init(new(sysMemStat))
 	return AddrRanges{r, true}
 }
 
@@ -844,7 +844,7 @@ func MakeAddrRanges(a ...AddrRange) AddrRanges {
 	return AddrRanges{addrRanges{
 		ranges:     ranges,
 		totalBytes: total,
-		sysStat:    new(uint64),
+		sysStat:    new(sysMemStat),
 	}, false}
 }
 
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index 4c35309211..495ecc5164 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -548,20 +548,20 @@ func dumpmemstats() {
 	dumpint(memstats.nmalloc)
 	dumpint(memstats.nfree)
 	dumpint(memstats.heap_alloc)
-	dumpint(memstats.heap_sys)
+	dumpint(memstats.heap_sys.load())
 	dumpint(memstats.heap_idle)
 	dumpint(memstats.heap_inuse)
 	dumpint(memstats.heap_released)
 	dumpint(memstats.heap_objects)
 	dumpint(memstats.stacks_inuse)
-	dumpint(memstats.stacks_sys)
+	dumpint(memstats.stacks_sys.load())
 	dumpint(memstats.mspan_inuse)
-	dumpint(memstats.mspan_sys)
+	dumpint(memstats.mspan_sys.load())
 	dumpint(memstats.mcache_inuse)
-	dumpint(memstats.mcache_sys)
-	dumpint(memstats.buckhash_sys)
-	dumpint(memstats.gc_sys)
-	dumpint(memstats.other_sys)
+	dumpint(memstats.mcache_sys.load())
+	dumpint(memstats.buckhash_sys.load())
+	dumpint(memstats.gc_sys.load())
+	dumpint(memstats.other_sys.load())
 	dumpint(memstats.next_gc)
 	dumpint(memstats.last_gc_unix)
 	dumpint(memstats.pause_total_ns)
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 0f48d7f68e..27d678d917 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -1313,7 +1313,7 @@ var persistentChunks *notInHeap
 // The returned memory will be zeroed.
 //
 // Consider marking persistentalloc'd types go:notinheap.
-func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
+func persistentalloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer {
 	var p *notInHeap
 	systemstack(func() {
 		p = persistentalloc1(size, align, sysStat)
@@ -1324,7 +1324,7 @@ func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
 // Must run on system stack because stack growth can (re)invoke it.
 // See issue 9174.
 //go:systemstack
-func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap {
+func persistentalloc1(size, align uintptr, sysStat *sysMemStat) *notInHeap {
 	const (
 		maxBlock = 64 << 10 // VM reservation granularity is 64K on windows
 	)
@@ -1383,8 +1383,8 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap {
 	}
 
 	if sysStat != &memstats.other_sys {
-		mSysStatInc(sysStat, size)
-		mSysStatDec(&memstats.other_sys, size)
+		sysStat.add(int64(size))
+		memstats.other_sys.add(-int64(size))
 	}
 	return p
 }
@@ -1425,7 +1425,7 @@ func (l *linearAlloc) init(base, size uintptr) {
 	l.end = base + size
 }
 
-func (l *linearAlloc) alloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
+func (l *linearAlloc) alloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer {
 	p := alignUp(l.next, align)
 	if p+size > l.end {
 		return nil
diff --git a/src/runtime/mem_aix.go b/src/runtime/mem_aix.go
index 7e145b072a..957aa4dcc2 100644
--- a/src/runtime/mem_aix.go
+++ b/src/runtime/mem_aix.go
@@ -11,7 +11,7 @@ import (
 // Don't split the stack as this method may be invoked without a valid G, which
 // prevents us from allocating more stack.
 //go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
 	p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
 	if err != 0 {
 		if err == _EACCES {
@@ -24,7 +24,7 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
 		}
 		return nil
 	}
-	mSysStatInc(sysStat, n)
+	sysStat.add(int64(n))
 	return p
 }
 
@@ -41,8 +41,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(-int64(n))
 	munmap(v, n)
 
 }
@@ -59,8 +59,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
 	return p
 }
 
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(int64(n))
 
 	// AIX does not allow mapping a range that is already mapped.
 	// So, call mprotect to change permissions.
diff --git a/src/runtime/mem_bsd.go b/src/runtime/mem_bsd.go
index 4d860e7bd3..bc672019fb 100644
--- a/src/runtime/mem_bsd.go
+++ b/src/runtime/mem_bsd.go
@@ -13,12 +13,12 @@ import (
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
 	v, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
 	if err != 0 {
 		return nil
 	}
-	mSysStatInc(sysStat, n)
+	sysStat.add(int64(n))
 	return v
 }
 
@@ -35,8 +35,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(-int64(n))
 	munmap(v, n)
 }
 
@@ -65,8 +65,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
 const _sunosEAGAIN = 11
 const _ENOMEM = 12
 
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(int64(n))
 
 	p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
 	if err == _ENOMEM || ((GOOS == "solaris" || GOOS == "illumos") && err == _sunosEAGAIN) {
diff --git a/src/runtime/mem_darwin.go b/src/runtime/mem_darwin.go
index 3b5d565b0f..7fccd2bb8e 100644
--- a/src/runtime/mem_darwin.go
+++ b/src/runtime/mem_darwin.go
@@ -11,12 +11,12 @@ import (
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
 	v, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
 	if err != 0 {
 		return nil
 	}
-	mSysStatInc(sysStat, n)
+	sysStat.add(int64(n))
 	return v
 }
 
@@ -39,8 +39,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(-int64(n))
 	munmap(v, n)
 }
 
@@ -58,8 +58,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
 
 const _ENOMEM = 12
 
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(int64(n))
 
 	p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
 	if err == _ENOMEM {
diff --git a/src/runtime/mem_js.go b/src/runtime/mem_js.go
index 092b3d4fa2..957ed36ffa 100644
--- a/src/runtime/mem_js.go
+++ b/src/runtime/mem_js.go
@@ -13,7 +13,7 @@ import (
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
 	p := sysReserve(nil, n)
 	sysMap(p, n, sysStat)
 	return p
@@ -31,8 +31,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(-int64(n))
 }
 
 func sysFault(v unsafe.Pointer, n uintptr) {
@@ -80,6 +80,6 @@ func growMemory(pages int32) int32
 // This allows the front-end to replace the old DataView object with a new one.
 func resetMemoryDataView()
 
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(int64(n))
 }
diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go
index 59b0bca970..3436851091 100644
--- a/src/runtime/mem_linux.go
+++ b/src/runtime/mem_linux.go
@@ -17,7 +17,7 @@ const (
 // Don't split the stack as this method may be invoked without a valid G, which
 // prevents us from allocating more stack.
 //go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
 	p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
 	if err != 0 {
 		if err == _EACCES {
@@ -30,7 +30,7 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
 		}
 		return nil
 	}
-	mSysStatInc(sysStat, n)
+	sysStat.add(int64(n))
 	return p
 }
 
@@ -144,8 +144,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(-int64(n))
 	munmap(v, n)
 }
 
@@ -161,8 +161,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
 	return p
 }
 
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(int64(n))
 
 	p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
 	if err == _ENOMEM {
diff --git a/src/runtime/mem_plan9.go b/src/runtime/mem_plan9.go
index 4fea851cdd..53d8e6dffa 100644
--- a/src/runtime/mem_plan9.go
+++ b/src/runtime/mem_plan9.go
@@ -140,19 +140,19 @@ func sbrk(n uintptr) unsafe.Pointer {
 	return unsafe.Pointer(bl)
 }
 
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
 	lock(&memlock)
 	p := memAlloc(n)
 	memCheck()
 	unlock(&memlock)
 	if p != nil {
-		mSysStatInc(sysStat, n)
+		sysStat.add(int64(n))
 	}
 	return p
 }
 
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(-int64(n))
 	lock(&memlock)
 	if uintptr(v)+n == bloc {
 		// Address range being freed is at the end of memory,
@@ -176,10 +176,10 @@ func sysUsed(v unsafe.Pointer, n uintptr) {
 func sysHugePage(v unsafe.Pointer, n uintptr) {
 }
 
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
 	// sysReserve has already allocated all heap memory,
 	// but has not adjusted stats.
-	mSysStatInc(sysStat, n)
+	sysStat.add(int64(n))
 }
 
 func sysFault(v unsafe.Pointer, n uintptr) {
diff --git a/src/runtime/mem_windows.go b/src/runtime/mem_windows.go
index 165062ec27..3a805b9767 100644
--- a/src/runtime/mem_windows.go
+++ b/src/runtime/mem_windows.go
@@ -24,8 +24,8 @@ const (
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
-	mSysStatInc(sysStat, n)
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
+	sysStat.add(int64(n))
 	return unsafe.Pointer(stdcall4(_VirtualAlloc, 0, n, _MEM_COMMIT|_MEM_RESERVE, _PAGE_READWRITE))
 }
 
@@ -97,8 +97,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(-int64(n))
 	r := stdcall3(_VirtualFree, uintptr(v), 0, _MEM_RELEASE)
 	if r == 0 {
 		print("runtime: VirtualFree of ", n, " bytes failed with errno=", getlasterror(), "\n")
@@ -124,6 +124,6 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
 	return unsafe.Pointer(stdcall4(_VirtualAlloc, 0, n, _MEM_RESERVE, _PAGE_READWRITE))
 }
 
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
-	mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+	sysStat.add(int64(n))
 }
diff --git a/src/runtime/mfixalloc.go b/src/runtime/mfixalloc.go
index f9dd6ca474..293c16b38b 100644
--- a/src/runtime/mfixalloc.go
+++ b/src/runtime/mfixalloc.go
@@ -32,7 +32,7 @@ type fixalloc struct {
 	chunk  uintptr // use uintptr instead of unsafe.Pointer to avoid write barriers
 	nchunk uint32
 	inuse  uintptr // in-use bytes now
-	stat   *uint64
+	stat   *sysMemStat
 	zero   bool // zero allocations
 }
 
@@ -49,7 +49,7 @@ type mlink struct {
 
 // Initialize f to allocate objects of the given size,
 // using the allocator to obtain chunks of memory.
-func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *uint64) {
+func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *sysMemStat) {
 	f.size = size
 	f.first = first
 	f.arg = arg
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index 6328b295ca..8b1a0be353 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -100,7 +100,7 @@ const (
 
 // heapRetained returns an estimate of the current heap RSS.
 func heapRetained() uint64 {
-	return atomic.Load64(&memstats.heap_sys) - atomic.Load64(&memstats.heap_released)
+	return memstats.heap_sys.load() - atomic.Load64(&memstats.heap_released)
 }
 
 // gcPaceScavenger updates the scavenger's pacing, particularly
@@ -711,7 +711,7 @@ func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr
 
 	// Update global accounting only when not in test, otherwise
 	// the runtime's accounting will be wrong.
-	mSysStatInc(&memstats.heap_released, uintptr(npages)*pageSize)
+	atomic.Xadd64(&memstats.heap_released, int64(npages)*pageSize)
 	return addr
 }
 
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index df659e222b..27c1bfbcf1 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -1222,22 +1222,22 @@ HaveSpan:
 		// sysUsed all the pages that are actually available
 		// in the span since some of them might be scavenged.
 		sysUsed(unsafe.Pointer(base), nbytes)
-		mSysStatDec(&memstats.heap_released, scav)
+		atomic.Xadd64(&memstats.heap_released, -int64(scav))
 	}
 	// Update stats.
 	switch typ {
 	case spanAllocHeap:
-		mSysStatInc(&memstats.heap_inuse, nbytes)
+		atomic.Xadd64(&memstats.heap_inuse, int64(nbytes))
 	case spanAllocStack:
-		mSysStatInc(&memstats.stacks_inuse, nbytes)
+		atomic.Xadd64(&memstats.stacks_inuse, int64(nbytes))
 	case spanAllocPtrScalarBits, spanAllocWorkBuf:
-		mSysStatInc(&memstats.gc_sys, nbytes)
+		memstats.gc_sys.add(int64(nbytes))
 	}
 	if typ.manual() {
 		// Manually managed memory doesn't count toward heap_sys.
-		mSysStatDec(&memstats.heap_sys, nbytes)
+		memstats.heap_sys.add(-int64(nbytes))
 	}
-	mSysStatDec(&memstats.heap_idle, nbytes)
+	atomic.Xadd64(&memstats.heap_idle, -int64(nbytes))
 
 	// Publish the span in various locations.
 
@@ -1314,8 +1314,8 @@ func (h *mheap) grow(npage uintptr) bool {
 		// The allocation is always aligned to the heap arena
 		// size which is always > physPageSize, so its safe to
 		// just add directly to heap_released.
-		mSysStatInc(&memstats.heap_released, asize)
-		mSysStatInc(&memstats.heap_idle, asize)
+		atomic.Xadd64(&memstats.heap_released, int64(asize))
+		atomic.Xadd64(&memstats.heap_idle, int64(asize))
 
 		// Recalculate nBase.
 		// We know this won't overflow, because sysAlloc returned
@@ -1400,18 +1400,20 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) {
 	// Update stats.
 	//
 	// Mirrors the code in allocSpan.
+	nbytes := s.npages * pageSize
 	switch typ {
 	case spanAllocHeap:
-		mSysStatDec(&memstats.heap_inuse, s.npages*pageSize)
+		atomic.Xadd64(&memstats.heap_inuse, -int64(nbytes))
 	case spanAllocStack:
-		mSysStatDec(&memstats.stacks_inuse, s.npages*pageSize)
+		atomic.Xadd64(&memstats.stacks_inuse, -int64(nbytes))
 	case spanAllocPtrScalarBits, spanAllocWorkBuf:
-		mSysStatDec(&memstats.gc_sys, s.npages*pageSize)
+		memstats.gc_sys.add(-int64(nbytes))
 	}
 	if typ.manual() {
-		mSysStatInc(&memstats.heap_sys, s.npages*pageSize)
+		// Manually managed memory doesn't count toward heap_sys, so add it back.
+		memstats.heap_sys.add(int64(nbytes))
 	}
-	mSysStatInc(&memstats.heap_idle, s.npages*pageSize)
+	atomic.Xadd64(&memstats.heap_idle, int64(nbytes))
 
 	// Mark the space as free.
 	h.pages.free(s.base(), s.npages)
diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go
index 560babed03..2af1c97e0b 100644
--- a/src/runtime/mpagealloc.go
+++ b/src/runtime/mpagealloc.go
@@ -293,13 +293,13 @@ type pageAlloc struct {
 
 	// sysStat is the runtime memstat to update when new system
 	// memory is committed by the pageAlloc for allocation metadata.
-	sysStat *uint64
+	sysStat *sysMemStat
 
 	// Whether or not this struct is being used in tests.
 	test bool
 }
 
-func (p *pageAlloc) init(mheapLock *mutex, sysStat *uint64) {
+func (p *pageAlloc) init(mheapLock *mutex, sysStat *sysMemStat) {
 	if levelLogPages[0] > logMaxPackedValue {
 		// We can't represent 1<<levelLogPages[0] pages, the maximum number
 		// of pages we need to represent at the root level, in a summary, which
diff --git a/src/runtime/mranges.go b/src/runtime/mranges.go
index 1109f506a6..16acadcff1 100644
--- a/src/runtime/mranges.go
+++ b/src/runtime/mranges.go
@@ -160,10 +160,10 @@ type addrRanges struct {
 	totalBytes uintptr
 
 	// sysStat is the stat to track allocations by this type
-	sysStat *uint64
+	sysStat *sysMemStat
 }
 
-func (a *addrRanges) init(sysStat *uint64) {
+func (a *addrRanges) init(sysStat *sysMemStat) {
 	ranges := (*notInHeapSlice)(unsafe.Pointer(&a.ranges))
 	ranges.len = 0
 	ranges.cap = 16
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 64687c24e5..571a9c9ce3 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -8,7 +8,6 @@ package runtime
 
 import (
 	"runtime/internal/atomic"
-	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -35,11 +34,11 @@ type mstats struct {
 	//
 	// Like MemStats, heap_sys and heap_inuse do not count memory
 	// in manually-managed spans.
-	heap_alloc    uint64 // bytes allocated and not yet freed (same as alloc above)
-	heap_sys      uint64 // virtual address space obtained from system for GC'd heap
-	heap_idle     uint64 // bytes in idle spans
-	heap_inuse    uint64 // bytes in mSpanInUse spans
-	heap_released uint64 // bytes released to the os
+	heap_alloc    uint64     // bytes allocated and not yet freed (same as alloc above)
+	heap_sys      sysMemStat // virtual address space obtained from system for GC'd heap
+	heap_idle     uint64     // bytes in idle spans
+	heap_inuse    uint64     // bytes in mSpanInUse spans
+	heap_released uint64     // bytes released to the os
 
 	// heap_objects is not used by the runtime directly and instead
 	// computed on the fly by updatememstats.
@@ -47,15 +46,15 @@ type mstats struct {
 
 	// Statistics about allocation of low-level fixed-size structures.
 	// Protected by FixAlloc locks.
-	stacks_inuse uint64 // bytes in manually-managed stack spans; updated atomically or during STW
-	stacks_sys   uint64 // only counts newosproc0 stack in mstats; differs from MemStats.StackSys
-	mspan_inuse  uint64 // mspan structures
-	mspan_sys    uint64
+	stacks_inuse uint64     // bytes in manually-managed stack spans; updated atomically or during STW
+	stacks_sys   sysMemStat // only counts newosproc0 stack in mstats; differs from MemStats.StackSys
+	mspan_inuse  uint64     // mspan structures
+	mspan_sys    sysMemStat
 	mcache_inuse uint64 // mcache structures
-	mcache_sys   uint64
-	buckhash_sys uint64 // profiling bucket hash table
-	gc_sys       uint64 // updated atomically or during STW
-	other_sys    uint64 // updated atomically or during STW
+	mcache_sys   sysMemStat
+	buckhash_sys sysMemStat // profiling bucket hash table
+	gc_sys       sysMemStat // updated atomically or during STW
+	other_sys    sysMemStat // updated atomically or during STW
 
 	// Statistics about the garbage collector.
 
@@ -533,8 +532,9 @@ func updatememstats() {
 
 	memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse)
 	memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse)
-	memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys +
-		memstats.mcache_sys + memstats.buckhash_sys + memstats.gc_sys + memstats.other_sys
+	memstats.sys = memstats.heap_sys.load() + memstats.stacks_sys.load() + memstats.mspan_sys.load() +
+		memstats.mcache_sys.load() + memstats.buckhash_sys.load() + memstats.gc_sys.load() +
+		memstats.other_sys.load()
 
 	// We also count stacks_inuse as sys memory.
 	memstats.sys += memstats.stacks_inuse
@@ -625,46 +625,24 @@ func flushallmcaches() {
 	}
 }
 
-// Atomically increases a given *system* memory stat. We are counting on this
-// stat never overflowing a uintptr, so this function must only be used for
-// system memory stats.
+// sysMemStat represents a global system statistic that is managed atomically.
 //
-// The current implementation for little endian architectures is based on
-// xadduintptr(), which is less than ideal: xadd64() should really be used.
-// Using xadduintptr() is a stop-gap solution until arm supports xadd64() that
-// doesn't use locks.  (Locks are a problem as they require a valid G, which
-// restricts their useability.)
-//
-// A side-effect of using xadduintptr() is that we need to check for
-// overflow errors.
-//go:nosplit
-func mSysStatInc(sysStat *uint64, n uintptr) {
-	if sysStat == nil {
-		return
-	}
-	if sys.BigEndian {
-		atomic.Xadd64(sysStat, int64(n))
-		return
-	}
-	if val := atomic.Xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), n); val < n {
-		print("runtime: stat overflow: val ", val, ", n ", n, "\n")
-		exit(2)
-	}
+// This type must structurally be a uint64 so that mstats aligns with MemStats.
+type sysMemStat uint64
+
+// load atomically reads the value of the stat.
+func (s *sysMemStat) load() uint64 {
+	return atomic.Load64((*uint64)(s))
 }
 
-// Atomically decreases a given *system* memory stat. Same comments as
-// mSysStatInc apply.
-//go:nosplit
-func mSysStatDec(sysStat *uint64, n uintptr) {
-	if sysStat == nil {
-		return
-	}
-	if sys.BigEndian {
-		atomic.Xadd64(sysStat, -int64(n))
+// add atomically adds the sysMemStat by n.
+func (s *sysMemStat) add(n int64) {
+	if s == nil {
 		return
 	}
-	if val := atomic.Xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), uintptr(-int64(n))); val+n < n {
-		print("runtime: stat underflow: val ", val, ", n ", n, "\n")
-		exit(2)
+	val := atomic.Xadd64((*uint64)(s), n)
+	if (n > 0 && int64(val) < n) || (n < 0 && int64(val)+n < n) {
+		print("runtime: val=", val, " n=", n, "\n")
+		throw("sysMemStat overflow")
 	}
 }
diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go
index 394bd6fb0f..3f5bb7cf96 100644
--- a/src/runtime/os_darwin.go
+++ b/src/runtime/os_darwin.go
@@ -198,7 +198,6 @@ func newosproc(mp *m) {
 		exit(1)
 	}
 	mp.g0.stack.hi = stacksize // for mstart
-	//mSysStatInc(&memstats.stacks_sys, stacksize) //TODO: do this?
 
 	// Tell the pthread library we won't join with this thread.
 	if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 {
@@ -247,7 +246,7 @@ func newosproc0(stacksize uintptr, fn uintptr) {
 		exit(1)
 	}
 	g0.stack.hi = stacksize // for mstart
-	mSysStatInc(&memstats.stacks_sys, stacksize)
+	memstats.stacks_sys.add(int64(stacksize))
 
 	// Tell the pthread library we won't join with this thread.
 	if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 {
-- 
cgit v1.3


From ad863ba32a2ede207d708fa15897e9de1d14dd87 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Mon, 3 Aug 2020 19:23:30 +0000
Subject: runtime: break down memstats.gc_sys

This change breaks apart gc_sys into three distinct pieces. Two of those
pieces are pieces which come from heap_sys since they're allocated from
the page heap. The rest comes from memory mapped from e.g.
persistentalloc which better fits the purpose of a sysMemStat. Also,
rename gc_sys to gcMiscSys.

Change-Id: I098789170052511e7b31edbcdc9a53e5c24573f7
Reviewed-on: https://go-review.googlesource.com/c/go/+/246973
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/heapdump.go   |  5 ++++-
 src/runtime/malloc.go     |  6 +++---
 src/runtime/mcheckmark.go |  2 +-
 src/runtime/mfinal.go     |  2 +-
 src/runtime/mheap.go      | 16 ++++++++++------
 src/runtime/mspanset.go   |  4 ++--
 src/runtime/mstats.go     | 31 ++++++++++++++++++-------------
 7 files changed, 39 insertions(+), 27 deletions(-)

(limited to 'src/runtime/malloc.go')

diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index 495ecc5164..eed47930f0 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -540,6 +540,9 @@ func dumpms() {
 }
 
 func dumpmemstats() {
+	// These ints should be identical to the exported
+	// MemStats structure and should be ordered the same
+	// way too.
 	dumpint(tagMemStats)
 	dumpint(memstats.alloc)
 	dumpint(memstats.total_alloc)
@@ -560,7 +563,7 @@ func dumpmemstats() {
 	dumpint(memstats.mcache_inuse)
 	dumpint(memstats.mcache_sys.load())
 	dumpint(memstats.buckhash_sys.load())
-	dumpint(memstats.gc_sys.load())
+	dumpint(memstats.gcMiscSys.load() + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse)
 	dumpint(memstats.other_sys.load())
 	dumpint(memstats.next_gc)
 	dumpint(memstats.last_gc_unix)
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 27d678d917..ee22bad58c 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -743,9 +743,9 @@ mapped:
 			throw("arena already initialized")
 		}
 		var r *heapArena
-		r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gc_sys))
+		r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gcMiscSys))
 		if r == nil {
-			r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gc_sys))
+			r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gcMiscSys))
 			if r == nil {
 				throw("out of memory allocating heap arena metadata")
 			}
@@ -757,7 +757,7 @@ mapped:
 			if size == 0 {
 				size = physPageSize
 			}
-			newArray := (*notInHeap)(persistentalloc(size, sys.PtrSize, &memstats.gc_sys))
+			newArray := (*notInHeap)(persistentalloc(size, sys.PtrSize, &memstats.gcMiscSys))
 			if newArray == nil {
 				throw("out of memory allocating allArenas")
 			}
diff --git a/src/runtime/mcheckmark.go b/src/runtime/mcheckmark.go
index 1fd8e4e78f..c0b028d715 100644
--- a/src/runtime/mcheckmark.go
+++ b/src/runtime/mcheckmark.go
@@ -41,7 +41,7 @@ func startCheckmarks() {
 
 		if bitmap == nil {
 			// Allocate bitmap on first use.
-			bitmap = (*checkmarksMap)(persistentalloc(unsafe.Sizeof(*bitmap), 0, &memstats.gc_sys))
+			bitmap = (*checkmarksMap)(persistentalloc(unsafe.Sizeof(*bitmap), 0, &memstats.gcMiscSys))
 			if bitmap == nil {
 				throw("out of memory allocating checkmarks bitmap")
 			}
diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go
index 6676ae6736..6ec5133be0 100644
--- a/src/runtime/mfinal.go
+++ b/src/runtime/mfinal.go
@@ -88,7 +88,7 @@ func queuefinalizer(p unsafe.Pointer, fn *funcval, nret uintptr, fint *_type, ot
 	lock(&finlock)
 	if finq == nil || finq.cnt == uint32(len(finq.fin)) {
 		if finc == nil {
-			finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gc_sys))
+			finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gcMiscSys))
 			finc.alllink = allfin
 			allfin = finc
 			if finptrmask[0] == 0 {
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 27c1bfbcf1..1624a04b9d 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -713,7 +713,7 @@ func (h *mheap) init() {
 		h.central[i].mcentral.init(spanClass(i))
 	}
 
-	h.pages.init(&h.lock, &memstats.gc_sys)
+	h.pages.init(&h.lock, &memstats.gcMiscSys)
 }
 
 // reclaim sweeps and reclaims at least npage pages into the heap.
@@ -1230,8 +1230,10 @@ HaveSpan:
 		atomic.Xadd64(&memstats.heap_inuse, int64(nbytes))
 	case spanAllocStack:
 		atomic.Xadd64(&memstats.stacks_inuse, int64(nbytes))
-	case spanAllocPtrScalarBits, spanAllocWorkBuf:
-		memstats.gc_sys.add(int64(nbytes))
+	case spanAllocWorkBuf:
+		atomic.Xadd64(&memstats.gcWorkBufInUse, int64(nbytes))
+	case spanAllocPtrScalarBits:
+		atomic.Xadd64(&memstats.gcProgPtrScalarBitsInUse, int64(nbytes))
 	}
 	if typ.manual() {
 		// Manually managed memory doesn't count toward heap_sys.
@@ -1406,8 +1408,10 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) {
 		atomic.Xadd64(&memstats.heap_inuse, -int64(nbytes))
 	case spanAllocStack:
 		atomic.Xadd64(&memstats.stacks_inuse, -int64(nbytes))
-	case spanAllocPtrScalarBits, spanAllocWorkBuf:
-		memstats.gc_sys.add(-int64(nbytes))
+	case spanAllocWorkBuf:
+		atomic.Xadd64(&memstats.gcWorkBufInUse, -int64(nbytes))
+	case spanAllocPtrScalarBits:
+		atomic.Xadd64(&memstats.gcProgPtrScalarBitsInUse, -int64(nbytes))
 	}
 	if typ.manual() {
 		// Manually managed memory doesn't count toward heap_sys, so add it back.
@@ -1956,7 +1960,7 @@ func newArenaMayUnlock() *gcBitsArena {
 	var result *gcBitsArena
 	if gcBitsArenas.free == nil {
 		unlock(&gcBitsArenas.lock)
-		result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys))
+		result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gcMiscSys))
 		if result == nil {
 			throw("runtime: cannot allocate memory")
 		}
diff --git a/src/runtime/mspanset.go b/src/runtime/mspanset.go
index 490eed4549..10d2596c38 100644
--- a/src/runtime/mspanset.go
+++ b/src/runtime/mspanset.go
@@ -102,7 +102,7 @@ retry:
 			if newCap == 0 {
 				newCap = spanSetInitSpineCap
 			}
-			newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gc_sys)
+			newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gcMiscSys)
 			if b.spineCap != 0 {
 				// Blocks are allocated off-heap, so
 				// no write barriers.
@@ -283,7 +283,7 @@ func (p *spanSetBlockAlloc) alloc() *spanSetBlock {
 	if s := (*spanSetBlock)(p.stack.pop()); s != nil {
 		return s
 	}
-	return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gc_sys))
+	return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gcMiscSys))
 }
 
 // free returns a spanSetBlock back to the pool.
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 466f33836c..967fe6e2be 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -44,15 +44,17 @@ type mstats struct {
 
 	// Statistics about allocation of low-level fixed-size structures.
 	// Protected by FixAlloc locks.
-	stacks_inuse uint64     // bytes in manually-managed stack spans; updated atomically or during STW
-	stacks_sys   sysMemStat // only counts newosproc0 stack in mstats; differs from MemStats.StackSys
-	mspan_inuse  uint64     // mspan structures
-	mspan_sys    sysMemStat
-	mcache_inuse uint64 // mcache structures
-	mcache_sys   sysMemStat
-	buckhash_sys sysMemStat // profiling bucket hash table
-	gc_sys       sysMemStat // updated atomically or during STW
-	other_sys    sysMemStat // updated atomically or during STW
+	stacks_inuse             uint64     // bytes in manually-managed stack spans; updated atomically or during STW
+	stacks_sys               sysMemStat // only counts newosproc0 stack in mstats; differs from MemStats.StackSys
+	mspan_inuse              uint64     // mspan structures
+	mspan_sys                sysMemStat
+	mcache_inuse             uint64 // mcache structures
+	mcache_sys               sysMemStat
+	buckhash_sys             sysMemStat // profiling bucket hash table
+	gcWorkBufInUse           uint64     // updated atomically or during STW
+	gcProgPtrScalarBitsInUse uint64     // updated atomically or during STW
+	gcMiscSys                sysMemStat // updated atomically or during STW
+	other_sys                sysMemStat // updated atomically or during STW
 
 	// Statistics about the garbage collector.
 
@@ -472,7 +474,10 @@ func readmemstats_m(stats *MemStats) {
 	stats.MCacheInuse = memstats.mcache_inuse
 	stats.MCacheSys = memstats.mcache_sys.load()
 	stats.BuckHashSys = memstats.buckhash_sys.load()
-	stats.GCSys = memstats.gc_sys.load()
+	// MemStats defines GCSys as an aggregate of all memory related
+	// to the memory management system, but we track this memory
+	// at a more granular level in the runtime.
+	stats.GCSys = memstats.gcMiscSys.load() + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse
 	stats.OtherSys = memstats.other_sys.load()
 	stats.NextGC = memstats.next_gc
 	stats.LastGC = memstats.last_gc_unix
@@ -557,11 +562,11 @@ func updatememstats() {
 	memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse)
 	memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse)
 	memstats.sys = memstats.heap_sys.load() + memstats.stacks_sys.load() + memstats.mspan_sys.load() +
-		memstats.mcache_sys.load() + memstats.buckhash_sys.load() + memstats.gc_sys.load() +
+		memstats.mcache_sys.load() + memstats.buckhash_sys.load() + memstats.gcMiscSys.load() +
 		memstats.other_sys.load()
 
-	// We also count stacks_inuse as sys memory.
-	memstats.sys += memstats.stacks_inuse
+	// We also count stacks_inuse, gcWorkBufInUse, and gcProgPtrScalarBitsInUse as sys memory.
+	memstats.sys += memstats.stacks_inuse + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse
 
 	// Calculate memory allocator stats.
 	// During program execution we only count number of frees and amount of freed memory.
-- 
cgit v1.3


From c02134abb01e019683daf051029d66b15dd11213 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Mon, 3 Aug 2020 20:08:25 +0000
Subject: runtime: add helper for getting an mcache in allocation contexts

This change adds a function getMCache which returns the current P's
mcache if it's available, and otherwise tries to get mcache0 if we're
bootstrapping. This function will come in handy as we need to replicate
this behavior in multiple places in future changes.

Change-Id: I536073d6f6dc6c6390269e613ead9f8bcb6e7f98
Reviewed-on: https://go-review.googlesource.com/c/go/+/246976
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/malloc.go | 25 ++-----------------------
 src/runtime/mcache.go | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+), 23 deletions(-)

(limited to 'src/runtime/malloc.go')

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index ee22bad58c..6383c34817 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -972,19 +972,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 
 	shouldhelpgc := false
 	dataSize := size
-	var c *mcache
-	if mp.p != 0 {
-		c = mp.p.ptr().mcache
-	} else {
-		// We will be called without a P while bootstrapping,
-		// in which case we use mcache0, which is set in mallocinit.
-		// mcache0 is cleared when bootstrapping is complete,
-		// by procresize.
-		c = mcache0
-		if c == nil {
-			throw("malloc called with no P")
-		}
-	}
+	c := getMCache()
 	var span *mspan
 	var x unsafe.Pointer
 	noscan := typ == nil || typ.ptrdata == 0
@@ -1212,16 +1200,7 @@ func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer {
 }
 
 func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
-	var c *mcache
-	if mp.p != 0 {
-		c = mp.p.ptr().mcache
-	} else {
-		c = mcache0
-		if c == nil {
-			throw("profilealloc called with no P")
-		}
-	}
-	c.nextSample = nextSample()
+	getMCache().nextSample = nextSample()
 	mProf_Malloc(x, size)
 }
 
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index c3e0e5e1f7..5564e4a47d 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -131,6 +131,29 @@ func freemcache(c *mcache, recipient *mcache) {
 	})
 }
 
+// getMCache is a convenience function which tries to obtain an mcache.
+//
+// Must be running with a P when called (so the caller must be in a
+// non-preemptible state) or must be called during bootstrapping.
+func getMCache() *mcache {
+	// Grab the mcache, since that's where stats live.
+	pp := getg().m.p.ptr()
+	var c *mcache
+	if pp == nil {
+		// We will be called without a P while bootstrapping,
+		// in which case we use mcache0, which is set in mallocinit.
+		// mcache0 is cleared when bootstrapping is complete,
+		// by procresize.
+		c = mcache0
+		if c == nil {
+			throw("getMCache called with no P or outside bootstrapping")
+		}
+	} else {
+		c = pp.mcache
+	}
+	return c
+}
+
 // donate flushes data and resources which have no global
 // pool to another mcache.
 func (c *mcache) donate(d *mcache) {
-- 
cgit v1.3


From 79781e8dd382ac34e502ed6a088dff6860a08c05 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Tue, 4 Aug 2020 17:29:03 +0000
Subject: runtime: move malloc stats into consistentHeapStats

This change moves the mcache-local malloc stats into the
consistentHeapStats structure so the malloc stats can be managed
consistently with the memory stats. The one exception here is
tinyAllocs for which moving that into the global stats would incur
several atomic writes on the fast path. Microbenchmarks for just one CPU
core have shown a 50% loss in throughput. Since tiny allocation counnt
isn't exposed anyway and is always blindly added to both allocs and
frees, let that stay inconsistent and flush the tiny allocation count
every so often.

Change-Id: I2a4b75f209c0e659b9c0db081a3287bf227c10ca
Reviewed-on: https://go-review.googlesource.com/c/go/+/247039
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/export_test.go | 37 ++++++++--------------
 src/runtime/malloc.go      |  2 +-
 src/runtime/mcache.go      | 70 ++++++++++++++---------------------------
 src/runtime/mgcsweep.go    | 10 ++++--
 src/runtime/mstats.go      | 78 ++++++++++++++++++++++++++--------------------
 src/runtime/proc.go        |  2 +-
 6 files changed, 90 insertions(+), 109 deletions(-)

(limited to 'src/runtime/malloc.go')

diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index cb753ee819..ff901fd7be 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -337,33 +337,22 @@ func ReadMemStatsSlow() (base, slow MemStats) {
 			}
 		}
 
-		// Add in frees. readmemstats_m flushed the cached stats, so
-		// these are up-to-date.
-		var tinyAllocs, largeFree, smallFree uint64
-		for _, p := range allp {
-			c := p.mcache
-			if c == nil {
-				continue
-			}
-			// Collect large allocation stats.
-			largeFree += uint64(c.largeFree)
-			slow.Frees += uint64(c.largeFreeCount)
-
-			// Collect tiny allocation stats.
-			tinyAllocs += uint64(c.tinyAllocCount)
-
-			// Collect per-sizeclass stats.
-			for i := 0; i < _NumSizeClasses; i++ {
-				slow.Frees += uint64(c.smallFreeCount[i])
-				bySize[i].Frees += uint64(c.smallFreeCount[i])
-				bySize[i].Mallocs += uint64(c.smallFreeCount[i])
-				smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i])
-			}
+		// Add in frees by just reading the stats for those directly.
+		var m heapStatsDelta
+		memstats.heapStats.unsafeRead(&m)
+
+		// Collect per-sizeclass free stats.
+		var smallFree uint64
+		for i := 0; i < _NumSizeClasses; i++ {
+			slow.Frees += uint64(m.smallFreeCount[i])
+			bySize[i].Frees += uint64(m.smallFreeCount[i])
+			bySize[i].Mallocs += uint64(m.smallFreeCount[i])
+			smallFree += uint64(m.smallFreeCount[i]) * uint64(class_to_size[i])
 		}
-		slow.Frees += tinyAllocs
+		slow.Frees += memstats.tinyallocs + uint64(m.largeFreeCount)
 		slow.Mallocs += slow.Frees
 
-		slow.TotalAlloc = slow.Alloc + largeFree + smallFree
+		slow.TotalAlloc = slow.Alloc + uint64(m.largeFree) + smallFree
 
 		for i := range slow.BySize {
 			slow.BySize[i].Mallocs = bySize[i].Mallocs
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 6383c34817..d0b8c668c3 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -1028,7 +1028,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 				// The object fits into existing tiny block.
 				x = unsafe.Pointer(c.tiny + off)
 				c.tinyoffset = off + size
-				c.tinyAllocCount++
+				c.tinyAllocs++
 				mp.mallocing = 0
 				releasem(mp)
 				return x
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index e27a1c9ec0..c9342a41c9 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -32,8 +32,12 @@ type mcache struct {
 	// tiny is a heap pointer. Since mcache is in non-GC'd memory,
 	// we handle it by clearing it in releaseAll during mark
 	// termination.
+	//
+	// tinyAllocs is the number of tiny allocations performed
+	// by the P that owns this mcache.
 	tiny       uintptr
 	tinyoffset uintptr
+	tinyAllocs uintptr
 
 	// The rest is not accessed on every malloc.
 
@@ -41,21 +45,6 @@ type mcache struct {
 
 	stackcache [_NumStackOrders]stackfreelist
 
-	// Allocator stats (source-of-truth).
-	// Only the P that owns this mcache may write to these
-	// variables, so it's safe for that P to read non-atomically.
-	//
-	// When read with stats from other mcaches and with the world
-	// stopped, the result will accurately reflect the state of the
-	// application.
-	tinyAllocCount  uintptr                  // number of tiny allocs not counted in other stats
-	largeAlloc      uintptr                  // bytes allocated for large objects
-	largeAllocCount uintptr                  // number of large object allocations
-	smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects
-	largeFree       uintptr                  // bytes freed for large objects (>maxSmallSize)
-	largeFreeCount  uintptr                  // number of frees for large objects (>maxSmallSize)
-	smallFreeCount  [_NumSizeClasses]uintptr // number of frees for small objects (<=maxSmallSize)
-
 	// flushGen indicates the sweepgen during which this mcache
 	// was last flushed. If flushGen != mheap_.sweepgen, the spans
 	// in this mcache are stale and need to the flushed so they
@@ -117,7 +106,7 @@ func allocmcache() *mcache {
 // In some cases there is no way to simply release
 // resources, such as statistics, so donate them to
 // a different mcache (the recipient).
-func freemcache(c *mcache, recipient *mcache) {
+func freemcache(c *mcache) {
 	systemstack(func() {
 		c.releaseAll()
 		stackcache_clear(c)
@@ -128,8 +117,6 @@ func freemcache(c *mcache, recipient *mcache) {
 		// gcworkbuffree(c.gcworkbuf)
 
 		lock(&mheap_.lock)
-		// Donate anything else that's left.
-		c.donate(recipient)
 		mheap_.cachealloc.free(unsafe.Pointer(c))
 		unlock(&mheap_.lock)
 	})
@@ -158,31 +145,6 @@ func getMCache() *mcache {
 	return c
 }
 
-// donate flushes data and resources which have no global
-// pool to another mcache.
-func (c *mcache) donate(d *mcache) {
-	// scanAlloc is handled separately because it's not
-	// like these stats -- it's used for GC pacing.
-	d.largeAlloc += c.largeAlloc
-	c.largeAlloc = 0
-	d.largeAllocCount += c.largeAllocCount
-	c.largeAllocCount = 0
-	for i := range c.smallAllocCount {
-		d.smallAllocCount[i] += c.smallAllocCount[i]
-		c.smallAllocCount[i] = 0
-	}
-	d.largeFree += c.largeFree
-	c.largeFree = 0
-	d.largeFreeCount += c.largeFreeCount
-	c.largeFreeCount = 0
-	for i := range c.smallFreeCount {
-		d.smallFreeCount[i] += c.smallFreeCount[i]
-		c.smallFreeCount[i] = 0
-	}
-	d.tinyAllocCount += c.tinyAllocCount
-	c.tinyAllocCount = 0
-}
-
 // refill acquires a new span of span class spc for c. This span will
 // have at least one free object. The current span in c must be full.
 //
@@ -219,12 +181,20 @@ func (c *mcache) refill(spc spanClass) {
 
 	// Assume all objects from this span will be allocated in the
 	// mcache. If it gets uncached, we'll adjust this.
-	c.smallAllocCount[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount)
+	stats := memstats.heapStats.acquire(c)
+	atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], uintptr(s.nelems)-uintptr(s.allocCount))
+	memstats.heapStats.release(c)
 
 	// Update heap_live with the same assumption.
 	usedBytes := uintptr(s.allocCount) * s.elemsize
 	atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes))
 
+	// Flush tinyAllocs.
+	if spc == tinySpanClass {
+		atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs))
+		c.tinyAllocs = 0
+	}
+
 	// While we're here, flush scanAlloc, since we have to call
 	// revise anyway.
 	atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
@@ -262,8 +232,10 @@ func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
 	if s == nil {
 		throw("out of memory")
 	}
-	c.largeAlloc += npages * pageSize
-	c.largeAllocCount++
+	stats := memstats.heapStats.acquire(c)
+	atomic.Xadduintptr(&stats.largeAlloc, npages*pageSize)
+	atomic.Xadduintptr(&stats.largeAllocCount, 1)
+	memstats.heapStats.release(c)
 
 	// Update heap_live and revise pacing if needed.
 	atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
@@ -294,7 +266,9 @@ func (c *mcache) releaseAll() {
 		if s != &emptymspan {
 			// Adjust nsmallalloc in case the span wasn't fully allocated.
 			n := uintptr(s.nelems) - uintptr(s.allocCount)
-			c.smallAllocCount[spanClass(i).sizeclass()] -= n
+			stats := memstats.heapStats.acquire(c)
+			atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], -n)
+			memstats.heapStats.release(c)
 			if s.sweepgen != sg+1 {
 				// refill conservatively counted unallocated slots in heap_live.
 				// Undo this.
@@ -313,6 +287,8 @@ func (c *mcache) releaseAll() {
 	// Clear tinyalloc pool.
 	c.tiny = 0
 	c.tinyoffset = 0
+	atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs))
+	c.tinyAllocs = 0
 
 	// Updated heap_scan and possible heap_live.
 	if gcBlackenEnabled != 0 {
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index 7103b08455..9b77ce635c 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -503,7 +503,9 @@ func (s *mspan) sweep(preserve bool) bool {
 			// wasn't totally filled, but then swept, still has all of its
 			// free slots zeroed.
 			s.needzero = 1
-			c.smallFreeCount[spc.sizeclass()] += uintptr(nfreed)
+			stats := memstats.heapStats.acquire(c)
+			atomic.Xadduintptr(&stats.smallFreeCount[spc.sizeclass()], uintptr(nfreed))
+			memstats.heapStats.release(c)
 		}
 		if !preserve {
 			// The caller may not have removed this span from whatever
@@ -548,8 +550,10 @@ func (s *mspan) sweep(preserve bool) bool {
 			} else {
 				mheap_.freeSpan(s)
 			}
-			c.largeFreeCount++
-			c.largeFree += size
+			stats := memstats.heapStats.acquire(c)
+			atomic.Xadduintptr(&stats.largeFreeCount, 1)
+			atomic.Xadduintptr(&stats.largeFree, size)
+			memstats.heapStats.release(c)
 			return true
 		}
 
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 4363eff1e0..a8eca85fe6 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -612,48 +612,36 @@ func updatememstats() {
 	memstats.total_alloc = 0
 	memstats.nmalloc = 0
 	memstats.nfree = 0
-	memstats.tinyallocs = 0
 	for i := 0; i < len(memstats.by_size); i++ {
 		memstats.by_size[i].nmalloc = 0
 		memstats.by_size[i].nfree = 0
 	}
-
-	// Collect allocation stats. This is safe and consistent
-	// because the world is stopped.
-	var smallFree, totalAlloc, totalFree uint64
-	for _, p := range allp {
-		c := p.mcache
-		if c == nil {
-			continue
-		}
-		// Collect large allocation stats.
-		memstats.nmalloc += uint64(c.largeAllocCount)
-		totalAlloc += uint64(c.largeAlloc)
-		totalFree += uint64(c.largeFree)
-		memstats.nfree += uint64(c.largeFreeCount)
-
-		// Collect tiny allocation stats.
-		memstats.tinyallocs += uint64(c.tinyAllocCount)
-
-		// Collect per-sizeclass stats.
-		for i := 0; i < _NumSizeClasses; i++ {
-			// Malloc stats.
-			memstats.nmalloc += uint64(c.smallAllocCount[i])
-			memstats.by_size[i].nmalloc += uint64(c.smallAllocCount[i])
-			totalAlloc += uint64(c.smallAllocCount[i]) * uint64(class_to_size[i])
-
-			// Free stats.
-			memstats.nfree += uint64(c.smallFreeCount[i])
-			memstats.by_size[i].nfree += uint64(c.smallFreeCount[i])
-			smallFree += uint64(c.smallFreeCount[i]) * uint64(class_to_size[i])
-		}
-	}
 	// Collect consistent stats, which are the source-of-truth in the some cases.
 	var consStats heapStatsDelta
 	memstats.heapStats.unsafeRead(&consStats)
 
-	totalFree += smallFree
+	// Collect large allocation stats.
+	totalAlloc := uint64(consStats.largeAlloc)
+	memstats.nmalloc += uint64(consStats.largeAllocCount)
+	totalFree := uint64(consStats.largeFree)
+	memstats.nfree += uint64(consStats.largeFreeCount)
+
+	// Collect per-sizeclass stats.
+	for i := 0; i < _NumSizeClasses; i++ {
+		// Malloc stats.
+		a := uint64(consStats.smallAllocCount[i])
+		totalAlloc += a * uint64(class_to_size[i])
+		memstats.nmalloc += a
+		memstats.by_size[i].nmalloc = a
+
+		// Free stats.
+		f := uint64(consStats.smallFreeCount[i])
+		totalFree += f * uint64(class_to_size[i])
+		memstats.nfree += f
+		memstats.by_size[i].nfree = f
+	}
 
+	// Account for tiny allocations.
 	memstats.nfree += memstats.tinyallocs
 	memstats.nmalloc += memstats.tinyallocs
 
@@ -752,12 +740,25 @@ func (s *sysMemStat) add(n int64) {
 // that need to be updated together in order for them to be kept
 // consistent with one another.
 type heapStatsDelta struct {
+	// Memory stats.
 	committed       int64 // byte delta of memory committed
 	released        int64 // byte delta of released memory generated
 	inHeap          int64 // byte delta of memory placed in the heap
 	inStacks        int64 // byte delta of memory reserved for stacks
 	inWorkBufs      int64 // byte delta of memory reserved for work bufs
 	inPtrScalarBits int64 // byte delta of memory reserved for unrolled GC prog bits
+
+	// Allocator stats.
+	largeAlloc      uintptr                  // bytes allocated for large objects
+	largeAllocCount uintptr                  // number of large object allocations
+	smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects
+	largeFree       uintptr                  // bytes freed for large objects (>maxSmallSize)
+	largeFreeCount  uintptr                  // number of frees for large objects (>maxSmallSize)
+	smallFreeCount  [_NumSizeClasses]uintptr // number of frees for small objects (<=maxSmallSize)
+
+	// Add a uint32 to ensure this struct is a multiple of 8 bytes in size.
+	// Only necessary on 32-bit platforms.
+	// _ [(sys.PtrSize / 4) % 2]uint32
 }
 
 // merge adds in the deltas from b into a.
@@ -768,6 +769,17 @@ func (a *heapStatsDelta) merge(b *heapStatsDelta) {
 	a.inStacks += b.inStacks
 	a.inWorkBufs += b.inWorkBufs
 	a.inPtrScalarBits += b.inPtrScalarBits
+
+	a.largeAlloc += b.largeAlloc
+	a.largeAllocCount += b.largeAllocCount
+	for i := range b.smallAllocCount {
+		a.smallAllocCount[i] += b.smallAllocCount[i]
+	}
+	a.largeFree += b.largeFree
+	a.largeFreeCount += b.largeFreeCount
+	for i := range b.smallFreeCount {
+		a.smallFreeCount[i] += b.smallFreeCount[i]
+	}
 }
 
 // consistentHeapStats represents a set of various memory statistics
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 4f4cff38aa..ebecc92745 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -4550,7 +4550,7 @@ func (pp *p) destroy() {
 		pp.mspancache.len = 0
 		pp.pcache.flush(&mheap_.pages)
 	})
-	freemcache(pp.mcache, allp[0].mcache)
+	freemcache(pp.mcache)
 	pp.mcache = nil
 	gfpurge(pp)
 	traceProcFree(pp)
-- 
cgit v1.3


From f7e26467b4e7ee0bb3219c26e71292ff4aac7da9 Mon Sep 17 00:00:00 2001
From: Cherry Zhang <cherryyz@google.com>
Date: Thu, 29 Oct 2020 15:50:53 -0400
Subject: runtime: allocate at desired address when race detector is on

Currently, on all supported platforms, the race detector (LLVM
TSAN) expects the Go heap is at 0xc000000000 - 0xe000000000.
Move the raceenabled condition first, so we always allocate
there.

This means on Linux/ARM64 when race detector is on we will
allocate to 0xc000000000 - 0xe000000000, instead of 0x4000000000.
The old address is meant for 39-bit VMA. But the race detector
only supports 48-bit VMA anyway. So this is fine.

Change-Id: I51ac8eff68297b37c8c651a93145cc94f83a939d
Reviewed-on: https://go-review.googlesource.com/c/go/+/266372
Trust: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
---
 src/runtime/malloc.go | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'src/runtime/malloc.go')

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index d0b8c668c3..0563f49d17 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -521,6 +521,14 @@ func mallocinit() {
 		for i := 0x7f; i >= 0; i-- {
 			var p uintptr
 			switch {
+			case raceenabled:
+				// The TSAN runtime requires the heap
+				// to be in the range [0x00c000000000,
+				// 0x00e000000000).
+				p = uintptr(i)<<32 | uintptrMask&(0x00c0<<32)
+				if p >= uintptrMask&0x00e000000000 {
+					continue
+				}
 			case GOARCH == "arm64" && GOOS == "ios":
 				p = uintptr(i)<<40 | uintptrMask&(0x0013<<28)
 			case GOARCH == "arm64":
@@ -532,14 +540,6 @@ func mallocinit() {
 					continue
 				}
 				p = uintptr(i)<<40 | uintptrMask&(0xa0<<52)
-			case raceenabled:
-				// The TSAN runtime requires the heap
-				// to be in the range [0x00c000000000,
-				// 0x00e000000000).
-				p = uintptr(i)<<32 | uintptrMask&(0x00c0<<32)
-				if p >= uintptrMask&0x00e000000000 {
-					continue
-				}
 			default:
 				p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
 			}
-- 
cgit v1.3


From 9393b5bae5944acebed3ab6f995926b7de3ce429 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Fri, 21 Aug 2020 11:59:55 -0400
Subject: runtime: add heap lock assertions

Some functions that required holding the heap lock _or_ world stop have
been simplified to simply requiring the heap lock. This is conceptually
simpler and taking the heap lock during world stop is guaranteed to not
contend. This was only done on functions already called on the
systemstack to avoid too many extra systemstack calls in GC.

Updates #40677

Change-Id: I15aa1dadcdd1a81aac3d2a9ecad6e7d0377befdc
Reviewed-on: https://go-review.googlesource.com/c/go/+/250262
Run-TryBot: Michael Pratt <mpratt@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
Trust: Michael Pratt <mpratt@google.com>
---
 src/runtime/export_test.go | 61 +++++++++++++++++++++++++++++++++++++++++-----
 src/runtime/malloc.go      |  2 ++
 src/runtime/mgc.go         |  4 +++
 src/runtime/mgcscavenge.go | 18 ++++++++++++++
 src/runtime/mheap.go       | 29 +++++++++++++++++-----
 src/runtime/mpagealloc.go  | 22 +++++++++++++++++
 src/runtime/mpagecache.go  | 14 ++++++++++-
 src/runtime/proc.go        |  2 ++
 8 files changed, 139 insertions(+), 13 deletions(-)

(limited to 'src/runtime/malloc.go')

diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index 4ca0420d2a..44551dcaf1 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -743,7 +743,16 @@ func (c *PageCache) Alloc(npages uintptr) (uintptr, uintptr) {
 	return (*pageCache)(c).alloc(npages)
 }
 func (c *PageCache) Flush(s *PageAlloc) {
-	(*pageCache)(c).flush((*pageAlloc)(s))
+	cp := (*pageCache)(c)
+	sp := (*pageAlloc)(s)
+
+	systemstack(func() {
+		// None of the tests need any higher-level locking, so we just
+		// take the lock internally.
+		lock(sp.mheapLock)
+		cp.flush(sp)
+		unlock(sp.mheapLock)
+	})
 }
 
 // Expose chunk index type.
@@ -754,13 +763,41 @@ type ChunkIdx chunkIdx
 type PageAlloc pageAlloc
 
 func (p *PageAlloc) Alloc(npages uintptr) (uintptr, uintptr) {
-	return (*pageAlloc)(p).alloc(npages)
+	pp := (*pageAlloc)(p)
+
+	var addr, scav uintptr
+	systemstack(func() {
+		// None of the tests need any higher-level locking, so we just
+		// take the lock internally.
+		lock(pp.mheapLock)
+		addr, scav = pp.alloc(npages)
+		unlock(pp.mheapLock)
+	})
+	return addr, scav
 }
 func (p *PageAlloc) AllocToCache() PageCache {
-	return PageCache((*pageAlloc)(p).allocToCache())
+	pp := (*pageAlloc)(p)
+
+	var c PageCache
+	systemstack(func() {
+		// None of the tests need any higher-level locking, so we just
+		// take the lock internally.
+		lock(pp.mheapLock)
+		c = PageCache(pp.allocToCache())
+		unlock(pp.mheapLock)
+	})
+	return c
 }
 func (p *PageAlloc) Free(base, npages uintptr) {
-	(*pageAlloc)(p).free(base, npages)
+	pp := (*pageAlloc)(p)
+
+	systemstack(func() {
+		// None of the tests need any higher-level locking, so we just
+		// take the lock internally.
+		lock(pp.mheapLock)
+		pp.free(base, npages)
+		unlock(pp.mheapLock)
+	})
 }
 func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) {
 	return ChunkIdx((*pageAlloc)(p).start), ChunkIdx((*pageAlloc)(p).end)
@@ -768,6 +805,8 @@ func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) {
 func (p *PageAlloc) Scavenge(nbytes uintptr, mayUnlock bool) (r uintptr) {
 	pp := (*pageAlloc)(p)
 	systemstack(func() {
+		// None of the tests need any higher-level locking, so we just
+		// take the lock internally.
 		lock(pp.mheapLock)
 		r = pp.scavenge(nbytes, mayUnlock)
 		unlock(pp.mheapLock)
@@ -926,7 +965,11 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc {
 		addr := chunkBase(chunkIdx(i))
 
 		// Mark the chunk's existence in the pageAlloc.
-		p.grow(addr, pallocChunkBytes)
+		systemstack(func() {
+			lock(p.mheapLock)
+			p.grow(addr, pallocChunkBytes)
+			unlock(p.mheapLock)
+		})
 
 		// Initialize the bitmap and update pageAlloc metadata.
 		chunk := p.chunkOf(chunkIndex(addr))
@@ -957,13 +1000,19 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc {
 		}
 
 		// Update heap metadata for the allocRange calls above.
-		p.update(addr, pallocChunkPages, false, false)
+		systemstack(func() {
+			lock(p.mheapLock)
+			p.update(addr, pallocChunkPages, false, false)
+			unlock(p.mheapLock)
+		})
 	}
+
 	systemstack(func() {
 		lock(p.mheapLock)
 		p.scavengeStartGen()
 		unlock(p.mheapLock)
 	})
+
 	return (*PageAlloc)(p)
 }
 
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 0563f49d17..4b798d129c 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -627,6 +627,8 @@ func mallocinit() {
 //
 // h must be locked.
 func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
+	assertLockHeld(&h.lock)
+
 	n = alignUp(n, heapArenaBytes)
 
 	// First, try the arena pre-reservation.
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index fb3c149942..185d3201ca 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -821,6 +821,8 @@ func pollFractionalWorkerExit() bool {
 //
 // mheap_.lock must be held or the world must be stopped.
 func gcSetTriggerRatio(triggerRatio float64) {
+	assertWorldStoppedOrLockHeld(&mheap_.lock)
+
 	// Compute the next GC goal, which is when the allocated heap
 	// has grown by GOGC/100 over the heap marked by the last
 	// cycle.
@@ -960,6 +962,8 @@ func gcSetTriggerRatio(triggerRatio float64) {
 //
 // mheap_.lock must be held or the world must be stopped.
 func gcEffectiveGrowthRatio() float64 {
+	assertWorldStoppedOrLockHeld(&mheap_.lock)
+
 	egogc := float64(atomic.Load64(&memstats.next_gc)-memstats.heap_marked) / float64(memstats.heap_marked)
 	if egogc < 0 {
 		// Shouldn't happen, but just in case.
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index 5843ada981..a242577bd9 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -397,6 +397,8 @@ func bgscavenge(c chan int) {
 //
 //go:systemstack
 func (p *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
+	assertLockHeld(p.mheapLock)
+
 	var (
 		addrs addrRange
 		gen   uint32
@@ -446,6 +448,8 @@ func printScavTrace(gen uint32, released uintptr, forced bool) {
 //
 //go:systemstack
 func (p *pageAlloc) scavengeStartGen() {
+	assertLockHeld(p.mheapLock)
+
 	if debug.scavtrace > 0 {
 		printScavTrace(p.scav.gen, p.scav.released, false)
 	}
@@ -495,6 +499,8 @@ func (p *pageAlloc) scavengeStartGen() {
 //
 //go:systemstack
 func (p *pageAlloc) scavengeReserve() (addrRange, uint32) {
+	assertLockHeld(p.mheapLock)
+
 	// Start by reserving the minimum.
 	r := p.scav.inUse.removeLast(p.scav.reservationBytes)
 
@@ -525,6 +531,8 @@ func (p *pageAlloc) scavengeReserve() (addrRange, uint32) {
 //
 //go:systemstack
 func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
+	assertLockHeld(p.mheapLock)
+
 	if r.size() == 0 || gen != p.scav.gen {
 		return
 	}
@@ -552,6 +560,8 @@ func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
 //
 //go:systemstack
 func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) {
+	assertLockHeld(p.mheapLock)
+
 	// Defensively check if we've recieved an empty address range.
 	// If so, just return.
 	if work.size() == 0 {
@@ -610,6 +620,8 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
 		// If we found something, scavenge it and return!
 		if npages != 0 {
 			work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)}
+
+			assertLockHeld(p.mheapLock) // Must be locked on return.
 			return uintptr(npages) * pageSize, work
 		}
 	}
@@ -674,12 +686,16 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
 		base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages)
 		if npages > 0 {
 			work.limit = offAddr{p.scavengeRangeLocked(candidateChunkIdx, base, npages)}
+
+			assertLockHeld(p.mheapLock) // Must be locked on return.
 			return uintptr(npages) * pageSize, work
 		}
 
 		// We were fooled, so let's continue from where we left off.
 		work.limit = offAddr{chunkBase(candidateChunkIdx)}
 	}
+
+	assertLockHeld(p.mheapLock) // Must be locked on return.
 	return 0, work
 }
 
@@ -692,6 +708,8 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
 //
 // p.mheapLock must be held.
 func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr {
+	assertLockHeld(p.mheapLock)
+
 	p.chunkOf(ci).scavenged.setRange(base, npages)
 
 	// Compute the full address for the start of the range.
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 14a73c0491..66a59cb999 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -483,10 +483,15 @@ func (s *mspan) layout() (size, n, total uintptr) {
 // indirect call from the fixalloc initializer, the compiler can't see
 // this.
 //
+// The heap lock must be held.
+//
 //go:nowritebarrierrec
 func recordspan(vh unsafe.Pointer, p unsafe.Pointer) {
 	h := (*mheap)(vh)
 	s := (*mspan)(p)
+
+	assertLockHeld(&h.lock)
+
 	if len(h.allspans) >= cap(h.allspans) {
 		n := 64 * 1024 / sys.PtrSize
 		if n < cap(h.allspans)*3/2 {
@@ -721,7 +726,7 @@ func (h *mheap) init() {
 //
 // reclaim implements the page-reclaimer half of the sweeper.
 //
-// h must NOT be locked.
+// h.lock must NOT be held.
 func (h *mheap) reclaim(npage uintptr) {
 	// TODO(austin): Half of the time spent freeing spans is in
 	// locking/unlocking the heap (even with low contention). We
@@ -804,6 +809,8 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
 	// In particular, if a span were freed and merged concurrently
 	// with this probing heapArena.spans, it would be possible to
 	// observe arbitrary, stale span pointers.
+	assertLockHeld(&h.lock)
+
 	n0 := n
 	var nFreed uintptr
 	sg := h.sweepgen
@@ -858,6 +865,8 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
 		traceGCSweepSpan((n0 - nFreed) * pageSize)
 		lock(&h.lock)
 	}
+
+	assertLockHeld(&h.lock) // Must be locked on return.
 	return nFreed
 }
 
@@ -1011,7 +1020,7 @@ func (h *mheap) allocNeedsZero(base, npage uintptr) (needZero bool) {
 // tryAllocMSpan attempts to allocate an mspan object from
 // the P-local cache, but may fail.
 //
-// h need not be locked.
+// h.lock need not be held.
 //
 // This caller must ensure that its P won't change underneath
 // it during this function. Currently to ensure that we enforce
@@ -1035,7 +1044,7 @@ func (h *mheap) tryAllocMSpan() *mspan {
 
 // allocMSpanLocked allocates an mspan object.
 //
-// h must be locked.
+// h.lock must be held.
 //
 // allocMSpanLocked must be called on the system stack because
 // its caller holds the heap lock. See mheap for details.
@@ -1044,6 +1053,8 @@ func (h *mheap) tryAllocMSpan() *mspan {
 //
 //go:systemstack
 func (h *mheap) allocMSpanLocked() *mspan {
+	assertLockHeld(&h.lock)
+
 	pp := getg().m.p.ptr()
 	if pp == nil {
 		// We don't have a p so just do the normal thing.
@@ -1065,7 +1076,7 @@ func (h *mheap) allocMSpanLocked() *mspan {
 
 // freeMSpanLocked free an mspan object.
 //
-// h must be locked.
+// h.lock must be held.
 //
 // freeMSpanLocked must be called on the system stack because
 // its caller holds the heap lock. See mheap for details.
@@ -1074,6 +1085,8 @@ func (h *mheap) allocMSpanLocked() *mspan {
 //
 //go:systemstack
 func (h *mheap) freeMSpanLocked(s *mspan) {
+	assertLockHeld(&h.lock)
+
 	pp := getg().m.p.ptr()
 	// First try to free the mspan directly to the cache.
 	if pp != nil && pp.mspancache.len < len(pp.mspancache.buf) {
@@ -1097,7 +1110,7 @@ func (h *mheap) freeMSpanLocked(s *mspan) {
 //
 // The returned span is fully initialized.
 //
-// h must not be locked.
+// h.lock must not be held.
 //
 // allocSpan must be called on the system stack both because it acquires
 // the heap lock and because it must block GC transitions.
@@ -1281,8 +1294,10 @@ HaveSpan:
 // Try to add at least npage pages of memory to the heap,
 // returning whether it worked.
 //
-// h must be locked.
+// h.lock must be held.
 func (h *mheap) grow(npage uintptr) bool {
+	assertLockHeld(&h.lock)
+
 	// We must grow the heap in whole palloc chunks.
 	ask := alignUp(npage, pallocChunkPages) * pageSize
 
@@ -1391,6 +1406,8 @@ func (h *mheap) freeManual(s *mspan, typ spanAllocType) {
 }
 
 func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) {
+	assertLockHeld(&h.lock)
+
 	switch s.state.get() {
 	case mSpanManual:
 		if s.allocCount != 0 {
diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go
index 2af1c97e0b..dac1f39969 100644
--- a/src/runtime/mpagealloc.go
+++ b/src/runtime/mpagealloc.go
@@ -349,6 +349,8 @@ func (p *pageAlloc) chunkOf(ci chunkIdx) *pallocData {
 //
 // p.mheapLock must be held.
 func (p *pageAlloc) grow(base, size uintptr) {
+	assertLockHeld(p.mheapLock)
+
 	// Round up to chunks, since we can't deal with increments smaller
 	// than chunks. Also, sysGrow expects aligned values.
 	limit := alignUp(base+size, pallocChunkBytes)
@@ -413,6 +415,8 @@ func (p *pageAlloc) grow(base, size uintptr) {
 //
 // p.mheapLock must be held.
 func (p *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
+	assertLockHeld(p.mheapLock)
+
 	// base, limit, start, and end are inclusive.
 	limit := base + npages*pageSize - 1
 	sc, ec := chunkIndex(base), chunkIndex(limit)
@@ -499,6 +503,8 @@ func (p *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
 //
 // p.mheapLock must be held.
 func (p *pageAlloc) allocRange(base, npages uintptr) uintptr {
+	assertLockHeld(p.mheapLock)
+
 	limit := base + npages*pageSize - 1
 	sc, ec := chunkIndex(base), chunkIndex(limit)
 	si, ei := chunkPageIndex(base), chunkPageIndex(limit)
@@ -534,6 +540,8 @@ func (p *pageAlloc) allocRange(base, npages uintptr) uintptr {
 //
 // p.mheapLock must be held.
 func (p *pageAlloc) findMappedAddr(addr offAddr) offAddr {
+	assertLockHeld(p.mheapLock)
+
 	// If we're not in a test, validate first by checking mheap_.arenas.
 	// This is a fast path which is only safe to use outside of testing.
 	ai := arenaIndex(addr.addr())
@@ -568,6 +576,8 @@ func (p *pageAlloc) findMappedAddr(addr offAddr) offAddr {
 //
 // p.mheapLock must be held.
 func (p *pageAlloc) find(npages uintptr) (uintptr, offAddr) {
+	assertLockHeld(p.mheapLock)
+
 	// Search algorithm.
 	//
 	// This algorithm walks each level l of the radix tree from the root level
@@ -786,7 +796,13 @@ nextLevel:
 // should be ignored.
 //
 // p.mheapLock must be held.
+//
+// Must run on the system stack because p.mheapLock must be held.
+//
+//go:systemstack
 func (p *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) {
+	assertLockHeld(p.mheapLock)
+
 	// If the searchAddr refers to a region which has a higher address than
 	// any known chunk, then we know we're out of memory.
 	if chunkIndex(p.searchAddr.addr()) >= p.end {
@@ -841,7 +857,13 @@ Found:
 // free returns npages worth of memory starting at base back to the page heap.
 //
 // p.mheapLock must be held.
+//
+// Must run on the system stack because p.mheapLock must be held.
+//
+//go:systemstack
 func (p *pageAlloc) free(base, npages uintptr) {
+	assertLockHeld(p.mheapLock)
+
 	// If we're freeing pages below the p.searchAddr, update searchAddr.
 	if b := (offAddr{base}); b.lessThan(p.searchAddr) {
 		p.searchAddr = b
diff --git a/src/runtime/mpagecache.go b/src/runtime/mpagecache.go
index 5f76501a1c..4b5c66d8d6 100644
--- a/src/runtime/mpagecache.go
+++ b/src/runtime/mpagecache.go
@@ -71,8 +71,14 @@ func (c *pageCache) allocN(npages uintptr) (uintptr, uintptr) {
 // into s. Then, it clears the cache, such that empty returns
 // true.
 //
-// p.mheapLock must be held or the world must be stopped.
+// p.mheapLock must be held.
+//
+// Must run on the system stack because p.mheapLock must be held.
+//
+//go:systemstack
 func (c *pageCache) flush(p *pageAlloc) {
+	assertLockHeld(p.mheapLock)
+
 	if c.empty() {
 		return
 	}
@@ -103,7 +109,13 @@ func (c *pageCache) flush(p *pageAlloc) {
 // chunk.
 //
 // p.mheapLock must be held.
+//
+// Must run on the system stack because p.mheapLock must be held.
+//
+//go:systemstack
 func (p *pageAlloc) allocToCache() pageCache {
+	assertLockHeld(p.mheapLock)
+
 	// If the searchAddr refers to a region which has a higher address than
 	// any known chunk, then we know we're out of memory.
 	if chunkIndex(p.searchAddr.addr()) >= p.end {
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 82284e6cd6..ced27ceb3a 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -4603,7 +4603,9 @@ func (pp *p) destroy() {
 			mheap_.spanalloc.free(unsafe.Pointer(pp.mspancache.buf[i]))
 		}
 		pp.mspancache.len = 0
+		lock(&mheap_.lock)
 		pp.pcache.flush(&mheap_.pages)
+		unlock(&mheap_.lock)
 	})
 	freemcache(pp.mcache)
 	pp.mcache = nil
-- 
cgit v1.3


From ac766e37182f36cd0a3247e44a4143d2d2132e42 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Mon, 2 Nov 2020 16:58:38 +0000
Subject: runtime: make getMCache inlineable

This change moves the responsibility of throwing if an mcache is not
available to the caller, because the inlining cost of throw is set very
high in the compiler. Even if it was reduced down to the cost of a usual
function call, it would still be too expensive, so just move it out.

This choice also makes sense in the context of #42339 since we're going
to have to handle the case where we don't have an mcache to update stats
in a few contexts anyhow.

Also, add getMCache to the list of functions that should be inlined to
prevent future regressions.

getMCache is called on the allocation fast path and because its not
inlined actually causes a significant regression (~10%) in some
microbenchmarks.

Fixes #42305.

Change-Id: I64ac5e4f26b730bd4435ea1069a4a50f55411ced
Reviewed-on: https://go-review.googlesource.com/c/go/+/267157
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
---
 src/cmd/compile/internal/gc/inl_test.go |  1 +
 src/runtime/malloc.go                   |  9 ++++++++-
 src/runtime/mcache.go                   |  7 ++-----
 src/runtime/mgcscavenge.go              |  3 +++
 src/runtime/mheap.go                    | 12 ++++++++++++
 5 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'src/runtime/malloc.go')

diff --git a/src/cmd/compile/internal/gc/inl_test.go b/src/cmd/compile/internal/gc/inl_test.go
index afa6b98315..02735e50fb 100644
--- a/src/cmd/compile/internal/gc/inl_test.go
+++ b/src/cmd/compile/internal/gc/inl_test.go
@@ -51,6 +51,7 @@ func TestIntendedInlining(t *testing.T) {
 			"funcPC",
 			"getArgInfoFast",
 			"getm",
+			"getMCache",
 			"isDirectIface",
 			"itabHashFunc",
 			"noescape",
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 4b798d129c..551acd0796 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -975,6 +975,9 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 	shouldhelpgc := false
 	dataSize := size
 	c := getMCache()
+	if c == nil {
+		throw("mallocgc called without a P or outside bootstrapping")
+	}
 	var span *mspan
 	var x unsafe.Pointer
 	noscan := typ == nil || typ.ptrdata == 0
@@ -1202,7 +1205,11 @@ func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer {
 }
 
 func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
-	getMCache().nextSample = nextSample()
+	c := getMCache()
+	if c == nil {
+		throw("profilealloc called without a P or outside bootstrapping")
+	}
+	c.nextSample = nextSample()
 	mProf_Malloc(x, size)
 }
 
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index c9342a41c9..847a5dedf3 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -124,8 +124,8 @@ func freemcache(c *mcache) {
 
 // getMCache is a convenience function which tries to obtain an mcache.
 //
-// Must be running with a P when called (so the caller must be in a
-// non-preemptible state) or must be called during bootstrapping.
+// Returns nil if we're not bootstrapping or we don't have a P. The caller's
+// P must not change, so we must be in a non-preemptible state.
 func getMCache() *mcache {
 	// Grab the mcache, since that's where stats live.
 	pp := getg().m.p.ptr()
@@ -136,9 +136,6 @@ func getMCache() *mcache {
 		// mcache0 is cleared when bootstrapping is complete,
 		// by procresize.
 		c = mcache0
-		if c == nil {
-			throw("getMCache called with no P or outside bootstrapping")
-		}
 	} else {
 		c = pp.mcache
 	}
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index a242577bd9..ab4e28a60b 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -734,6 +734,9 @@ func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr
 
 	// Update consistent accounting too.
 	c := getMCache()
+	if c == nil {
+		throw("scavengeRangeLocked called without a P or outside bootstrapping")
+	}
 	stats := memstats.heapStats.acquire(c)
 	atomic.Xaddint64(&stats.committed, -nbytes)
 	atomic.Xaddint64(&stats.released, nbytes)
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 66a59cb999..6b29f34a82 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -1247,6 +1247,10 @@ HaveSpan:
 	}
 	// Update consistent stats.
 	c := getMCache()
+	if c == nil {
+		// TODO(mknyszek): Remove this and handle this case to fix #42339.
+		throw("allocSpan called without P or outside bootstrapping")
+	}
 	stats := memstats.heapStats.acquire(c)
 	atomic.Xaddint64(&stats.committed, int64(scav))
 	atomic.Xaddint64(&stats.released, -int64(scav))
@@ -1341,6 +1345,10 @@ func (h *mheap) grow(npage uintptr) bool {
 		// just add directly to heap_released.
 		atomic.Xadd64(&memstats.heap_released, int64(asize))
 		c := getMCache()
+		if c == nil {
+			// TODO(mknyszek): Remove this and handle this case to fix #42339.
+			throw("grow called without P or outside bootstrapping")
+		}
 		stats := memstats.heapStats.acquire(c)
 		atomic.Xaddint64(&stats.released, int64(asize))
 		memstats.heapStats.release(c)
@@ -1440,6 +1448,10 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) {
 	}
 	// Update consistent stats.
 	c := getMCache()
+	if c == nil {
+		// TODO(mknyszek): Remove this and handle this case to fix #42339.
+		throw("freeSpanLocked called without P or outside bootstrapping")
+	}
 	stats := memstats.heapStats.acquire(c)
 	switch typ {
 	case spanAllocHeap:
-- 
cgit v1.3


From 40f0359d52e04ed124a8f81e1ef8ac86957dd983 Mon Sep 17 00:00:00 2001
From: Brad Fitzpatrick <bradfitz@golang.org>
Date: Wed, 3 Jun 2020 11:03:22 -0700
Subject: runtime: avoid a bit of unneeded work when MemProfileRate==1

Change-Id: I1dc355bcaeb0e5fb06a7fddc4cf5db596d22e0b3
Reviewed-on: https://go-review.googlesource.com/c/go/+/236148
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Emmanuel Odeke <emmanuel@orijtech.com>
Reviewed-by: Austin Clements <austin@google.com>
---
 src/runtime/malloc.go | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'src/runtime/malloc.go')

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 551acd0796..f20ded5bf7 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -1221,6 +1221,13 @@ func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
 // distribution (exp(MemProfileRate)), so the best return value is a random
 // number taken from an exponential distribution whose mean is MemProfileRate.
 func nextSample() uintptr {
+	if MemProfileRate == 1 {
+		// Callers assign our return value to
+		// mcache.next_sample, but next_sample is not used
+		// when the rate is 1. So avoid the math below and
+		// just return something.
+		return 0
+	}
 	if GOOS == "plan9" {
 		// Plan 9 doesn't support floating point in note handler.
 		if g := getg(); g == g.m.gsignal {
-- 
cgit v1.3