diff options
Diffstat (limited to 'src/runtime')
35 files changed, 3004 insertions, 176 deletions
diff --git a/src/runtime/_mkmalloc/mkmalloc.go b/src/runtime/_mkmalloc/mkmalloc.go index 986b0aa9f8..1f040c8861 100644 --- a/src/runtime/_mkmalloc/mkmalloc.go +++ b/src/runtime/_mkmalloc/mkmalloc.go @@ -254,7 +254,8 @@ func inline(config generatorConfig) []byte { } // Write out the package and import declarations. - out.WriteString("// Code generated by mkmalloc.go; DO NOT EDIT.\n\n") + out.WriteString("// Code generated by mkmalloc.go; DO NOT EDIT.\n") + out.WriteString("// See overview in malloc_stubs.go.\n\n") out.WriteString("package " + f.Name.Name + "\n\n") for _, importDecl := range importDecls { out.Write(mustFormatNode(fset, importDecl)) diff --git a/src/runtime/arena_test.go b/src/runtime/arena_test.go index ca5223b59c..0bb1950464 100644 --- a/src/runtime/arena_test.go +++ b/src/runtime/arena_test.go @@ -36,6 +36,11 @@ type largeScalar [UserArenaChunkBytes + 1]byte type largePointer [UserArenaChunkBytes/unsafe.Sizeof(&smallPointer{}) + 1]*smallPointer func TestUserArena(t *testing.T) { + if Clobberfree() { + // This test crashes with SEGV in clobberfree in mgcsweep.go with GODEBUG=clobberfree=1. + t.Skip("triggers SEGV with GODEBUG=clobberfree=1") + } + // Set GOMAXPROCS to 2 so we don't run too many of these // tests in parallel. defer GOMAXPROCS(GOMAXPROCS(2)) @@ -228,6 +233,11 @@ func runSubTestUserArenaSlice[S comparable](t *testing.T, value []S, parallel bo } func TestUserArenaLiveness(t *testing.T) { + if Clobberfree() { + // This test crashes with SEGV in clobberfree in mgcsweep.go with GODEBUG=clobberfree=1. + t.Skip("triggers SEGV with GODEBUG=clobberfree=1") + } + t.Run("Free", func(t *testing.T) { testUserArenaLiveness(t, false) }) @@ -320,6 +330,11 @@ func testUserArenaLiveness(t *testing.T, useArenaFinalizer bool) { } func TestUserArenaClearsPointerBits(t *testing.T) { + if Clobberfree() { + // This test crashes with SEGV in clobberfree in mgcsweep.go with GODEBUG=clobberfree=1. 
+ t.Skip("triggers SEGV with GODEBUG=clobberfree=1") + } + // This is a regression test for a serious issue wherein if pointer bits // aren't properly cleared, it's possible to allocate scalar data down // into a previously pointer-ful area, causing misinterpretation by the GC. diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index ea85146936..7c746803a8 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -181,6 +181,14 @@ TEXT runtime·rt0_go(SB),NOSPLIT|NOFRAME|TOPFRAME,$0 MOVQ AX, 24(SP) MOVQ BX, 32(SP) + // This is typically the entry point for Go programs. + // Call stack unwinding must not proceed past this frame. + // Set the frame pointer register to 0 so that frame pointer-based unwinders + // (which don't use debug info for performance reasons) + // won't attempt to unwind past this function. + // See go.dev/issue/63630 + MOVQ $0, BP + // create istack out of the given (operating system) stack. // _cgo_init may update stackguard. MOVQ $runtime·g0(SB), DI @@ -408,6 +416,13 @@ TEXT runtime·asminit(SB),NOSPLIT,$0-0 RET TEXT runtime·mstart(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 + // This is the root frame of new Go-created OS threads. + // Call stack unwinding must not proceed past this frame. + // Set the frame pointer register to 0 so that frame pointer-based unwinders + // (which don't use debug info for performance reasons) + // won't attempt to unwind past this function. + // See go.dev/issue/63630 + MOVD $0, BP CALL runtime·mstart0(SB) RET // not reached diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index 902a7066aa..01f2690f4e 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -109,6 +109,14 @@ TEXT runtime·rt0_go(SB),NOSPLIT|TOPFRAME,$0 MOVW R0, 8(RSP) // argc MOVD R1, 16(RSP) // argv + // This is typically the entry point for Go programs. + // Call stack unwinding must not proceed past this frame. 
+ // Set the frame pointer register to 0 so that frame pointer-based unwinders + // (which don't use debug info for performance reasons) + // won't attempt to unwind past this function. + // See go.dev/issue/63630 + MOVD $0, R29 + #ifdef TLS_darwin // Initialize TLS. MOVD ZR, g // clear g, make sure it's not junk. @@ -248,6 +256,13 @@ TEXT runtime·asminit(SB),NOSPLIT|NOFRAME,$0-0 RET TEXT runtime·mstart(SB),NOSPLIT|TOPFRAME,$0 + // This is the root frame of new Go-created OS threads. + // Call stack unwinding must not proceed past this frame. + // Set the frame pointer register to 0 so that frame pointer-based unwinders + // (which don't use debug info for performance reasons) + // won't attempt to unwind past this function. + // See go.dev/issue/63630 + MOVD $0, R29 BL runtime·mstart0(SB) RET // not reached diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s index 5bd16181ee..428701a503 100644 --- a/src/runtime/asm_riscv64.s +++ b/src/runtime/asm_riscv64.s @@ -623,14 +623,14 @@ TEXT _cgo_topofstack(SB),NOSPLIT,$8 RET // func goexit(neverCallThisFunction) -// The top-most function running on a goroutine -// returns to goexit+PCQuantum. +// The top-most function running on a goroutine, returns to goexit+PCQuantum*2. +// Note that the NOPs are written in a manner that will not be compressed, +// since the offset must be known by the runtime. TEXT runtime·goexit(SB),NOSPLIT|NOFRAME|TOPFRAME,$0-0 - MOV ZERO, ZERO // NOP + WORD $0x00000013 // NOP JMP runtime·goexit1(SB) // does not return // traceback from goexit1 must hit code range of goexit - MOV ZERO, ZERO // NOP - + WORD $0x00000013 // NOP // This is called from .init_array and follows the platform, not the Go ABI. 
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0 diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go index 2b8ca549ad..00e67aeca0 100644 --- a/src/runtime/crash_test.go +++ b/src/runtime/crash_test.go @@ -413,6 +413,15 @@ func TestRepanickedPanicSandwich(t *testing.T) { } } +func TestDoublePanicWithSameValue(t *testing.T) { + output := runTestProg(t, "testprog", "DoublePanicWithSameValue") + want := `panic: message +` + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } +} + func TestGoexitCrash(t *testing.T) { // External linking brings in cgo, causing deadlock detection not working. testenv.MustInternalLink(t, deadlockBuildTypes) diff --git a/src/runtime/debuglog.go b/src/runtime/debuglog.go index e993e396c1..405f2455c6 100644 --- a/src/runtime/debuglog.go +++ b/src/runtime/debuglog.go @@ -196,7 +196,8 @@ const ( debugLogPtr debugLogString debugLogConstString - debugLogStringOverflow + debugLogHexdump + debugLogOverflow debugLogPC debugLogTraceback @@ -365,7 +366,7 @@ func (l *dloggerImpl) s(x string) *dloggerImpl { l.w.uvarint(uint64(len(b))) l.w.bytes(b) if len(b) != len(x) { - l.w.byte(debugLogStringOverflow) + l.w.byte(debugLogOverflow) l.w.uvarint(uint64(len(x) - len(b))) } } @@ -373,6 +374,32 @@ func (l *dloggerImpl) s(x string) *dloggerImpl { } //go:nosplit +func (l dloggerFake) hexdump(p unsafe.Pointer, bytes uintptr) dloggerFake { return l } + +//go:nosplit +func (l *dloggerImpl) hexdump(p unsafe.Pointer, bytes uintptr) *dloggerImpl { + var b []byte + bb := (*slice)(unsafe.Pointer(&b)) + bb.array = unsafe.Pointer(p) + bb.len, bb.cap = int(bytes), int(bytes) + if len(b) > debugLogStringLimit { + b = b[:debugLogStringLimit] + } + + l.w.byte(debugLogHexdump) + l.w.uvarint(uint64(uintptr(p))) + l.w.uvarint(uint64(len(b))) + l.w.bytes(b) + + if uintptr(len(b)) != bytes { + l.w.byte(debugLogOverflow) + l.w.uvarint(uint64(bytes) - uint64(len(b))) + } + + return l +} + +//go:nosplit func (l 
dloggerFake) pc(x uintptr) dloggerFake { return l } //go:nosplit @@ -708,9 +735,30 @@ func (r *debugLogReader) printVal() bool { s := *(*string)(unsafe.Pointer(&str)) print(s) - case debugLogStringOverflow: + case debugLogOverflow: print("..(", r.uvarint(), " more bytes)..") + case debugLogHexdump: + p := uintptr(r.uvarint()) + bl := r.uvarint() + if r.begin+bl > r.end { + r.begin = r.end + print("<hexdump length corrupted>") + break + } + println() // Start on a new line + hd := hexdumper{addr: p} + for bl > 0 { + b := r.data.b[r.begin%uint64(len(r.data.b)):] + if uint64(len(b)) > bl { + b = b[:bl] + } + r.begin += uint64(len(b)) + bl -= uint64(len(b)) + hd.write(b) + } + hd.close() + case debugLogPC: printDebugLogPC(uintptr(r.uvarint()), false) diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 3a781b7551..6e0360aaca 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -238,6 +238,12 @@ func SetEnvs(e []string) { envs = e } const PtrSize = goarch.PtrSize +const ClobberdeadPtr = clobberdeadPtr + +func Clobberfree() bool { + return debug.clobberfree != 0 +} + var ForceGCPeriod = &forcegcperiod // SetTracebackEnv is like runtime/debug.SetTraceback, but it raises @@ -633,6 +639,34 @@ func RunGetgThreadSwitchTest() { } } +// Expose freegc for testing. +func Freegc(p unsafe.Pointer, size uintptr, noscan bool) { + freegc(p, size, noscan) +} + +// Expose gcAssistBytes for the current g for testing. +func AssistCredit() int64 { + assistG := getg() + if assistG.m.curg != nil { + assistG = assistG.m.curg + } + return assistG.gcAssistBytes +} + +// Expose gcBlackenEnabled for testing. +func GcBlackenEnable() bool { + // Note we do a non-atomic load here. + // Some checks against gcBlackenEnabled (e.g., in mallocgc) + // are currently done via non-atomic load for performance reasons, + // but other checks are done via atomic load (e.g., in mgcmark.go), + // so interpreting this value in a test may be subtle. 
+ return gcBlackenEnabled != 0 +} + +const SizeSpecializedMallocEnabled = sizeSpecializedMallocEnabled + +const RuntimeFreegcEnabled = runtimeFreegcEnabled + const ( PageSize = pageSize PallocChunkPages = pallocChunkPages @@ -1472,6 +1506,15 @@ func Releasem() { releasem(getg().m) } +// GoschedIfBusy is an explicit preemption check to call back +// into the scheduler. This is useful for tests that run code +// which spend most of their time as non-preemptible, as it +// can be placed right after becoming preemptible again to ensure +// that the scheduler gets a chance to preempt the goroutine. +func GoschedIfBusy() { + goschedIfBusy() +} + type PIController struct { piController } @@ -1988,3 +2031,36 @@ func (head *ListHeadManual) Pop() unsafe.Pointer { func (head *ListHeadManual) Remove(p unsafe.Pointer) { head.l.remove(p) } + +func Hexdumper(base uintptr, wordBytes int, mark func(addr uintptr, start func()), data ...[]byte) string { + buf := make([]byte, 0, 2048) + getg().writebuf = buf + h := hexdumper{addr: base, addrBytes: 4, wordBytes: uint8(wordBytes)} + if mark != nil { + h.mark = func(addr uintptr, m hexdumpMarker) { + mark(addr, m.start) + } + } + for _, d := range data { + h.write(d) + } + h.close() + n := len(getg().writebuf) + getg().writebuf = nil + if n == cap(buf) { + panic("Hexdumper buf too small") + } + return string(buf[:n]) +} + +func HexdumpWords(p, bytes uintptr) string { + buf := make([]byte, 0, 2048) + getg().writebuf = buf + hexdumpWords(p, bytes, nil) + n := len(getg().writebuf) + getg().writebuf = nil + if n == cap(buf) { + panic("HexdumpWords buf too small") + } + return string(buf[:n]) +} diff --git a/src/runtime/hexdump.go b/src/runtime/hexdump.go new file mode 100644 index 0000000000..0d7dbb540b --- /dev/null +++ b/src/runtime/hexdump.go @@ -0,0 +1,269 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime + +import ( + "internal/goarch" + "unsafe" +) + +// hexdumpWords prints a word-oriented hex dump of [p, p+len). +// +// If mark != nil, it will be passed to hexdumper.mark. +func hexdumpWords(p, len uintptr, mark func(uintptr, hexdumpMarker)) { + printlock() + + // Provide a default annotation + symMark := func(u uintptr, hm hexdumpMarker) { + if mark != nil { + mark(u, hm) + } + + // Can we symbolize this value? + val := *(*uintptr)(unsafe.Pointer(u)) + fn := findfunc(val) + if fn.valid() { + hm.start() + print("<", funcname(fn), "+", hex(val-fn.entry()), ">\n") + } + } + + h := hexdumper{addr: p, mark: symMark} + h.write(unsafe.Slice((*byte)(unsafe.Pointer(p)), len)) + h.close() + printunlock() +} + +// hexdumper is a Swiss-army knife hex dumper. +// +// To use, optionally set addr and wordBytes, then call write repeatedly, +// followed by close. +type hexdumper struct { + // addr is the address to print for the first byte of data. + addr uintptr + + // addrBytes is the number of bytes of addr to print. If this is 0, it + // defaults to goarch.PtrSize. + addrBytes uint8 + + // wordBytes is the number of bytes in a word. If wordBytes is 1, this + // prints a byte-oriented dump. If it's > 1, this interprets the data as a + // sequence of words of the given size. If it's 0, it's treated as + // goarch.PtrSize. + wordBytes uint8 + + // mark is an optional function that can annotate values in the hex dump. + // + // If non-nil, it is called with the address of every complete, aligned word + // in the hex dump. + // + // If it decides to print an annotation, it must first call m.start(), then + // print the annotation, followed by a new line. + mark func(addr uintptr, m hexdumpMarker) + + // Below here is state + + ready int8 // 0=need to init state; 1=need to print header; 2=ready + + // dataBuf accumulates a line at a time of data, in case it's split across + // buffers. 
+ dataBuf [16]byte + dataPos uint8 + dataSkip uint8 // Skip first n bytes of buf on first line + + // toPos maps from byte offset in data to a visual offset in the printed line. + toPos [16]byte +} + +type hexdumpMarker struct { + chars int +} + +func (h *hexdumper) write(data []byte) { + if h.ready == 0 { + h.init() + } + + // Handle leading data + if h.dataPos > 0 { + n := copy(h.dataBuf[h.dataPos:], data) + h.dataPos += uint8(n) + data = data[n:] + if h.dataPos < uint8(len(h.dataBuf)) { + return + } + h.flushLine(h.dataBuf[:]) + h.dataPos = 0 + } + + // Handle full lines in data + for len(data) >= len(h.dataBuf) { + h.flushLine(data[:len(h.dataBuf)]) + data = data[len(h.dataBuf):] + } + + // Handle trailing data + h.dataPos = uint8(copy(h.dataBuf[:], data)) +} + +func (h *hexdumper) close() { + if h.dataPos > 0 { + h.flushLine(h.dataBuf[:h.dataPos]) + } +} + +func (h *hexdumper) init() { + const bytesPerLine = len(h.dataBuf) + + if h.addrBytes == 0 { + h.addrBytes = goarch.PtrSize + } else if h.addrBytes < 0 || h.addrBytes > goarch.PtrSize { + throw("invalid addrBytes") + } + + if h.wordBytes == 0 { + h.wordBytes = goarch.PtrSize + } + wb := int(h.wordBytes) + if wb < 0 || wb >= bytesPerLine || wb&(wb-1) != 0 { + throw("invalid wordBytes") + } + + // Construct position mapping. + for i := range h.toPos { + // First, calculate the "field" within the line, applying byte swizzling. + field := 0 + if goarch.BigEndian { + field = i + } else { + field = i ^ int(wb-1) + } + // Translate this field into a visual offset. + // "00112233 44556677 8899AABB CCDDEEFF" + h.toPos[i] = byte(field*2 + field/4 + field/8) + } + + // The first line may need to skip some fields to get to alignment. + // Round down the starting address. + nAddr := h.addr &^ uintptr(bytesPerLine-1) + // Skip bytes to get to alignment. + h.dataPos = uint8(h.addr - nAddr) + h.dataSkip = uint8(h.addr - nAddr) + h.addr = nAddr + + // We're ready to print the header. 
+ h.ready = 1 +} + +func (h *hexdumper) flushLine(data []byte) { + const bytesPerLine = len(h.dataBuf) + + const maxAddrChars = 2 * goarch.PtrSize + const addrSep = ": " + dataStart := int(2*h.addrBytes) + len(addrSep) + // dataChars uses the same formula to toPos above. We calculate it with the + // "last field", then add the size of the last field. + const dataChars = (bytesPerLine-1)*2 + (bytesPerLine-1)/4 + (bytesPerLine-1)/8 + 2 + const asciiSep = " " + asciiStart := dataStart + dataChars + len(asciiSep) + const asciiChars = bytesPerLine + nlPos := asciiStart + asciiChars + + var lineBuf [maxAddrChars + len(addrSep) + dataChars + len(asciiSep) + asciiChars + 1]byte + clear := func() { + for i := range lineBuf { + lineBuf[i] = ' ' + } + } + clear() + + if h.ready == 1 { + // Print column offsets header. + for offset, pos := range h.toPos { + h.fmtHex(lineBuf[dataStart+int(pos+1):][:1], uint64(offset)) + } + // Print ASCII offsets. + for offset := range asciiChars { + h.fmtHex(lineBuf[asciiStart+offset:][:1], uint64(offset)) + } + lineBuf[nlPos] = '\n' + gwrite(lineBuf[:nlPos+1]) + clear() + h.ready = 2 + } + + // Format address. + h.fmtHex(lineBuf[:2*h.addrBytes], uint64(h.addr)) + copy(lineBuf[2*h.addrBytes:], addrSep) + // Format data in hex and ASCII. + for offset, b := range data { + if offset < int(h.dataSkip) { + continue + } + + pos := h.toPos[offset] + h.fmtHex(lineBuf[dataStart+int(pos):][:2], uint64(b)) + + copy(lineBuf[dataStart+dataChars:], asciiSep) + ascii := uint8('.') + if b >= ' ' && b <= '~' { + ascii = b + } + lineBuf[asciiStart+offset] = ascii + } + // Trim buffer. + end := asciiStart + len(data) + lineBuf[end] = '\n' + buf := lineBuf[:end+1] + + // Print. + gwrite(buf) + + // Print marks. 
+ if h.mark != nil { + clear() + for offset := 0; offset+int(h.wordBytes) <= len(data); offset += int(h.wordBytes) { + if offset < int(h.dataSkip) { + continue + } + addr := h.addr + uintptr(offset) + // Find the position of the left edge of this word + caret := dataStart + int(min(h.toPos[offset], h.toPos[offset+int(h.wordBytes)-1])) + h.mark(addr, hexdumpMarker{caret}) + } + } + + h.addr += uintptr(bytesPerLine) + h.dataPos = 0 + h.dataSkip = 0 +} + +// fmtHex formats v in base 16 into buf. It fills all of buf. If buf is too +// small to represent v, it the output will start with '*'. +func (h *hexdumper) fmtHex(buf []byte, v uint64) { + const dig = "0123456789abcdef" + i := len(buf) - 1 + for ; i >= 0; i-- { + buf[i] = dig[v%16] + v /= 16 + } + if v != 0 { + // Indicate that we couldn't fit the whole number. + buf[0] = '*' + } +} + +func (m hexdumpMarker) start() { + var spaces [64]byte + for i := range spaces { + spaces[i] = ' ' + } + for m.chars > len(spaces) { + gwrite(spaces[:]) + m.chars -= len(spaces) + } + gwrite(spaces[:m.chars]) + print("^ ") +} diff --git a/src/runtime/hexdump_test.go b/src/runtime/hexdump_test.go new file mode 100644 index 0000000000..cc44e48e4b --- /dev/null +++ b/src/runtime/hexdump_test.go @@ -0,0 +1,151 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime_test + +import ( + "fmt" + "internal/abi" + "internal/goarch" + "runtime" + "slices" + "strings" + "testing" + "unsafe" +) + +func TestHexdumper(t *testing.T) { + check := func(label, got, want string) { + got = strings.TrimRight(got, "\n") + want = strings.TrimPrefix(want, "\n") + want = strings.TrimRight(want, "\n") + if got != want { + t.Errorf("%s: got\n%s\nwant\n%s", label, got, want) + } + } + + data := make([]byte, 32) + for i := range data { + data[i] = 0x10 + byte(i) + } + + check("basic", runtime.Hexdumper(0, 1, nil, data), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 10111213 14151617 18191a1b 1c1d1e1f ................ +00000010: 20212223 24252627 28292a2b 2c2d2e2f !"#$%&'()*+,-./`) + + if !goarch.BigEndian { + // Different word sizes + check("word=4", runtime.Hexdumper(0, 4, nil, data), ` + 3 2 1 0 7 6 5 4 b a 9 8 f e d c 0123456789abcdef +00000000: 13121110 17161514 1b1a1918 1f1e1d1c ................ +00000010: 23222120 27262524 2b2a2928 2f2e2d2c !"#$%&'()*+,-./`) + check("word=8", runtime.Hexdumper(0, 8, nil, data), ` + 7 6 5 4 3 2 1 0 f e d c b a 9 8 0123456789abcdef +00000000: 17161514 13121110 1f1e1d1c 1b1a1918 ................ +00000010: 27262524 23222120 2f2e2d2c 2b2a2928 !"#$%&'()*+,-./`) + } + + // Starting offset + check("offset=1", runtime.Hexdumper(1, 1, nil, data), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 101112 13141516 1718191a 1b1c1d1e ............... +00000010: 1f202122 23242526 2728292a 2b2c2d2e . !"#$%&'()*+,-. +00000020: 2f /`) + if !goarch.BigEndian { + // ... combined with a word size + check("offset=1 and word=4", runtime.Hexdumper(1, 4, nil, data), ` + 3 2 1 0 7 6 5 4 b a 9 8 f e d c 0123456789abcdef +00000000: 121110 16151413 1a191817 1e1d1c1b ............... +00000010: 2221201f 26252423 2a292827 2e2d2c2b . !"#$%&'()*+,-. +00000020: 2f /`) + } + + // Partial data full of annoying boundaries. 
+ partials := make([][]byte, 0) + for i := 0; i < len(data); i += 2 { + partials = append(partials, data[i:i+2]) + } + check("partials", runtime.Hexdumper(1, 1, nil, partials...), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 101112 13141516 1718191a 1b1c1d1e ............... +00000010: 1f202122 23242526 2728292a 2b2c2d2e . !"#$%&'()*+,-. +00000020: 2f /`) + + // Marks. + check("marks", runtime.Hexdumper(0, 1, func(addr uintptr, start func()) { + if addr%7 == 0 { + start() + println("mark") + } + }, data), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 10111213 14151617 18191a1b 1c1d1e1f ................ + ^ mark + ^ mark + ^ mark +00000010: 20212223 24252627 28292a2b 2c2d2e2f !"#$%&'()*+,-./ + ^ mark + ^ mark`) + if !goarch.BigEndian { + check("marks and word=4", runtime.Hexdumper(0, 4, func(addr uintptr, start func()) { + if addr%7 == 0 { + start() + println("mark") + } + }, data), ` + 3 2 1 0 7 6 5 4 b a 9 8 f e d c 0123456789abcdef +00000000: 13121110 17161514 1b1a1918 1f1e1d1c ................ + ^ mark +00000010: 23222120 27262524 2b2a2928 2f2e2d2c !"#$%&'()*+,-./ + ^ mark`) + } +} + +func TestHexdumpWords(t *testing.T) { + if goarch.BigEndian || goarch.PtrSize != 8 { + // We could support these, but it's kind of a pain. + t.Skip("requires 64-bit little endian") + } + + // Most of this is in hexdumper. Here we just test the symbolizer. + + pc := abi.FuncPCABIInternal(TestHexdumpWords) + pcs := slices.Repeat([]uintptr{pc}, 3) + + // Make sure pcs doesn't move around on us. + var p runtime.Pinner + defer p.Unpin() + p.Pin(&pcs[0]) + // Get a 16 byte, 16-byte-aligned chunk of pcs so the hexdump is simple. + start := uintptr(unsafe.Pointer(&pcs[0])) + start = (start + 15) &^ uintptr(15) + + // Do the hex dump. + got := runtime.HexdumpWords(start, 16) + + // Construct the expected output. 
+ pcStr := fmt.Sprintf("%016x", pc) + pcStr = pcStr[:8] + " " + pcStr[8:] // Add middle space + ascii := make([]byte, 8) + for i := range ascii { + b := byte(pc >> (8 * i)) + if b >= ' ' && b <= '~' { + ascii[i] = b + } else { + ascii[i] = '.' + } + } + want := fmt.Sprintf(` + 7 6 5 4 3 2 1 0 f e d c b a 9 8 0123456789abcdef +%016x: %s %s %s%s + ^ <runtime_test.TestHexdumpWords+0x0> + ^ <runtime_test.TestHexdumpWords+0x0> +`, start, pcStr, pcStr, ascii, ascii) + want = strings.TrimPrefix(want, "\n") + + if got != want { + t.Errorf("got\n%s\nwant\n%s", got, want) + } +} diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index fc4f21b532..d49dacaf68 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -1080,7 +1080,8 @@ func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, checkGCTrigger // // We might consider turning these on by default; many of them previously were. // They account for a few % of mallocgc's cost though, which does matter somewhat -// at scale. +// at scale. (When testing changes to malloc, consider enabling this, and also +// some function-local 'doubleCheck' consts such as in mbitmap.go currently.) const doubleCheckMalloc = false // sizeSpecializedMallocEnabled is the set of conditions where we enable the size-specialized @@ -1089,6 +1090,14 @@ const doubleCheckMalloc = false // properly on plan9, so size-specialized malloc is also disabled on plan9. const sizeSpecializedMallocEnabled = goexperiment.SizeSpecializedMalloc && GOOS != "plan9" && !asanenabled && !raceenabled && !msanenabled && !valgrindenabled +// runtimeFreegcEnabled is the set of conditions where we enable the runtime.freegc +// implementation and the corresponding allocation-related changes: the experiment must be +// enabled, and none of the memory sanitizers should be enabled. We allow the race detector, +// in contrast to sizeSpecializedMallocEnabled. 
+// TODO(thepudds): it would be nice to check Valgrind integration, though there are some hints +// there might not be any canned tests in tree for Go's integration with Valgrind. +const runtimeFreegcEnabled = goexperiment.RuntimeFreegc && !asanenabled && !msanenabled && !valgrindenabled + // Allocate an object of size bytes. // Small objects are allocated from the per-P cache's free lists. // Large objects (> 32 kB) are allocated straight from the heap. @@ -1150,7 +1159,8 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { size += asanRZ } - // Assist the GC if needed. + // Assist the GC if needed. (On the reuse path, we currently compensate for this; + // changes here might require changes there.) if gcBlackenEnabled != 0 { deductAssistCredit(size) } @@ -1413,6 +1423,16 @@ func mallocgcSmallNoscan(size uintptr, typ *_type, needzero bool) (unsafe.Pointe size = uintptr(gc.SizeClassToSize[sizeclass]) spc := makeSpanClass(sizeclass, true) span := c.alloc[spc] + + // First, check for a reusable object. + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + // We have a reusable object, use it. + x := mallocgcSmallNoscanReuse(c, span, spc, size, needzero) + mp.mallocing = 0 + releasem(mp) + return x, size + } + v := nextFreeFast(span) if v == 0 { v, span, checkGCTrigger = c.nextFree(spc) @@ -1472,6 +1492,55 @@ func mallocgcSmallNoscan(size uintptr, typ *_type, needzero bool) (unsafe.Pointe return x, size } +// mallocgcSmallNoscanReuse returns a previously freed noscan object after preparing it for reuse. +// It must only be called if hasReusableNoscan returned true. +func mallocgcSmallNoscanReuse(c *mcache, span *mspan, spc spanClass, size uintptr, needzero bool) unsafe.Pointer { + // TODO(thepudds): could nextFreeFast, nextFree and nextReusable return unsafe.Pointer? + // Maybe doesn't matter. gclinkptr might be for historical reasons. 
+ v, span := c.nextReusableNoScan(span, spc) + x := unsafe.Pointer(v) + + // Compensate for the GC assist credit deducted in mallocgc (before calling us and + // after we return) because this is not a newly allocated object. We use the full slot + // size (elemsize) here because that's what mallocgc deducts overall. Note we only + // adjust this when gcBlackenEnabled is true, which follows mallocgc behavior. + // TODO(thepudds): a follow-up CL adds a more specific test of our assist credit + // handling, including for validating internal fragmentation handling. + if gcBlackenEnabled != 0 { + addAssistCredit(size) + } + + // This is a previously used object, so only check needzero (and not span.needzero) + // for clearing. + if needzero { + memclrNoHeapPointers(x, size) + } + + // See publicationBarrier comment in mallocgcSmallNoscan. + publicationBarrier() + + // Finish and return. Note that we do not update span.freeIndexForScan, profiling info, + // nor do we check gcTrigger. + // TODO(thepudds): the current approach is viable for a GOEXPERIMENT, but + // means we do not profile reused heap objects. Ultimately, we will need a better + // approach for profiling, or at least ensure we are not introducing bias in the + // profiled allocations. + // TODO(thepudds): related, we probably want to adjust how allocs and frees are counted + // in the existing stats. Currently, reused objects are not counted as allocs nor + // frees, but instead roughly appear as if the original heap object lived on. We + // probably will also want some additional runtime/metrics, and generally think about + // user-facing observability & diagnostics, though all this likely can wait for an + // official proposal. + if writeBarrier.enabled { + // Allocate black during GC. + // All slots hold nil so no scanning is needed. + // This may be racing with GC so do it atomically if there can be + // a race marking the bit. 
+ gcmarknewobject(span, uintptr(x)) + } + return x +} + func mallocgcSmallScanNoHeader(size uintptr, typ *_type) (unsafe.Pointer, uintptr) { // Set mp.mallocing to keep from being preempted by GC. mp := acquirem() @@ -1816,8 +1885,6 @@ func postMallocgcDebug(x unsafe.Pointer, elemsize uintptr, typ *_type) { // by size bytes, and assists the GC if necessary. // // Caller must be preemptible. -// -// Returns the G for which the assist credit was accounted. func deductAssistCredit(size uintptr) { // Charge the current user G for this allocation. assistG := getg() @@ -1836,6 +1903,267 @@ func deductAssistCredit(size uintptr) { } } +// addAssistCredit is like deductAssistCredit, +// but adds credit rather than removes, +// and never calls gcAssistAlloc. +func addAssistCredit(size uintptr) { + // Credit the current user G. + assistG := getg() + if assistG.m.curg != nil { // TODO(thepudds): do we need to do this? + assistG = assistG.m.curg + } + // Credit the size against the G. + assistG.gcAssistBytes += int64(size) +} + +const ( + // doubleCheckReusable enables some additional invariant checks for the + // runtime.freegc and reusable objects. Note that some of these checks alter timing, + // and it is good to test changes with and without this enabled. + doubleCheckReusable = false + + // debugReusableLog enables some printlns for runtime.freegc and reusable objects. + debugReusableLog = false +) + +// freegc records that a heap object is reusable and available for +// immediate reuse in a subsequent mallocgc allocation, without +// needing to wait for the GC cycle to progress. +// +// The information is recorded in a free list stored in the +// current P's mcache. The caller must pass in the user size +// and whether the object has pointers, which allows a faster free +// operation. +// +// freegc must be called by the effective owner of ptr who knows +// the pointer is logically dead, with no possible aliases that might +// be used past that moment. 
In other words, ptr must be the +// last and only pointer to its referent. +// +// The intended caller is the compiler. +// +// Note: please do not send changes that attempt to add freegc calls +// to the standard library. +// +// ptr must point to a heap object or into the current g's stack, +// in which case freegc is a no-op. In particular, ptr must not point +// to memory in the data or bss sections, which is partially enforced. +// For objects with a malloc header, ptr should point mallocHeaderSize bytes +// past the base; otherwise, ptr should point to the base of the heap object. +// In other words, ptr should be the same pointer that was returned by mallocgc. +// +// In addition, the caller must know that ptr's object has no specials, such +// as might have been created by a call to SetFinalizer or AddCleanup. +// (Internally, the runtime deals appropriately with internally-created +// specials, such as specials for memory profiling). +// +// If the size of ptr's object is less than 16 bytes or greater than +// 32KiB - gc.MallocHeaderSize bytes, freegc is currently a no-op. It must only +// be called in alloc-safe places. It currently throws if noscan is false +// (support for which is implemented in a later CL in our stack). +// +// Note that freegc accepts an unsafe.Pointer and hence keeps the pointer +// alive. It therefore could be a pessimization in some cases (such +// as a long-lived function) if the caller does not call freegc before +// or roughly when the liveness analysis of the compiler +// would otherwise have determined ptr's object is reclaimable by the GC. +func freegc(ptr unsafe.Pointer, size uintptr, noscan bool) bool { + if !runtimeFreegcEnabled || !reusableSize(size) { + return false + } + if sizeSpecializedMallocEnabled && !noscan { + // TODO(thepudds): temporarily disable freegc with SizeSpecializedMalloc for pointer types + // until we finish integrating. 
+ return false + } + + if ptr == nil { + throw("freegc nil") + } + + // Set mp.mallocing to keep from being preempted by GC. + // Otherwise, the GC could flush our mcache or otherwise cause problems. + mp := acquirem() + if mp.mallocing != 0 { + throw("freegc deadlock") + } + if mp.gsignal == getg() { + throw("freegc during signal") + } + mp.mallocing = 1 + + if mp.curg.stack.lo <= uintptr(ptr) && uintptr(ptr) < mp.curg.stack.hi { + // This points into our stack, so free is a no-op. + mp.mallocing = 0 + releasem(mp) + return false + } + + if doubleCheckReusable { + // TODO(thepudds): we could enforce no free on globals in bss or data. Maybe by + // checking span via spanOf or spanOfHeap, or maybe walk from firstmoduledata + // like isGoPointerWithoutSpan, or activeModules, or something. If so, we might + // be able to delay checking until reuse (e.g., check span just before reusing, + // though currently we don't always need to lookup a span on reuse). If we think + // no usage patterns could result in globals, maybe enforcement for globals could + // be behind -d=checkptr=1 or similar. The compiler can have knowledge of where + // a variable is allocated, but stdlib does not, although there are certain + // usage patterns that cannot result in a global. + // TODO(thepudds): separately, consider a local debugReusableMcacheOnly here + // to ignore freed objects if not in mspan in mcache, maybe when freeing and reading, + // by checking something like s.base() <= uintptr(v) && uintptr(v) < s.limit. Or + // maybe a GODEBUG or compiler debug flag. + span := spanOf(uintptr(ptr)) + if span == nil { + throw("nextReusable: nil span for pointer in free list") + } + if state := span.state.get(); state != mSpanInUse { + throw("nextReusable: span is not in use") + } + } + + if debug.clobberfree != 0 { + clobberfree(ptr, size) + } + + // We first check if p is still in our per-P cache. + // Get our per-P cache for small objects. 
+ c := getMCache(mp) + if c == nil { + throw("freegc called without a P or outside bootstrapping") + } + + v := uintptr(ptr) + if !noscan && !heapBitsInSpan(size) { + // mallocgcSmallScanHeader expects to get the base address of the object back + // from the findReusable funcs (as well as from nextFreeFast and nextFree), and + // not mallocHeaderSize bytes into a object, so adjust that here. + v -= mallocHeaderSize + + // The size class lookup wants size to be adjusted by mallocHeaderSize. + size += mallocHeaderSize + } + + // TODO(thepudds): should verify (behind doubleCheckReusable constant) that our calculated + // sizeclass here matches what's in span found via spanOf(ptr) or findObject(ptr). + var sizeclass uint8 + if size <= gc.SmallSizeMax-8 { + sizeclass = gc.SizeToSizeClass8[divRoundUp(size, gc.SmallSizeDiv)] + } else { + sizeclass = gc.SizeToSizeClass128[divRoundUp(size-gc.SmallSizeMax, gc.LargeSizeDiv)] + } + + spc := makeSpanClass(sizeclass, noscan) + s := c.alloc[spc] + + if debugReusableLog { + if s.base() <= uintptr(v) && uintptr(v) < s.limit { + println("freegc [in mcache]:", hex(uintptr(v)), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled) + } else { + println("freegc [NOT in mcache]:", hex(uintptr(v)), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled) + } + } + + if noscan { + c.addReusableNoscan(spc, uintptr(v)) + } else { + // TODO(thepudds): implemented in later CL in our stack. + throw("freegc called for object with pointers, not yet implemented") + } + + // For stats, for now we leave allocCount alone, roughly pretending to the rest + // of the system that this potential reuse never happened. + + mp.mallocing = 0 + releasem(mp) + + return true +} + +// nextReusableNoScan returns the next reusable object for a noscan span, +// or 0 if no reusable object is found. 
+func (c *mcache) nextReusableNoScan(s *mspan, spc spanClass) (gclinkptr, *mspan) { + if !runtimeFreegcEnabled { + return 0, s + } + + // Pop a reusable pointer from the free list for this span class. + v := c.reusableNoscan[spc] + if v == 0 { + return 0, s + } + c.reusableNoscan[spc] = v.ptr().next + + if debugReusableLog { + println("reusing from ptr free list:", hex(v), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled) + } + if doubleCheckReusable { + doubleCheckNextReusable(v) // debug only sanity check + } + + // For noscan spans, we only need the span if the write barrier is enabled (so that our caller + // can call gcmarknewobject to allocate black). If the write barrier is enabled, we can skip + // looking up the span when the pointer is in a span in the mcache. + if !writeBarrier.enabled { + return v, nil + } + if s.base() <= uintptr(v) && uintptr(v) < s.limit { + // Return the original span. + return v, s + } + + // We must find and return the span. + span := spanOf(uintptr(v)) + if span == nil { + // TODO(thepudds): construct a test that triggers this throw. + throw("nextReusableNoScan: nil span for pointer in reusable object free list") + } + + return v, span +} + +// doubleCheckNextReusable checks some invariants. +// TODO(thepudds): will probably delete some of this. Can mostly be ignored for review. +func doubleCheckNextReusable(v gclinkptr) { + // TODO(thepudds): should probably take the spanClass as well to confirm expected + // sizeclass match. 
+ _, span, objIndex := findObject(uintptr(v), 0, 0) + if span == nil { + throw("nextReusable: nil span for pointer in free list") + } + if state := span.state.get(); state != mSpanInUse { + throw("nextReusable: span is not in use") + } + if uintptr(v) < span.base() || uintptr(v) >= span.limit { + throw("nextReusable: span is not in range") + } + if span.objBase(uintptr(v)) != uintptr(v) { + print("nextReusable: v=", hex(v), " base=", hex(span.objBase(uintptr(v))), "\n") + throw("nextReusable: v is non-base-address for object found on pointer free list") + } + if span.isFree(objIndex) { + throw("nextReusable: pointer on free list is free") + } + + const debugReusableEnsureSwept = false + if debugReusableEnsureSwept { + // Currently disabled. + // Note: ensureSwept here alters behavior (not just an invariant check). + span.ensureSwept() + if span.isFree(objIndex) { + throw("nextReusable: pointer on free list is free after ensureSwept") + } + } +} + +// reusableSize reports if size is a currently supported size for a reusable object. +func reusableSize(size uintptr) bool { + if size < maxTinySize || size > maxSmallSize-mallocHeaderSize { + return false + } + return true +} + // memclrNoHeapPointersChunked repeatedly calls memclrNoHeapPointers // on chunks of the buffer to be zeroed, with opportunities for preemption // along the way. memclrNoHeapPointers contains no safepoints and also diff --git a/src/runtime/malloc_generated.go b/src/runtime/malloc_generated.go index 2215dbaddb..5abb61257a 100644 --- a/src/runtime/malloc_generated.go +++ b/src/runtime/malloc_generated.go @@ -1,4 +1,5 @@ // Code generated by mkmalloc.go; DO NOT EDIT. +// See overview in malloc_stubs.go. 
package runtime @@ -6400,6 +6401,32 @@ func mallocgcSmallNoScanSC2(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6497,6 +6524,32 @@ func mallocgcSmallNoScanSC3(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6594,6 +6647,32 @@ func mallocgcSmallNoScanSC4(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; 
assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6691,6 +6770,32 @@ func mallocgcSmallNoScanSC5(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6788,6 +6893,32 @@ func mallocgcSmallNoScanSC6(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6885,6 +7016,32 @@ func mallocgcSmallNoScanSC7(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + 
+ v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6982,6 +7139,32 @@ func mallocgcSmallNoScanSC8(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7079,6 +7262,32 @@ func mallocgcSmallNoScanSC9(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := 
sys.TrailingZeros64(span.allocCache) @@ -7176,6 +7385,32 @@ func mallocgcSmallNoScanSC10(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7273,6 +7508,32 @@ func mallocgcSmallNoScanSC11(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7370,6 +7631,32 @@ func mallocgcSmallNoScanSC12(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG 
:= getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7467,6 +7754,32 @@ func mallocgcSmallNoScanSC13(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7564,6 +7877,32 @@ func mallocgcSmallNoScanSC14(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7661,6 +8000,32 @@ func mallocgcSmallNoScanSC15(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && 
c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7758,6 +8123,32 @@ func mallocgcSmallNoScanSC16(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7855,6 +8246,32 @@ func mallocgcSmallNoScanSC17(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { 
theBit := sys.TrailingZeros64(span.allocCache) @@ -7952,6 +8369,32 @@ func mallocgcSmallNoScanSC18(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8049,6 +8492,32 @@ func mallocgcSmallNoScanSC19(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8146,6 +8615,32 @@ func mallocgcSmallNoScanSC20(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if 
assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8243,6 +8738,32 @@ func mallocgcSmallNoScanSC21(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8340,6 +8861,32 @@ func mallocgcSmallNoScanSC22(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8437,6 +8984,32 @@ func mallocgcSmallNoScanSC23(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && 
c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8534,6 +9107,32 @@ func mallocgcSmallNoScanSC24(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8631,6 +9230,32 @@ func mallocgcSmallNoScanSC25(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { 
theBit := sys.TrailingZeros64(span.allocCache) @@ -8728,6 +9353,32 @@ func mallocgcSmallNoScanSC26(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) diff --git a/src/runtime/malloc_stubs.go b/src/runtime/malloc_stubs.go index 224746f3d4..e9752956b8 100644 --- a/src/runtime/malloc_stubs.go +++ b/src/runtime/malloc_stubs.go @@ -7,6 +7,8 @@ // to produce a full mallocgc function that's specialized for a span class // or specific size in the case of the tiny allocator. // +// To generate the specialized mallocgc functions, do 'go run .' inside runtime/_mkmalloc. +// // To assemble a mallocgc function, the mallocStub function is cloned, and the call to // inlinedMalloc is replaced with the inlined body of smallScanNoHeaderStub, // smallNoScanStub or tinyStub, depending on the parameters being specialized. @@ -71,7 +73,8 @@ func mallocStub(size uintptr, typ *_type, needzero bool) unsafe.Pointer { } } - // Assist the GC if needed. + // Assist the GC if needed. (On the reuse path, we currently compensate for this; + // changes here might require changes there.) 
if gcBlackenEnabled != 0 { deductAssistCredit(size) } @@ -242,6 +245,23 @@ func smallNoScanStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, u c := getMCache(mp) const spc = spanClass(sizeclass<<1) | spanClass(noscanint_) span := c.alloc[spc] + + // First, check for a reusable object. + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + // We have a reusable object, use it. + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + + // TODO(thepudds): note that the generated return path is essentially duplicated + // by the generator. For example, see the two postMallocgcDebug calls and + // related duplicated code on the return path currently in the generated + // mallocgcSmallNoScanSC2 function. One set of those correspond to this + // return here. We might be able to de-duplicate the generated return path + // by updating the generator, perhaps by jumping to a shared return or similar. + return v, elemsize + } + v := nextFreeFastStub(span) if v == 0 { v, span, checkGCTrigger = c.nextFree(spc) diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go index bf58947bbc..97cf0eed54 100644 --- a/src/runtime/malloc_test.go +++ b/src/runtime/malloc_test.go @@ -16,6 +16,7 @@ import ( "runtime" . "runtime" "strings" + "sync" "sync/atomic" "testing" "time" @@ -234,6 +235,364 @@ func TestTinyAllocIssue37262(t *testing.T) { runtime.Releasem() } +// TestFreegc does basic testing of explicit frees. +func TestFreegc(t *testing.T) { + tests := []struct { + size string + f func(noscan bool) func(*testing.T) + noscan bool + }{ + // Types without pointers. 
+ {"size=16", testFreegc[[16]byte], true}, // smallest we support currently + {"size=17", testFreegc[[17]byte], true}, + {"size=64", testFreegc[[64]byte], true}, + {"size=500", testFreegc[[500]byte], true}, + {"size=512", testFreegc[[512]byte], true}, + {"size=4096", testFreegc[[4096]byte], true}, + {"size=20000", testFreegc[[20000]byte], true}, // not power of 2 or spc boundary + {"size=32KiB-8", testFreegc[[1<<15 - 8]byte], true}, // max noscan small object for 64-bit + } + + // Run the tests twice if not in -short mode or not otherwise saving test time. + // First while manually calling runtime.GC to slightly increase isolation (perhaps making + // problems more reproducible). + for _, tt := range tests { + runtime.GC() + t.Run(fmt.Sprintf("gc=yes/ptrs=%v/%s", !tt.noscan, tt.size), tt.f(tt.noscan)) + } + runtime.GC() + + if testing.Short() || !RuntimeFreegcEnabled || runtime.Raceenabled { + return + } + + // Again, but without manually calling runtime.GC in the loop (perhaps less isolation might + // trigger problems). + for _, tt := range tests { + t.Run(fmt.Sprintf("gc=no/ptrs=%v/%s", !tt.noscan, tt.size), tt.f(tt.noscan)) + } + runtime.GC() +} + +func testFreegc[T comparable](noscan bool) func(*testing.T) { + // We use stressMultiple to influence the duration of the tests. + // When testing freegc changes, stressMultiple can be increased locally + // to test longer or in some cases with more goroutines. + // It can also be helpful to test with GODEBUG=clobberfree=1 and + // with and without doubleCheckMalloc and doubleCheckReusable enabled. + stressMultiple := 10 + if testing.Short() || !RuntimeFreegcEnabled || runtime.Raceenabled { + stressMultiple = 1 + } + + return func(t *testing.T) { + alloc := func() *T { + // Force heap alloc, plus some light validation of zeroed memory. 
+ t.Helper() + p := Escape(new(T)) + var zero T + if *p != zero { + t.Fatalf("allocator returned non-zero memory: %v", *p) + } + return p + } + + free := func(p *T) { + t.Helper() + var zero T + if *p != zero { + t.Fatalf("found non-zero memory before freegc (tests do not modify memory): %v", *p) + } + runtime.Freegc(unsafe.Pointer(p), unsafe.Sizeof(*p), noscan) + } + + t.Run("basic-free", func(t *testing.T) { + // Test that freeing a live heap object doesn't crash. + for range 100 { + p := alloc() + free(p) + } + }) + + t.Run("stack-free", func(t *testing.T) { + // Test that freeing a stack object doesn't crash. + for range 100 { + var x [32]byte + var y [32]*int + runtime.Freegc(unsafe.Pointer(&x), unsafe.Sizeof(x), true) // noscan + runtime.Freegc(unsafe.Pointer(&y), unsafe.Sizeof(y), false) // !noscan + } + }) + + // Check our allocations. These tests rely on the + // current implementation treating a re-used object + // as not adding to the allocation counts seen + // by testing.AllocsPerRun. (This is not the desired + // long-term behavior, but it is the current behavior and + // makes these tests convenient). + + t.Run("allocs-baseline", func(t *testing.T) { + // Baseline result without any explicit free. + allocs := testing.AllocsPerRun(100, func() { + for range 100 { + p := alloc() + _ = p + } + }) + if allocs < 100 { + // TODO(thepudds): we get exactly 100 for almost all the tests, but investigate why + // ~101 allocs for TestFreegc/ptrs=true/size=32KiB-8. + t.Fatalf("expected >=100 allocations, got %v", allocs) + } + }) + + t.Run("allocs-with-free", func(t *testing.T) { + // Same allocations, but now using explicit free so that + // no allocs get reported. (Again, not the desired long-term behavior). + if SizeSpecializedMallocEnabled && !noscan { + // TODO(thepudds): skip at this point in the stack for size-specialized malloc + // with !noscan. Additional integration with sizespecializedmalloc is in a later CL. 
+ t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc for pointer types") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping alloc tests with runtime.freegc disabled") + } + allocs := testing.AllocsPerRun(100, func() { + for range 100 { + p := alloc() + free(p) + } + }) + if allocs != 0 { + t.Fatalf("expected 0 allocations, got %v", allocs) + } + }) + + t.Run("free-multiple", func(t *testing.T) { + // Multiple allocations outstanding before explicitly freeing, + // but still within the limit of our smallest free list size + // so that no allocs are reported. (Again, not long-term behavior). + if SizeSpecializedMallocEnabled && !noscan { + // TODO(thepudds): skip at this point in the stack for size-specialized malloc + // with !noscan. Additional integration with sizespecializedmalloc is in a later CL. + t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc for pointer types") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping alloc tests with runtime.freegc disabled") + } + const maxOutstanding = 20 + s := make([]*T, 0, maxOutstanding) + allocs := testing.AllocsPerRun(100*stressMultiple, func() { + s = s[:0] + for range maxOutstanding { + p := alloc() + s = append(s, p) + } + for _, p := range s { + free(p) + } + }) + if allocs != 0 { + t.Fatalf("expected 0 allocations, got %v", allocs) + } + }) + + if runtime.GOARCH == "wasm" { + // TODO(thepudds): for wasm, double-check if just slow, vs. some test logic problem, + // vs. something else. It might have been wasm was slowest with tests that spawn + // many goroutines, which might be expected for wasm. This skip might no longer be + // needed now that we have tuned test execution time more, or perhaps wasm should just + // always run in short mode, which might also let us remove this skip. 
+ t.Skip("skipping remaining freegc tests, was timing out on wasm") + } + + t.Run("free-many", func(t *testing.T) { + // Confirm we are graceful if we have more freed elements at once + // than the max free list size. + s := make([]*T, 0, 1000) + iterations := stressMultiple * stressMultiple // currently 1 (-short) or 100 + for range iterations { + s = s[:0] + for range 1000 { + p := alloc() + s = append(s, p) + } + for _, p := range s { + free(p) + } + } + }) + + t.Run("duplicate-check", func(t *testing.T) { + // A simple duplicate allocation test. We track what should be the set + // of live pointers in a map across a series of allocs and frees, + // and fail if a live pointer value is returned by an allocation. + // TODO: maybe add randomness? allow more live pointers? do across goroutines? + live := make(map[uintptr]bool) + for i := range 100 * stressMultiple { + var s []*T + // Alloc 10 times, tracking the live pointer values. + for j := range 10 { + p := alloc() + uptr := uintptr(unsafe.Pointer(p)) + if live[uptr] { + t.Fatalf("found duplicate pointer (0x%x). i: %d j: %d", uptr, i, j) + } + live[uptr] = true + s = append(s, p) + } + // Explicitly free those pointers, removing them from the live map. + for k := range s { + p := s[k] + s[k] = nil + uptr := uintptr(unsafe.Pointer(p)) + free(p) + delete(live, uptr) + } + } + }) + + t.Run("free-other-goroutine", func(t *testing.T) { + // Use explicit free, but the free happens on a different goroutine than the alloc. + // This also lightly simulates how the free code sees P migration or flushing + // the mcache, assuming we have > 1 P. (Not using testing.AllocsPerRun here). 
+ iterations := 10 * stressMultiple * stressMultiple // currently 10 (-short) or 1000 + for _, capacity := range []int{2} { + for range iterations { + ch := make(chan *T, capacity) + var wg sync.WaitGroup + for range 2 { + wg.Add(1) + go func() { + defer wg.Done() + for p := range ch { + free(p) + } + }() + } + for range 100 { + p := alloc() + ch <- p + } + close(ch) + wg.Wait() + } + } + }) + + t.Run("many-goroutines", func(t *testing.T) { + // Allocate across multiple goroutines, freeing on the same goroutine. + // TODO: probably remove the duplicate checking here; not that useful. + counts := []int{1, 2, 4, 8, 10 * stressMultiple} + for _, goroutines := range counts { + var wg sync.WaitGroup + for range goroutines { + wg.Add(1) + go func() { + defer wg.Done() + live := make(map[uintptr]bool) + for range 100 * stressMultiple { + p := alloc() + uptr := uintptr(unsafe.Pointer(p)) + if live[uptr] { + panic("TestFreeLive: found duplicate pointer") + } + live[uptr] = true + free(p) + delete(live, uptr) + } + }() + } + wg.Wait() + } + }) + + t.Run("assist-credit", func(t *testing.T) { + // Allocate and free using the same span class repeatedly while + // verifying it results in a net zero change in assist credit. + // This helps double-check our manipulation of the assist credit + // during mallocgc/freegc, including in cases when there is + // internal fragmentation when the requested mallocgc size is + // smaller than the size class. + // + // See https://go.dev/cl/717520 for some additional discussion, + // including how we can deliberately cause the test to fail currently + // if we purposefully introduce some assist credit bugs. + if SizeSpecializedMallocEnabled && !noscan { + // TODO(thepudds): skip this test at this point in the stack; later CL has + // integration with sizespecializedmalloc. 
+ t.Skip("temporarily skip assist credit tests for GOEXPERIMENT=sizespecializedmalloc for pointer types") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping assist credit test with runtime.freegc disabled") + } + + // Use a background goroutine to continuously run the GC. + done := make(chan struct{}) + defer close(done) + go func() { + for { + select { + case <-done: + return + default: + runtime.GC() + } + } + }() + + // If making changes related to this test, consider testing locally with + // larger counts, like 100K or 1M. + counts := []int{1, 2, 10, 100 * stressMultiple} + // Dropping down to GOMAXPROCS=1 might help reduce noise. + defer GOMAXPROCS(GOMAXPROCS(1)) + size := int64(unsafe.Sizeof(*new(T))) + for _, count := range counts { + // Start by forcing a GC to reset this g's assist credit + // and perhaps help us get a cleaner measurement of GC cycle count. + runtime.GC() + for i := range count { + // We disable preemption to reduce other code's ability to adjust this g's + // assist credit or otherwise change things while we are measuring. + Acquirem() + + // We do two allocations per loop, with the second allocation being + // the one we measure. The first allocation tries to ensure at least one + // reusable object on the mspan's free list when we do our measured allocation. + p := alloc() + free(p) + + // Now do our primary allocation of interest, bracketed by measurements. + // We measure more than we strictly need (to log details in case of a failure). 
+ creditStart := AssistCredit() + blackenStart := GcBlackenEnable() + p = alloc() + blackenAfterAlloc := GcBlackenEnable() + creditAfterAlloc := AssistCredit() + free(p) + blackenEnd := GcBlackenEnable() + creditEnd := AssistCredit() + + Releasem() + GoschedIfBusy() + + delta := creditEnd - creditStart + if delta != 0 { + t.Logf("assist credit non-zero delta: %d", delta) + t.Logf("\t| size: %d i: %d count: %d", size, i, count) + t.Logf("\t| credit before: %d credit after: %d", creditStart, creditEnd) + t.Logf("\t| alloc delta: %d free delta: %d", + creditAfterAlloc-creditStart, creditEnd-creditAfterAlloc) + t.Logf("\t| gcBlackenEnable (start / after alloc / end): %v/%v/%v", + blackenStart, blackenAfterAlloc, blackenEnd) + t.FailNow() + } + } + } + }) + } +} + func TestPageCacheLeak(t *testing.T) { defer GOMAXPROCS(GOMAXPROCS(1)) leaked := PageCachePagesLeaked() @@ -337,6 +696,13 @@ func BenchmarkMalloc16(b *testing.B) { } } +func BenchmarkMalloc32(b *testing.B) { + for i := 0; i < b.N; i++ { + p := new([4]int64) + Escape(p) + } +} + func BenchmarkMallocTypeInfo8(b *testing.B) { for i := 0; i < b.N; i++ { p := new(struct { @@ -355,6 +721,15 @@ func BenchmarkMallocTypeInfo16(b *testing.B) { } } +func BenchmarkMallocTypeInfo32(b *testing.B) { + for i := 0; i < b.N; i++ { + p := new(struct { + p [32 / unsafe.Sizeof(uintptr(0))]*int + }) + Escape(p) + } +} + type LargeStruct struct { x [16][]byte } diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index cade81031d..82872f1454 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -44,7 +44,17 @@ type mcache struct { // The rest is not accessed on every malloc. - alloc [numSpanClasses]*mspan // spans to allocate from, indexed by spanClass + // alloc contains spans to allocate from, indexed by spanClass. 
+	alloc [numSpanClasses]*mspan
+
+	// TODO(thepudds): better to interleave alloc and reusableScan/reusableNoscan so that
+	// a single malloc call can often access both in the same cache line for a given spanClass.
+	// It's not interleaved right now in part to have slightly smaller diff, and might be
+	// negligible effect on current microbenchmarks.
+
+	// reusableNoscan contains linked lists of reusable noscan heap objects, indexed by spanClass.
+	// The next pointers are stored in the first word of the heap objects.
+	reusableNoscan [numSpanClasses]gclinkptr
 
 	stackcache [_NumStackOrders]stackfreelist
 
@@ -96,6 +106,7 @@ func allocmcache() *mcache {
 		c.alloc[i] = &emptymspan
 	}
 	c.nextSample = nextSample()
+
 	return c
 }
 
@@ -153,6 +164,16 @@ func (c *mcache) refill(spc spanClass) {
 	if s.allocCount != s.nelems {
 		throw("refill of span with free space remaining")
 	}
+
+	// TODO(thepudds): we might be able to allow mallocgcTiny to reuse 16 byte objects from spc==5,
+	// but for now, just clear our reusable objects for tinySpanClass.
+	if spc == tinySpanClass {
+		c.reusableNoscan[spc] = 0
+	}
+	if c.reusableNoscan[spc] != 0 {
+		throw("refill of span with reusable pointers remaining on pointer free list")
+	}
+
 	if s != &emptymspan {
 		// Mark this span as no longer cached.
 		if s.sweepgen != mheap_.sweepgen+3 {
@@ -312,6 +333,13 @@ func (c *mcache) releaseAll() {
 	c.tinyAllocs = 0
 	memstats.heapStats.release()
 
+	// Clear the reusable linked lists.
+	// For noscan objects, the nodes of the linked lists are the reusable heap objects themselves,
+	// so we can simply clear the linked list head pointers.
+	// TODO(thepudds): consider having debug logging of non-empty reusable lists getting cleared,
+	// maybe based on the existing debugReusableLog.
+	clear(c.reusableNoscan[:])
+
 	// Update heapLive and heapScan.
gcController.update(dHeapLive, scanAlloc) } @@ -339,3 +367,25 @@ func (c *mcache) prepareForSweep() { stackcache_clear(c) c.flushGen.Store(mheap_.sweepgen) // Synchronizes with gcStart } + +// addReusableNoscan adds a noscan object pointer to the reusable pointer free list +// for a span class. +func (c *mcache) addReusableNoscan(spc spanClass, ptr uintptr) { + if !runtimeFreegcEnabled { + return + } + + // Add to the reusable pointers free list. + v := gclinkptr(ptr) + v.ptr().next = c.reusableNoscan[spc] + c.reusableNoscan[spc] = v +} + +// hasReusableNoscan reports whether there is a reusable object available for +// a noscan spc. +func (c *mcache) hasReusableNoscan(spc spanClass) bool { + if !runtimeFreegcEnabled { + return false + } + return c.reusableNoscan[spc] != 0 +} diff --git a/src/runtime/mcleanup.go b/src/runtime/mcleanup.go index 383217aa05..fc71af9f3f 100644 --- a/src/runtime/mcleanup.go +++ b/src/runtime/mcleanup.go @@ -72,8 +72,9 @@ import ( // pass the object to the [KeepAlive] function after the last point // where the object must remain reachable. func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { - // Explicitly force ptr to escape to the heap. + // Explicitly force ptr and cleanup to escape to the heap. ptr = abi.Escape(ptr) + cleanup = abi.Escape(cleanup) // The pointer to the object must be valid. if ptr == nil { @@ -82,7 +83,8 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { usptr := uintptr(unsafe.Pointer(ptr)) // Check that arg is not equal to ptr. 
- if kind := abi.TypeOf(arg).Kind(); kind == abi.Pointer || kind == abi.UnsafePointer { + argType := abi.TypeOf(arg) + if kind := argType.Kind(); kind == abi.Pointer || kind == abi.UnsafePointer { if unsafe.Pointer(ptr) == *((*unsafe.Pointer)(unsafe.Pointer(&arg))) { panic("runtime.AddCleanup: ptr is equal to arg, cleanup will never run") } @@ -98,12 +100,23 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { return Cleanup{} } - fn := func() { - cleanup(arg) + // Create new storage for the argument. + var argv *S + if size := unsafe.Sizeof(arg); size < maxTinySize && argType.PtrBytes == 0 { + // Side-step the tiny allocator to avoid liveness issues, since this box + // will be treated like a root by the GC. We model the box as an array of + // uintptrs to guarantee maximum allocator alignment. + // + // TODO(mknyszek): Consider just making space in cleanupFn for this. The + // unfortunate part of this is it would grow specialCleanup by 16 bytes, so + // while there wouldn't be an allocation, *every* cleanup would take the + // memory overhead hit. + box := new([maxTinySize / goarch.PtrSize]uintptr) + argv = (*S)(unsafe.Pointer(box)) + } else { + argv = new(S) } - // Closure must escape. - fv := *(**funcval)(unsafe.Pointer(&fn)) - fv = abi.Escape(fv) + *argv = arg // Find the containing object. base, _, _ := findObject(usptr, 0, 0) @@ -120,7 +133,16 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { gcCleanups.createGs() } - id := addCleanup(unsafe.Pointer(ptr), fv) + id := addCleanup(unsafe.Pointer(ptr), cleanupFn{ + // Instantiate a caller function to call the cleanup, that is cleanup(*argv). + // + // TODO(mknyszek): This allocates because the generic dictionary argument + // gets closed over, but callCleanup doesn't even use the dictionary argument, + // so theoretically that could be removed, eliminating an allocation. 
+		call: callCleanup[S],
+		fn:   *(**funcval)(unsafe.Pointer(&cleanup)),
+		arg:  unsafe.Pointer(argv),
+	})
 	if debug.checkfinalizers != 0 {
 		cleanupFn := *(**funcval)(unsafe.Pointer(&cleanup))
 		setCleanupContext(unsafe.Pointer(ptr), abi.TypeFor[T](), sys.GetCallerPC(), cleanupFn.fn, id)
@@ -131,6 +153,16 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup {
 	}
 }
 
+// callCleanup is a helper for calling cleanups in a polymorphic way.
+//
+// In practice, all it does is call fn(*arg). arg must be a *T.
+//
+//go:noinline
+func callCleanup[T any](fn *funcval, arg unsafe.Pointer) {
+	cleanup := *(*func(T))(unsafe.Pointer(&fn))
+	cleanup(*(*T)(arg))
+}
+
 // Cleanup is a handle to a cleanup call for a specific object.
 type Cleanup struct {
 	// id is the unique identifier for the cleanup within the arena.
@@ -216,7 +248,17 @@ const cleanupBlockSize = 512
 // that the cleanup queue does not grow during marking (but it can shrink).
 type cleanupBlock struct {
 	cleanupBlockHeader
-	cleanups [(cleanupBlockSize - unsafe.Sizeof(cleanupBlockHeader{})) / goarch.PtrSize]*funcval
+	cleanups [(cleanupBlockSize - unsafe.Sizeof(cleanupBlockHeader{})) / unsafe.Sizeof(cleanupFn{})]cleanupFn
+}
+
+var cleanupFnPtrMask = [...]uint8{0b111}
+
+// cleanupFn represents a cleanup function with its argument, yet to be called.
+type cleanupFn struct {
+	// call is an adapter function that understands how to safely call fn(*arg).
+	call func(*funcval, unsafe.Pointer)
+	fn   *funcval       // cleanup function passed to AddCleanup.
+	arg  unsafe.Pointer // pointer to argument to pass to cleanup function.
 }
 
 var cleanupBlockPtrMask [cleanupBlockSize / goarch.PtrSize / 8]byte
@@ -245,8 +287,8 @@ type cleanupBlockHeader struct {
 //
 // Must only be called if the GC is in the sweep phase (gcphase == _GCoff),
 // because it does not synchronize with the garbage collector.
-func (b *cleanupBlock) enqueue(fn *funcval) bool { - b.cleanups[b.n] = fn +func (b *cleanupBlock) enqueue(c cleanupFn) bool { + b.cleanups[b.n] = c b.n++ return b.full() } @@ -375,7 +417,7 @@ func (q *cleanupQueue) tryTakeWork() bool { // enqueue queues a single cleanup for execution. // // Called by the sweeper, and only the sweeper. -func (q *cleanupQueue) enqueue(fn *funcval) { +func (q *cleanupQueue) enqueue(c cleanupFn) { mp := acquirem() pp := mp.p.ptr() b := pp.cleanups @@ -396,7 +438,7 @@ func (q *cleanupQueue) enqueue(fn *funcval) { } pp.cleanups = b } - if full := b.enqueue(fn); full { + if full := b.enqueue(c); full { q.full.push(&b.lfnode) pp.cleanups = nil q.addWork(1) @@ -641,7 +683,8 @@ func runCleanups() { gcCleanups.beginRunningCleanups() for i := 0; i < int(b.n); i++ { - fn := b.cleanups[i] + c := b.cleanups[i] + b.cleanups[i] = cleanupFn{} var racectx uintptr if raceenabled { @@ -650,20 +693,15 @@ func runCleanups() { // the same goroutine. // // Synchronize on fn. This would fail to find races on the - // closed-over values in fn (suppose fn is passed to multiple - // AddCleanup calls) if fn was not unique, but it is. Update - // the synchronization on fn if you intend to optimize it - // and store the cleanup function and cleanup argument on the - // queue directly. - racerelease(unsafe.Pointer(fn)) + // closed-over values in fn (suppose arg is passed to multiple + // AddCleanup calls) if arg was not unique, but it is. + racerelease(unsafe.Pointer(c.arg)) racectx = raceEnterNewCtx() - raceacquire(unsafe.Pointer(fn)) + raceacquire(unsafe.Pointer(c.arg)) } // Execute the next cleanup. - cleanup := *(*func())(unsafe.Pointer(&fn)) - cleanup() - b.cleanups[i] = nil + c.call(c.fn, c.arg) if raceenabled { // Restore the old context. 
diff --git a/src/runtime/mcleanup_test.go b/src/runtime/mcleanup_test.go index 22b9eccd20..341d30afa7 100644 --- a/src/runtime/mcleanup_test.go +++ b/src/runtime/mcleanup_test.go @@ -336,3 +336,31 @@ func TestCleanupLost(t *testing.T) { t.Errorf("expected %d cleanups to be executed, got %d", got, want) } } + +// BenchmarkAddCleanupAndStop benchmarks adding and removing a cleanup +// from the same allocation. +// +// At face value, this benchmark is unrealistic, since no program would +// do this in practice. However, adding cleanups to new allocations in a +// loop is also unrealistic. It adds additional unused allocations, +// exercises uncommon performance pitfalls in AddCleanup (traversing the +// specials list, which should just be its own benchmark), and executing +// cleanups at a frequency that is unlikely to appear in real programs. +// +// This benchmark is still useful however, since we can get a low-noise +// measurement of the cost of AddCleanup and Stop all in one without the +// above pitfalls: we can measure the pure overhead. We can then separate +// out the cost of each in CPU profiles if we so choose (they're not so +// inexpensive as to make this infeasible). +func BenchmarkAddCleanupAndStop(b *testing.B) { + b.ReportAllocs() + + type T struct { + v int + p unsafe.Pointer + } + x := new(T) + for b.Loop() { + runtime.AddCleanup(x, func(int) {}, 14).Stop() + } +} diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index 43afbc330b..febcd9558c 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -1727,7 +1727,13 @@ func gcBgMarkWorker(ready chan struct{}) { // the stack (see gopark). Prevent deadlock from recursively // starting GC by disabling preemption. gp.m.preemptoff = "GC worker init" - node := &new(gcBgMarkWorkerNodePadded).gcBgMarkWorkerNode // TODO: technically not allowed in the heap. See comment in tagptr.go. + // TODO: This is technically not allowed in the heap. See comment in tagptr.go. 
+ // + // It is kept alive simply by virtue of being used in the infinite loop + // below. gcBgMarkWorkerPool keeps pointers to nodes that are not + // GC-visible, so this must be kept alive indefinitely (even if + // GOMAXPROCS decreases). + node := &new(gcBgMarkWorkerNodePadded).gcBgMarkWorkerNode gp.m.preemptoff = "" node.gp.set(gp) diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index dd76973c62..714b9a51df 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -204,7 +204,7 @@ func gcMarkRootCheck() { }) } -// ptrmask for an allocation containing a single pointer. +// oneptrmask for an allocation containing a single pointer. var oneptrmask = [...]uint8{1} // markroot scans the i'th root. @@ -251,7 +251,7 @@ func markroot(gcw *gcWork, i uint32, flushBgCredit bool) int64 { // N.B. This only needs to synchronize with cleanup execution, which only resets these blocks. // All cleanup queueing happens during sweep. n := uintptr(atomic.Load(&cb.n)) - scanblock(uintptr(unsafe.Pointer(&cb.cleanups[0])), n*goarch.PtrSize, &cleanupBlockPtrMask[0], gcw, nil) + scanblock(uintptr(unsafe.Pointer(&cb.cleanups[0])), n*unsafe.Sizeof(cleanupFn{}), &cleanupBlockPtrMask[0], gcw, nil) } case work.baseSpans <= i && i < work.baseStacks: @@ -489,7 +489,7 @@ func gcScanFinalizer(spf *specialfinalizer, s *mspan, gcw *gcWork) { // gcScanCleanup scans the relevant parts of a cleanup special as a root. func gcScanCleanup(spc *specialCleanup, gcw *gcWork) { // The special itself is a root. - scanblock(uintptr(unsafe.Pointer(&spc.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil) + scanblock(uintptr(unsafe.Pointer(&spc.cleanup)), unsafe.Sizeof(cleanupFn{}), &cleanupFnPtrMask[0], gcw, nil) } // gcAssistAlloc performs GC work to make gp's assist debt positive. 
@@ -1524,29 +1524,32 @@ func scanConservative(b, n uintptr, ptrmask *uint8, gcw *gcWork, state *stackSca if debugScanConservative { printlock() print("conservatively scanning [", hex(b), ",", hex(b+n), ")\n") - hexdumpWords(b, b+n, func(p uintptr) byte { + hexdumpWords(b, n, func(p uintptr, m hexdumpMarker) { if ptrmask != nil { word := (p - b) / goarch.PtrSize bits := *addb(ptrmask, word/8) if (bits>>(word%8))&1 == 0 { - return '$' + return } } val := *(*uintptr)(unsafe.Pointer(p)) if state != nil && state.stack.lo <= val && val < state.stack.hi { - return '@' + m.start() + println("ptr to stack") + return } span := spanOfHeap(val) if span == nil { - return ' ' + return } idx := span.objIndex(val) if span.isFreeOrNewlyAllocated(idx) { - return ' ' + return } - return '*' + m.start() + println("ptr to heap") }) printunlock() } diff --git a/src/runtime/mgcmark_greenteagc.go b/src/runtime/mgcmark_greenteagc.go index 3594b33cfd..fa560f9966 100644 --- a/src/runtime/mgcmark_greenteagc.go +++ b/src/runtime/mgcmark_greenteagc.go @@ -978,7 +978,9 @@ func spanSetScans(spanBase uintptr, nelems uint16, imb *spanInlineMarkBits, toSc } func scanObjectSmall(spanBase, b, objSize uintptr, gcw *gcWork) { - ptrBits := heapBitsSmallForAddrInline(spanBase, b, objSize) + hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, objSize) + hbits := (*byte)(unsafe.Pointer(hbitsBase)) + ptrBits := extractHeapBitsSmall(hbits, spanBase, b, objSize) gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize) nptrs := 0 n := sys.OnesCount64(uint64(ptrBits)) @@ -1017,12 +1019,14 @@ func scanObjectsSmall(base, objSize uintptr, elems uint16, gcw *gcWork, scans *g break } n := sys.OnesCount64(uint64(bits)) + hbitsBase, _ := spanHeapBitsRange(base, gc.PageSize, objSize) + hbits := (*byte)(unsafe.Pointer(hbitsBase)) for range n { j := sys.TrailingZeros64(uint64(bits)) bits &^= 1 << j b := base + uintptr(i*(goarch.PtrSize*8)+j)*objSize - ptrBits := heapBitsSmallForAddrInline(base, b, 
objSize)
+			ptrBits := extractHeapBitsSmall(hbits, base, b, objSize)
 			gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize)
 
 			n := sys.OnesCount64(uint64(ptrBits))
@@ -1056,10 +1060,7 @@ func scanObjectsSmall(base, objSize uintptr, elems uint16, gcw *gcWork, scans *g
 	}
 }
 
-func heapBitsSmallForAddrInline(spanBase, addr, elemsize uintptr) uintptr {
-	hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, elemsize)
-	hbits := (*byte)(unsafe.Pointer(hbitsBase))
-
+func extractHeapBitsSmall(hbits *byte, spanBase, addr, elemsize uintptr) uintptr {
 	// These objects are always small enough that their bitmaps
 	// fit in a single word, so just load the word or two we need.
 	//
diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go
index 32c1b941e5..388cce83cd 100644
--- a/src/runtime/mgcpacer.go
+++ b/src/runtime/mgcpacer.go
@@ -10,7 +10,7 @@ import (
 	"internal/runtime/atomic"
 	"internal/runtime/math"
 	"internal/strconv"
-	_ "unsafe" // for go:linkname
+	_ "unsafe"
 )
 
 const (
@@ -749,30 +749,33 @@ func (c *gcControllerState) enlistWorker() {
 	}
 }
 
-// findRunnableGCWorker returns a background mark worker for pp if it
-// should be run. This must only be called when gcBlackenEnabled != 0.
-func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) {
+// assignWaitingGCWorker assigns a background mark worker to pp if one should
+// be run.
+//
+// If a worker is selected, it is assigned to pp.nextGCMarkWorker and the P is
+// wired as a GC mark worker. The G is still in _Gwaiting. If no worker is
+// selected, ok returns false.
+//
+// If assignWaitingGCWorker returns true, this P must either:
+// - Mark the G as runnable and run it, clearing pp.nextGCMarkWorker.
+// - Or, call c.releaseNextGCMarkWorker.
+//
+// This must only be called when gcBlackenEnabled != 0.
+func (c *gcControllerState) assignWaitingGCWorker(pp *p, now int64) (bool, int64) { if gcBlackenEnabled == 0 { throw("gcControllerState.findRunnable: blackening not enabled") } - // Since we have the current time, check if the GC CPU limiter - // hasn't had an update in a while. This check is necessary in - // case the limiter is on but hasn't been checked in a while and - // so may have left sufficient headroom to turn off again. if now == 0 { now = nanotime() } - if gcCPULimiter.needUpdate(now) { - gcCPULimiter.update(now) - } if !gcShouldScheduleWorker(pp) { // No good reason to schedule a worker. This can happen at // the end of the mark phase when there are still // assists tapering off. Don't bother running a worker // now because it'll just return immediately. - return nil, now + return false, now } if c.dedicatedMarkWorkersNeeded.Load() <= 0 && c.fractionalUtilizationGoal == 0 { @@ -783,7 +786,7 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { // When a dedicated worker stops running, the gcBgMarkWorker loop notes // the need for the worker before returning it to the pool. If we don't // see the need now, we wouldn't have found it in the pool anyway. - return nil, now + return false, now } // Grab a worker before we commit to running below. @@ -800,7 +803,7 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { // it will always do so with queued global work. Thus, that P // will be immediately eligible to re-run the worker G it was // just using, ensuring work can complete. - return nil, now + return false, now } decIfPositive := func(val *atomic.Int64) bool { @@ -823,7 +826,7 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { } else if c.fractionalUtilizationGoal == 0 { // No need for fractional workers. gcBgMarkWorkerPool.push(&node.node) - return nil, now + return false, now } else { // Is this P behind on the fractional utilization // goal? 
@@ -833,12 +836,51 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { if delta > 0 && float64(pp.gcFractionalMarkTime.Load())/float64(delta) > c.fractionalUtilizationGoal { // Nope. No need to run a fractional worker. gcBgMarkWorkerPool.push(&node.node) - return nil, now + return false, now } // Run a fractional worker. pp.gcMarkWorkerMode = gcMarkWorkerFractionalMode } + pp.nextGCMarkWorker = node + return true, now +} + +// findRunnableGCWorker returns a background mark worker for pp if it +// should be run. +// +// If findRunnableGCWorker returns a G, this P is wired as a GC mark worker and +// must run the G. +// +// This must only be called when gcBlackenEnabled != 0. +// +// This function is allowed to have write barriers because it is called from +// the portion of findRunnable that always has a P. +// +//go:yeswritebarrierrec +func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { + // Since we have the current time, check if the GC CPU limiter + // hasn't had an update in a while. This check is necessary in + // case the limiter is on but hasn't been checked in a while and + // so may have left sufficient headroom to turn off again. + if now == 0 { + now = nanotime() + } + if gcCPULimiter.needUpdate(now) { + gcCPULimiter.update(now) + } + + // If a worker wasn't already assigned by procresize, assign one now. + if pp.nextGCMarkWorker == nil { + ok, now := c.assignWaitingGCWorker(pp, now) + if !ok { + return nil, now + } + } + + node := pp.nextGCMarkWorker + pp.nextGCMarkWorker = nil + // Run the background mark worker. gp := node.gp.ptr() trace := traceAcquire() @@ -850,6 +892,23 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { return gp, now } +// Release an unused pp.nextGCMarkWorker, if any. +// +// This function is allowed to have write barriers because it is called from +// the portion of schedule. 
+// +//go:yeswritebarrierrec +func (c *gcControllerState) releaseNextGCMarkWorker(pp *p) { + node := pp.nextGCMarkWorker + if node == nil { + return + } + + c.markWorkerStop(pp.gcMarkWorkerMode, 0) + gcBgMarkWorkerPool.push(&node.node) + pp.nextGCMarkWorker = nil +} + // resetLive sets up the controller state for the next mark phase after the end // of the previous one. Must be called after endCycle and before commit, before // the world is started. diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index c3d6afb90a..4eecb1cfd9 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -885,7 +885,7 @@ func (s *mspan) reportZombies() { if length > 1024 { length = 1024 } - hexdumpWords(addr, addr+length, nil) + hexdumpWords(addr, length, nil) } mbits.advance() abits.advance() diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 711c7790eb..d2ff063b00 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -435,7 +435,7 @@ type mspan struct { // indicating a free object. freeindex is then adjusted so that subsequent scans begin // just past the newly discovered free object. // - // If freeindex == nelems, this span has no free objects. + // If freeindex == nelems, this span has no free objects, though might have reusable objects. // // allocBits is a bitmap of objects in this span. // If n >= freeindex and allocBits[n/8] & (1<<(n%8)) is 0 @@ -2161,7 +2161,7 @@ func removefinalizer(p unsafe.Pointer) { type specialCleanup struct { _ sys.NotInHeap special special - fn *funcval + cleanup cleanupFn // Globally unique ID for the cleanup, obtained from mheap_.cleanupID. id uint64 } @@ -2170,14 +2170,18 @@ type specialCleanup struct { // cleanups are allowed on an object, and even the same pointer. // A cleanup id is returned which can be used to uniquely identify // the cleanup. 
-func addCleanup(p unsafe.Pointer, f *funcval) uint64 { +func addCleanup(p unsafe.Pointer, c cleanupFn) uint64 { + // TODO(mknyszek): Consider pooling specialCleanups on the P + // so we don't have to take the lock every time. Just locking + // is a considerable part of the cost of AddCleanup. This + // would also require reserving some cleanup IDs on the P. lock(&mheap_.speciallock) s := (*specialCleanup)(mheap_.specialCleanupAlloc.alloc()) mheap_.cleanupID++ // Increment first. ID 0 is reserved. id := mheap_.cleanupID unlock(&mheap_.speciallock) s.special.kind = _KindSpecialCleanup - s.fn = f + s.cleanup = c s.id = id mp := acquirem() @@ -2187,17 +2191,16 @@ func addCleanup(p unsafe.Pointer, f *funcval) uint64 { // situation where it's possible that markrootSpans // has already run but mark termination hasn't yet. if gcphase != _GCoff { - gcw := &mp.p.ptr().gcw // Mark the cleanup itself, since the // special isn't part of the GC'd heap. - scanblock(uintptr(unsafe.Pointer(&s.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil) + gcScanCleanup(s, &mp.p.ptr().gcw) } releasem(mp) - // Keep f alive. There's a window in this function where it's - // only reachable via the special while the special hasn't been - // added to the specials list yet. This is similar to a bug + // Keep c and its referents alive. There's a window in this function + // where it's only reachable via the special while the special hasn't + // been added to the specials list yet. This is similar to a bug // discovered for weak handles, see #70455. - KeepAlive(f) + KeepAlive(c) return id } @@ -2534,7 +2537,15 @@ func getOrAddWeakHandle(p unsafe.Pointer) *atomic.Uintptr { s := (*specialWeakHandle)(mheap_.specialWeakHandleAlloc.alloc()) unlock(&mheap_.speciallock) - handle := new(atomic.Uintptr) + // N.B. Pad the weak handle to ensure it doesn't share a tiny + // block with any other allocations. This can lead to leaks, such + // as in go.dev/issue/76007. 
As an alternative, we could consider + // using the currently-unused 8-byte noscan size class. + type weakHandleBox struct { + h atomic.Uintptr + _ [maxTinySize - unsafe.Sizeof(atomic.Uintptr{})]byte + } + handle := &(new(weakHandleBox).h) s.special.kind = _KindSpecialWeakHandle s.handle = handle handle.Store(uintptr(p)) @@ -2792,7 +2803,7 @@ func freeSpecial(s *special, p unsafe.Pointer, size uintptr) { // Cleanups, unlike finalizers, do not resurrect the objects // they're attached to, so we only need to pass the cleanup // function, not the object. - gcCleanups.enqueue(sc.fn) + gcCleanups.enqueue(sc.cleanup) lock(&mheap_.speciallock) mheap_.specialCleanupAlloc.free(unsafe.Pointer(sc)) unlock(&mheap_.speciallock) diff --git a/src/runtime/panic.go b/src/runtime/panic.go index e1105afd0f..ff2dec386f 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -746,7 +746,7 @@ func printpanics(p *_panic) { } print("panic: ") printpanicval(p.arg) - if p.repanicked { + if p.recovered && p.repanicked { print(" [recovered, repanicked]") } else if p.recovered { print(" [recovered]") diff --git a/src/runtime/print.go b/src/runtime/print.go index c01db9d7f9..d2733fb266 100644 --- a/src/runtime/print.go +++ b/src/runtime/print.go @@ -5,7 +5,6 @@ package runtime import ( - "internal/goarch" "internal/strconv" "unsafe" ) @@ -212,43 +211,3 @@ func printeface(e eface) { func printiface(i iface) { print("(", i.tab, ",", i.data, ")") } - -// hexdumpWords prints a word-oriented hex dump of [p, end). -// -// If mark != nil, it will be called with each printed word's address -// and should return a character mark to appear just before that -// word's value. It can return 0 to indicate no mark. 
-func hexdumpWords(p, end uintptr, mark func(uintptr) byte) { - printlock() - var markbuf [1]byte - markbuf[0] = ' ' - minhexdigits = int(unsafe.Sizeof(uintptr(0)) * 2) - for i := uintptr(0); p+i < end; i += goarch.PtrSize { - if i%16 == 0 { - if i != 0 { - println() - } - print(hex(p+i), ": ") - } - - if mark != nil { - markbuf[0] = mark(p + i) - if markbuf[0] == 0 { - markbuf[0] = ' ' - } - } - gwrite(markbuf[:]) - val := *(*uintptr)(unsafe.Pointer(p + i)) - print(hex(val)) - print(" ") - - // Can we symbolize val? - fn := findfunc(val) - if fn.valid() { - print("<", funcname(fn), "+", hex(val-fn.entry()), "> ") - } - } - minhexdigits = 0 - println() - printunlock() -} diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 21b276cabf..58fb4bd681 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -3120,7 +3120,7 @@ func startm(pp *p, spinning, lockheld bool) { //go:nowritebarrierrec func handoffp(pp *p) { // handoffp must start an M in any situation where - // findrunnable would return a G to run on pp. + // findRunnable would return a G to run on pp. // if it has local work, start it straight away if !runqempty(pp) || !sched.runq.empty() { @@ -3363,7 +3363,7 @@ func findRunnable() (gp *g, inheritTime, tryWakeP bool) { mp := getg().m // The conditions here and in handoffp must agree: if - // findrunnable would return a G to run, handoffp must start + // findRunnable would return a G to run, handoffp must start // an M. 
top: @@ -3587,7 +3587,7 @@ top: goto top } if releasep() != pp { - throw("findrunnable: wrong p") + throw("findRunnable: wrong p") } now = pidleput(pp, now) unlock(&sched.lock) @@ -3632,7 +3632,7 @@ top: if mp.spinning { mp.spinning = false if sched.nmspinning.Add(-1) < 0 { - throw("findrunnable: negative nmspinning") + throw("findRunnable: negative nmspinning") } // Note the for correctness, only the last M transitioning from @@ -3705,10 +3705,10 @@ top: if netpollinited() && (netpollAnyWaiters() || pollUntil != 0) && sched.lastpoll.Swap(0) != 0 { sched.pollUntil.Store(pollUntil) if mp.p != 0 { - throw("findrunnable: netpoll with p") + throw("findRunnable: netpoll with p") } if mp.spinning { - throw("findrunnable: netpoll with spinning") + throw("findRunnable: netpoll with spinning") } delay := int64(-1) if pollUntil != 0 { @@ -3974,7 +3974,7 @@ func checkIdleGCNoP() (*p, *g) { // timers and the network poller if there isn't one already. func wakeNetPoller(when int64) { if sched.lastpoll.Load() == 0 { - // In findrunnable we ensure that when polling the pollUntil + // In findRunnable we ensure that when polling the pollUntil // field is either zero or the time to which the current // poll is expected to run. This can have a spurious wakeup // but should never miss a wakeup. @@ -3999,7 +3999,7 @@ func resetspinning() { gp.m.spinning = false nmspinning := sched.nmspinning.Add(-1) if nmspinning < 0 { - throw("findrunnable: negative nmspinning") + throw("findRunnable: negative nmspinning") } // M wakeup policy is deliberately somewhat conservative, so check if we // need to wakeup another P here. See "Worker thread parking/unparking" @@ -4136,11 +4136,23 @@ top: gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available + // May be on a new P. + pp = mp.p.ptr() + // findRunnable may have collected an allp snapshot. The snapshot is // only required within findRunnable. Clear it to all GC to collect the // slice. 
mp.clearAllpSnapshot() + // If the P was assigned a next GC mark worker but findRunnable + // selected anything else, release the worker so another P may run it. + // + // N.B. If this occurs because a higher-priority goroutine was selected + // (trace reader), then tryWakeP is set, which will wake another P to + // run the worker. If this occurs because the GC is no longer active, + // there is no need to wakep. + gcController.releaseNextGCMarkWorker(pp) + if debug.dontfreezetheworld > 0 && freezing.Load() { // See comment in freezetheworld. We don't want to perturb // scheduler state, so we didn't gcstopm in findRunnable, but @@ -4659,6 +4671,11 @@ func reentersyscall(pc, sp, bp uintptr) { gp.m.locks-- } +// debugExtendGrunningNoP is a debug mode that extends the windows in which +// we're _Grunning without a P in order to try to shake out bugs with code +// assuming this state is impossible. +const debugExtendGrunningNoP = false + // Standard syscall entry used by the go syscall library and normal cgo calls. // // This is exported via linkname to assembly in the syscall package and x/sys. @@ -4771,6 +4788,9 @@ func entersyscallblock() { // <-- // Caution: we're in a small window where we are in _Grunning without a P. // --> + if debugExtendGrunningNoP { + usleep(10) + } casgstatus(gp, _Grunning, _Gsyscall) if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp { systemstack(func() { @@ -4853,6 +4873,9 @@ func exitsyscall() { // Caution: we're in a window where we may be in _Grunning without a P. // Either we will grab a P or call exitsyscall0, where we'll switch to // _Grunnable. + if debugExtendGrunningNoP { + usleep(10) + } // Grab and clear our old P. oldp := gp.m.oldp.ptr() @@ -6026,8 +6049,10 @@ func procresize(nprocs int32) *p { unlock(&allpLock) } + // Assign Ms to Ps with runnable goroutines. 
var runnablePs *p var runnablePsNeedM *p + var idlePs *p for i := nprocs - 1; i >= 0; i-- { pp := allp[i] if gp.m.p.ptr() == pp { @@ -6035,7 +6060,8 @@ func procresize(nprocs int32) *p { } pp.status = _Pidle if runqempty(pp) { - pidleput(pp, now) + pp.link.set(idlePs) + idlePs = pp continue } @@ -6061,6 +6087,8 @@ func procresize(nprocs int32) *p { pp.link.set(runnablePs) runnablePs = pp } + // Assign Ms to remaining runnable Ps without usable oldm. See comment + // above. for runnablePsNeedM != nil { pp := runnablePsNeedM runnablePsNeedM = pp.link.ptr() @@ -6071,6 +6099,62 @@ func procresize(nprocs int32) *p { runnablePs = pp } + // Now that we've assigned Ms to Ps with runnable goroutines, assign GC + // mark workers to remaining idle Ps, if needed. + // + // By assigning GC workers to Ps here, we slightly speed up starting + // the world, as we will start enough Ps to run all of the user + // goroutines and GC mark workers all at once, rather than using a + // sequence of wakep calls as each P's findRunnable realizes it needs + // to run a mark worker instead of a user goroutine. + // + // By assigning GC workers to Ps only _after_ previously-running Ps are + // assigned Ms, we ensure that goroutines previously running on a P + // continue to run on the same P, with GC mark workers preferring + // previously-idle Ps. This helps prevent goroutines from shuffling + // around too much across STW. + // + // N.B., if there aren't enough Ps left in idlePs for all of the GC + // mark workers, then findRunnable will still choose to run mark + // workers on Ps assigned above. + // + // N.B., we do this during any STW in the mark phase, not just the + // sweep termination STW that starts the mark phase. gcBgMarkWorker + // always preempts by removing itself from the P, so even unrelated + // STWs during the mark require that Ps reselect mark workers upon + // restart. 
+ if gcBlackenEnabled != 0 { + for idlePs != nil { + pp := idlePs + + ok, _ := gcController.assignWaitingGCWorker(pp, now) + if !ok { + // No more mark workers needed. + break + } + + // Got a worker, P is now runnable. + // + // mget may return nil if there aren't enough Ms, in + // which case startTheWorldWithSema will start one. + // + // N.B. findRunnableGCWorker will make the worker G + // itself runnable. + idlePs = pp.link.ptr() + mp := mget() + pp.m.set(mp) + pp.link.set(runnablePs) + runnablePs = pp + } + } + + // Finally, any remaining Ps are truly idle. + for idlePs != nil { + pp := idlePs + idlePs = pp.link.ptr() + pidleput(pp, now) + } + stealOrder.reset(uint32(nprocs)) var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32 atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs)) @@ -6173,6 +6257,10 @@ func releasepNoTrace() *p { print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n") throw("releasep: invalid p state") } + + // P must clear if nextGCMarkWorker if it stops. + gcController.releaseNextGCMarkWorker(pp) + gp.m.p = 0 pp.m = 0 pp.status = _Pidle @@ -7259,7 +7347,7 @@ func pidlegetSpinning(now int64) (*p, int64) { pp, now := pidleget(now) if pp == nil { - // See "Delicate dance" comment in findrunnable. We found work + // See "Delicate dance" comment in findRunnable. We found work // that we cannot take, we must synchronize with non-spinning // Ms that may be preparing to drop their P. sched.needspinning.Store(1) @@ -7497,23 +7585,36 @@ func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) // Try to steal from pp.runnext. if next := pp.runnext; next != 0 { if pp.status == _Prunning { - // Sleep to ensure that pp isn't about to run the g - // we are about to steal. - // The important use case here is when the g running - // on pp ready()s another g and then almost - // immediately blocks. 
Instead of stealing runnext - // in this window, back off to give pp a chance to - // schedule runnext. This will avoid thrashing gs - // between different Ps. - // A sync chan send/recv takes ~50ns as of time of - // writing, so 3us gives ~50x overshoot. - if !osHasLowResTimer { - usleep(3) - } else { - // On some platforms system timer granularity is - // 1-15ms, which is way too much for this - // optimization. So just yield. - osyield() + if mp := pp.m.ptr(); mp != nil { + if gp := mp.curg; gp == nil || readgstatus(gp)&^_Gscan != _Gsyscall { + // Sleep to ensure that pp isn't about to run the g + // we are about to steal. + // The important use case here is when the g running + // on pp ready()s another g and then almost + // immediately blocks. Instead of stealing runnext + // in this window, back off to give pp a chance to + // schedule runnext. This will avoid thrashing gs + // between different Ps. + // A sync chan send/recv takes ~50ns as of time of + // writing, so 3us gives ~50x overshoot. + // If curg is nil, we assume that the P is likely + // to be in the scheduler. If curg isn't nil and isn't + // in a syscall, then it's either running, waiting, or + // runnable. In this case we want to sleep because the + // P might either call into the scheduler soon (running), + // or already is (since we found a waiting or runnable + // goroutine hanging off of a running P, suggesting it + // either recently transitioned out of running, or will + // transition to running shortly). + if !osHasLowResTimer { + usleep(3) + } else { + // On some platforms system timer granularity is + // 1-15ms, which is way too much for this + // optimization. So just yield. 
+ osyield() + } + } } } if !pp.runnext.cas(next, 0) { diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go index b3084f4895..35a1aeab1f 100644 --- a/src/runtime/proc_test.go +++ b/src/runtime/proc_test.go @@ -1221,7 +1221,7 @@ func TestTraceSTW(t *testing.T) { var errors int for i := range runs { - err := runTestTracesSTW(t, i) + err := runTestTracesSTW(t, i, "TraceSTW", "stop-the-world (read mem stats)") if err != nil { t.Logf("Run %d failed: %v", i, err) errors++ @@ -1235,7 +1235,43 @@ func TestTraceSTW(t *testing.T) { } } -func runTestTracesSTW(t *testing.T, run int) (err error) { +// TestTraceGCSTW verifies that goroutines continue running on the same M and P +// after a GC STW. +func TestTraceGCSTW(t *testing.T) { + // Very similar to TestTraceSTW, but using a STW that starts the GC. + // When the GC starts, the background GC mark workers start running, + // which provide an additional source of disturbance to the scheduler. + // + // procresize assigns GC workers to previously-idle Ps to avoid + // changing what the previously-running Ps are doing. + + if testing.Short() { + t.Skip("skipping in -short mode") + } + + if runtime.NumCPU() < 8 { + t.Skip("This test sets GOMAXPROCS=8 and wants to avoid thread descheduling as much as possible. 
Skip on machines with less than 8 CPUs") + } + + const runs = 50 + + var errors int + for i := range runs { + err := runTestTracesSTW(t, i, "TraceGCSTW", "stop-the-world (GC sweep termination)") + if err != nil { + t.Logf("Run %d failed: %v", i, err) + errors++ + } + } + + pct := float64(errors)/float64(runs) + t.Logf("Errors: %d/%d = %f%%", errors, runs, 100*pct) + if pct > 0.25 { + t.Errorf("Error rate too high") + } +} + +func runTestTracesSTW(t *testing.T, run int, name, stwType string) (err error) { t.Logf("Run %d", run) // By default, TSAN sleeps for 1s at exit to allow background @@ -1243,7 +1279,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) { // much, since we are running 50 iterations, so disable the sleep. // // Outside of race mode, GORACE does nothing. - buf := []byte(runTestProg(t, "testprog", "TraceSTW", "GORACE=atexit_sleep_ms=0")) + buf := []byte(runTestProg(t, "testprog", name, "GORACE=atexit_sleep_ms=0")) // We locally "fail" the run (return an error) if the trace exhibits // unwanted scheduling. i.e., the target goroutines did not remain on @@ -1253,7 +1289,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) { // occur, such as a trace parse error. defer func() { if err != nil || t.Failed() { - testtrace.Dump(t, fmt.Sprintf("TestTraceSTW-run%d", run), []byte(buf), false) + testtrace.Dump(t, fmt.Sprintf("Test%s-run%d", name, run), []byte(buf), false) } }() @@ -1509,12 +1545,10 @@ findEnd: break findEnd case trace.EventRangeBegin: r := ev.Range() - if r.Name == "stop-the-world (read mem stats)" { + if r.Name == stwType { // Note when we see the STW begin. This is not // load bearing; it's purpose is simply to fail - // the test if we manage to remove the STW from - // ReadMemStat, so we remember to change this - // test to add some new source of STW. + // the test if we accidentally remove the STW. 
stwSeen = true } } diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 6c955460d4..56082bf7f5 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -854,6 +854,18 @@ type p struct { // mark worker started. gcMarkWorkerStartTime int64 + // nextGCMarkWorker is the next mark worker to run. This may be set + // during start-the-world to assign a worker to this P. The P runs this + // worker on the next call to gcController.findRunnableGCWorker. If the + // P runs something else or stops, it must release this worker via + // gcController.releaseNextGCMarkWorker. + // + // See comment in gcBgMarkWorker about the lifetime of + // gcBgMarkWorkerNode. + // + // Only accessed by this P or during STW. + nextGCMarkWorker *gcBgMarkWorkerNode + // gcw is this P's GC work buffer cache. The work buffer is // filled by write barriers, drained by mutator assists, and // disposed on certain GC state transitions. @@ -1425,9 +1437,9 @@ var ( // must be set. An idle P (passed to pidleput) cannot add new timers while // idle, so if it has no timers at that time, its mask may be cleared. // - // Thus, we get the following effects on timer-stealing in findrunnable: + // Thus, we get the following effects on timer-stealing in findRunnable: // - // - Idle Ps with no timers when they go idle are never checked in findrunnable + // - Idle Ps with no timers when they go idle are never checked in findRunnable // (for work- or timer-stealing; this is the ideal case). // - Running Ps must always be checked. // - Idle Ps whose timers are stolen must continue to be checked until they run diff --git a/src/runtime/slice.go b/src/runtime/slice.go index e31d5dccb2..a9e8fc1610 100644 --- a/src/runtime/slice.go +++ b/src/runtime/slice.go @@ -399,3 +399,107 @@ func bytealg_MakeNoZero(len int) []byte { cap := roundupsize(uintptr(len), true) return unsafe.Slice((*byte)(mallocgc(cap, nil, false)), cap)[:len] } + +// moveSlice copies the input slice to the heap and returns it. 
+// et is the element type of the slice. +func moveSlice(et *_type, old unsafe.Pointer, len, cap int) (unsafe.Pointer, int, int) { + if cap == 0 { + if old != nil { + old = unsafe.Pointer(&zerobase) + } + return old, 0, 0 + } + capmem := uintptr(cap) * et.Size_ + new := mallocgc(capmem, et, true) + bulkBarrierPreWriteSrcOnly(uintptr(new), uintptr(old), capmem, et) + memmove(new, old, capmem) + return new, len, cap +} + +// moveSliceNoScan is like moveSlice except the element type is known to +// not have any pointers. We instead pass in the size of the element. +func moveSliceNoScan(elemSize uintptr, old unsafe.Pointer, len, cap int) (unsafe.Pointer, int, int) { + if cap == 0 { + if old != nil { + old = unsafe.Pointer(&zerobase) + } + return old, 0, 0 + } + capmem := uintptr(cap) * elemSize + new := mallocgc(capmem, nil, false) + memmove(new, old, capmem) + return new, len, cap +} + +// moveSliceNoCap is like moveSlice, but can pick any appropriate capacity +// for the returned slice. +// Elements between len and cap in the returned slice will be zeroed. +func moveSliceNoCap(et *_type, old unsafe.Pointer, len int) (unsafe.Pointer, int, int) { + if len == 0 { + if old != nil { + old = unsafe.Pointer(&zerobase) + } + return old, 0, 0 + } + lenmem := uintptr(len) * et.Size_ + capmem := roundupsize(lenmem, false) + new := mallocgc(capmem, et, true) + bulkBarrierPreWriteSrcOnly(uintptr(new), uintptr(old), lenmem, et) + memmove(new, old, lenmem) + return new, len, int(capmem / et.Size_) +} + +// moveSliceNoCapNoScan is a combination of moveSliceNoScan and moveSliceNoCap. 
+func moveSliceNoCapNoScan(elemSize uintptr, old unsafe.Pointer, len int) (unsafe.Pointer, int, int) { + if len == 0 { + if old != nil { + old = unsafe.Pointer(&zerobase) + } + return old, 0, 0 + } + lenmem := uintptr(len) * elemSize + capmem := roundupsize(lenmem, true) + new := mallocgc(capmem, nil, false) + memmove(new, old, lenmem) + if capmem > lenmem { + memclrNoHeapPointers(add(new, lenmem), capmem-lenmem) + } + return new, len, int(capmem / elemSize) +} + +// growsliceBuf is like growslice, but we can use the given buffer +// as a backing store if we want. bufPtr must be on the stack. +func growsliceBuf(oldPtr unsafe.Pointer, newLen, oldCap, num int, et *_type, bufPtr unsafe.Pointer, bufLen int) slice { + if newLen > bufLen { + // Doesn't fit, process like a normal growslice. + return growslice(oldPtr, newLen, oldCap, num, et) + } + oldLen := newLen - num + if oldPtr != bufPtr && oldLen != 0 { + // Move data to start of buffer. + // Note: bufPtr is on the stack, so no write barrier needed. + memmove(bufPtr, oldPtr, uintptr(oldLen)*et.Size_) + } + // Pick a new capacity. + // + // Unlike growslice, we don't need to double the size each time. + // The work done here is not proportional to the length of the slice. + // (Unless the memmove happens above, but that is rare, and in any + // case there are not many elements on this path.) + // + // Instead, we try to just bump up to the next size class. + // This will ensure that we don't waste any space when we eventually + // call moveSlice with the resulting slice. + newCap := int(roundupsize(uintptr(newLen)*et.Size_, !et.Pointers()) / et.Size_) + + // Zero slice beyond newLen. + // The buffer is stack memory, so NoHeapPointers is ok. + // Caller will overwrite [oldLen:newLen], so we don't need to zero that portion. + // If et.Pointers(), buffer is at least initialized so we don't need to + // worry about the caller overwriting junk in [oldLen:newLen]. 
+ if newLen < newCap { + memclrNoHeapPointers(add(bufPtr, uintptr(newLen)*et.Size_), uintptr(newCap-newLen)*et.Size_) + } + + return slice{bufPtr, newLen, newCap} +} diff --git a/src/runtime/slice_test.go b/src/runtime/slice_test.go index cd2bc26d1e..5463b6c02f 100644 --- a/src/runtime/slice_test.go +++ b/src/runtime/slice_test.go @@ -6,6 +6,9 @@ package runtime_test import ( "fmt" + "internal/race" + "internal/testenv" + "runtime" "testing" ) @@ -499,3 +502,319 @@ func BenchmarkAppendInPlace(b *testing.B) { }) } + +//go:noinline +func byteSlice(n int) []byte { + var r []byte + for i := range n { + r = append(r, byte(i)) + } + return r +} +func TestAppendByteInLoop(t *testing.T) { + testenv.SkipIfOptimizationOff(t) + if race.Enabled { + t.Skip("skipping in -race mode") + } + for _, test := range [][3]int{ + {0, 0, 0}, + {1, 1, 8}, + {2, 1, 8}, + {8, 1, 8}, + {9, 1, 16}, + {16, 1, 16}, + {17, 1, 24}, + {24, 1, 24}, + {25, 1, 32}, + {32, 1, 32}, + {33, 1, 64}, // If we up the stack buffer size from 32->64, this line and the next would become 48. + {48, 1, 64}, + {49, 1, 64}, + {64, 1, 64}, + {65, 2, 128}, + } { + n := test[0] + want := test[1] + wantCap := test[2] + var r []byte + got := testing.AllocsPerRun(10, func() { + r = byteSlice(n) + }) + if got != float64(want) { + t.Errorf("for size %d, got %f allocs want %d", n, got, want) + } + if cap(r) != wantCap { + t.Errorf("for size %d, got capacity %d want %d", n, cap(r), wantCap) + } + } +} + +//go:noinline +func ptrSlice(n int, p *[]*byte) { + var r []*byte + for range n { + r = append(r, nil) + } + *p = r +} +func TestAppendPtrInLoop(t *testing.T) { + testenv.SkipIfOptimizationOff(t) + if race.Enabled { + t.Skip("skipping in -race mode") + } + var tests [][3]int + if runtime.PtrSize == 8 { + tests = [][3]int{ + {0, 0, 0}, + {1, 1, 1}, + {2, 1, 2}, + {3, 1, 3}, // This is the interesting case, allocates 24 bytes when before it was 32. 
+ {4, 1, 4}, + {5, 1, 8}, + {6, 1, 8}, + {7, 1, 8}, + {8, 1, 8}, + {9, 2, 16}, + } + } else { + tests = [][3]int{ + {0, 0, 0}, + {1, 1, 2}, + {2, 1, 2}, + {3, 1, 4}, + {4, 1, 4}, + {5, 1, 6}, // These two are also 24 bytes instead of 32. + {6, 1, 6}, // + {7, 1, 8}, + {8, 1, 8}, + {9, 1, 16}, + {10, 1, 16}, + {11, 1, 16}, + {12, 1, 16}, + {13, 1, 16}, + {14, 1, 16}, + {15, 1, 16}, + {16, 1, 16}, + {17, 2, 32}, + } + } + for _, test := range tests { + n := test[0] + want := test[1] + wantCap := test[2] + var r []*byte + got := testing.AllocsPerRun(10, func() { + ptrSlice(n, &r) + }) + if got != float64(want) { + t.Errorf("for size %d, got %f allocs want %d", n, got, want) + } + if cap(r) != wantCap { + t.Errorf("for size %d, got capacity %d want %d", n, cap(r), wantCap) + } + } +} + +//go:noinline +func byteCapSlice(n int) ([]byte, int) { + var r []byte + for i := range n { + r = append(r, byte(i)) + } + return r, cap(r) +} +func TestAppendByteCapInLoop(t *testing.T) { + testenv.SkipIfOptimizationOff(t) + if race.Enabled { + t.Skip("skipping in -race mode") + } + for _, test := range [][3]int{ + {0, 0, 0}, + {1, 1, 8}, + {2, 1, 8}, + {8, 1, 8}, + {9, 1, 16}, + {16, 1, 16}, + {17, 1, 24}, + {24, 1, 24}, + {25, 1, 32}, + {32, 1, 32}, + {33, 1, 64}, + {48, 1, 64}, + {49, 1, 64}, + {64, 1, 64}, + {65, 2, 128}, + } { + n := test[0] + want := test[1] + wantCap := test[2] + var r []byte + got := testing.AllocsPerRun(10, func() { + r, _ = byteCapSlice(n) + }) + if got != float64(want) { + t.Errorf("for size %d, got %f allocs want %d", n, got, want) + } + if cap(r) != wantCap { + t.Errorf("for size %d, got capacity %d want %d", n, cap(r), wantCap) + } + } +} + +func TestAppendGeneric(t *testing.T) { + type I *int + r := testAppendGeneric[I](100) + if len(r) != 100 { + t.Errorf("bad length") + } +} + +//go:noinline +func testAppendGeneric[E any](n int) []E { + var r []E + var z E + for range n { + r = append(r, z) + } + return r +} + +func appendSomeBytes(r []byte, s []byte) 
[]byte { + for _, b := range s { + r = append(r, b) + } + return r +} + +func TestAppendOfArg(t *testing.T) { + r := make([]byte, 24) + for i := 0; i < 24; i++ { + r[i] = byte(i) + } + appendSomeBytes(r, []byte{25, 26, 27}) + // Do the same thing, trying to overwrite any + // stack-allocated buffers used above. + s := make([]byte, 24) + for i := 0; i < 24; i++ { + s[i] = 99 + } + appendSomeBytes(s, []byte{99, 99, 99}) + // Check that we still have the right data. + for i, b := range r { + if b != byte(i) { + t.Errorf("r[%d]=%d, want %d", i, b, byte(i)) + } + } + +} + +func BenchmarkAppendInLoop(b *testing.B) { + for _, size := range []int{0, 1, 8, 16, 32, 64, 128} { + b.Run(fmt.Sprintf("%d", size), + func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + byteSlice(size) + } + }) + } +} + +func TestMoveToHeapEarly(t *testing.T) { + // Just checking that this compiles. + var x []int + y := x // causes a move2heap in the entry block + for range 5 { + x = append(x, 5) + } + _ = y +} + +func TestMoveToHeapCap(t *testing.T) { + var c int + r := func() []byte { + var s []byte + for i := range 10 { + s = append(s, byte(i)) + } + c = cap(s) + return s + }() + if c != cap(r) { + t.Errorf("got cap=%d, want %d", c, cap(r)) + } + sinkSlice = r +} + +//go:noinline +func runit(f func()) { + f() +} + +func TestMoveToHeapClosure1(t *testing.T) { + var c int + r := func() []byte { + var s []byte + for i := range 10 { + s = append(s, byte(i)) + } + runit(func() { + c = cap(s) + }) + return s + }() + if c != cap(r) { + t.Errorf("got cap=%d, want %d", c, cap(r)) + } + sinkSlice = r +} +func TestMoveToHeapClosure2(t *testing.T) { + var c int + r := func() []byte { + var s []byte + for i := range 10 { + s = append(s, byte(i)) + } + c = func() int { + return cap(s) + }() + return s + }() + if c != cap(r) { + t.Errorf("got cap=%d, want %d", c, cap(r)) + } + sinkSlice = r +} + +//go:noinline +func buildClosure(t *testing.T) ([]byte, func()) { + var s []byte + for i := range 20 { + s = 
append(s, byte(i)) + } + c := func() { + for i, b := range s { + if b != byte(i) { + t.Errorf("s[%d]=%d, want %d", i, b, i) + } + } + } + return s, c +} + +func TestMoveToHeapClosure3(t *testing.T) { + _, f := buildClosure(t) + overwriteStack(0) + f() +} + +//go:noinline +func overwriteStack(n int) uint64 { + var x [100]uint64 + for i := range x { + x[i] = 0xabcdabcdabcdabcd + } + return x[n] +} + +var sinkSlice []byte diff --git a/src/runtime/sys_riscv64.go b/src/runtime/sys_riscv64.go index e710840819..65dc684c33 100644 --- a/src/runtime/sys_riscv64.go +++ b/src/runtime/sys_riscv64.go @@ -4,7 +4,12 @@ package runtime -import "unsafe" +import ( + "unsafe" + + "internal/abi" + "internal/runtime/sys" +) // adjust Gobuf as if it executed a call to fn with context ctxt // and then did an immediate Gosave. @@ -12,7 +17,9 @@ func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) { if buf.lr != 0 { throw("invalid use of gostartcall") } - buf.lr = buf.pc + // Use double the PC quantum on riscv64, so that we retain + // four byte alignment and use non-compressed instructions. + buf.lr = abi.FuncPCABI0(goexit) + sys.PCQuantum*2 buf.pc = uintptr(fn) buf.ctxt = ctxt } diff --git a/src/runtime/testdata/testprog/crash.go b/src/runtime/testdata/testprog/crash.go index 556215a71e..fcce388871 100644 --- a/src/runtime/testdata/testprog/crash.go +++ b/src/runtime/testdata/testprog/crash.go @@ -22,6 +22,7 @@ func init() { register("RepanickedPanic", RepanickedPanic) register("RepanickedMiddlePanic", RepanickedMiddlePanic) register("RepanickedPanicSandwich", RepanickedPanicSandwich) + register("DoublePanicWithSameValue", DoublePanicWithSameValue) } func test(name string) { @@ -189,3 +190,13 @@ func RepanickedPanicSandwich() { panic("outer") }() } + +// Double panic with same value and not recovered. +// See issue 76099. 
+func DoublePanicWithSameValue() { + var e any = "message" + defer func() { + panic(e) + }() + panic(e) +} diff --git a/src/runtime/testdata/testprog/gc.go b/src/runtime/testdata/testprog/gc.go index bbe1453401..32e2c5e1b4 100644 --- a/src/runtime/testdata/testprog/gc.go +++ b/src/runtime/testdata/testprog/gc.go @@ -396,7 +396,7 @@ func gcMemoryLimit(gcPercent int) { // should do considerably better than this bound. bound := int64(myLimit + 16<<20) if runtime.GOOS == "darwin" { - bound += 16 << 20 // Be more lax on Darwin, see issue 73136. + bound += 24 << 20 // Be more lax on Darwin, see issue 73136. } start := time.Now() for time.Since(start) < 200*time.Millisecond { diff --git a/src/runtime/testdata/testprog/stw_trace.go b/src/runtime/testdata/testprog/stw_trace.go index 0fed55b875..0fa15da09e 100644 --- a/src/runtime/testdata/testprog/stw_trace.go +++ b/src/runtime/testdata/testprog/stw_trace.go @@ -7,15 +7,18 @@ package main import ( "context" "log" + "math/rand/v2" "os" "runtime" "runtime/debug" + "runtime/metrics" "runtime/trace" "sync/atomic" ) func init() { register("TraceSTW", TraceSTW) + register("TraceGCSTW", TraceGCSTW) } // The parent writes to ping and waits for the children to write back @@ -53,7 +56,7 @@ func TraceSTW() { // https://go.dev/issue/65694). Alternatively, we could just ignore the // trace if the GC runs. runtime.GOMAXPROCS(4) - debug.SetGCPercent(0) + debug.SetGCPercent(-1) if err := trace.Start(os.Stdout); err != nil { log.Fatalf("failed to start tracing: %v", err) @@ -86,6 +89,112 @@ func TraceSTW() { stop.Store(true) } +// Variant of TraceSTW for GC STWs. We want the GC mark workers to start on +// previously-idle Ps, rather than bumping the current P. +func TraceGCSTW() { + ctx := context.Background() + + // The idea here is to have 2 target goroutines that are constantly + // running. When the world restarts after STW, we expect these + // goroutines to continue execution on the same M and P. 
+ // + // Set GOMAXPROCS=8 to make room for the 2 target goroutines, 1 parent, + // 2 dedicated workers, and a bit of slack. + // + // Disable the GC initially so we can be sure it only triggers once we + // are ready. + runtime.GOMAXPROCS(8) + debug.SetGCPercent(-1) + + if err := trace.Start(os.Stdout); err != nil { + log.Fatalf("failed to start tracing: %v", err) + } + defer trace.Stop() + + for i := range 2 { + go traceSTWTarget(i) + } + + // Wait for children to start running. + ping.Store(1) + for pong[0].Load() != 1 {} + for pong[1].Load() != 1 {} + + trace.Log(ctx, "TraceSTW", "start") + + // STW + triggerGC() + + // Make sure to run long enough for the children to schedule again + // after STW. This is included for good measure, but the goroutines + // really ought to have already scheduled since the entire GC + // completed. + ping.Store(2) + for pong[0].Load() != 2 {} + for pong[1].Load() != 2 {} + + trace.Log(ctx, "TraceSTW", "end") + + stop.Store(true) +} + +func triggerGC() { + // Allocate a bunch to trigger the GC rather than using runtime.GC. The + // latter blocks until the GC is complete, which is convenient, but + // messes with scheduling as it gives this P a chance to steal the + // other goroutines before their Ps get up and running again. + + // Bring heap size up prior to enabling the GC to ensure that there is + // a decent amount of work in case the GC triggers immediately upon + // re-enabling. + for range 1000 { + alloc() + } + + sample := make([]metrics.Sample, 1) + sample[0].Name = "/gc/cycles/total:gc-cycles" + metrics.Read(sample) + + start := sample[0].Value.Uint64() + + debug.SetGCPercent(100) + + // Keep allocating until the GC is complete. We really only need to + // continue until the mark workers are scheduled, but there isn't a + // good way to measure that. 
+ for { + metrics.Read(sample) + if sample[0].Value.Uint64() != start { + return + } + + alloc() + } +} + +// Allocate a tree data structure to generate plenty of scan work for the GC. + +type node struct { + children []*node +} + +var gcSink node + +func alloc() { + // 10% chance of adding a node a each layer. + + curr := &gcSink + for { + if len(curr.children) == 0 || rand.Float32() < 0.1 { + curr.children = append(curr.children, new(node)) + return + } + + i := rand.IntN(len(curr.children)) + curr = curr.children[i] + } +} + // Manually insert a morestack call. Leaf functions can omit morestack, but // non-leaf functions should include them. diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index 6649f72471..74aaeba876 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -1366,16 +1366,19 @@ func tracebackHexdump(stk stack, frame *stkframe, bad uintptr) { // Print the hex dump. print("stack: frame={sp:", hex(frame.sp), ", fp:", hex(frame.fp), "} stack=[", hex(stk.lo), ",", hex(stk.hi), ")\n") - hexdumpWords(lo, hi, func(p uintptr) byte { - switch p { - case frame.fp: - return '>' - case frame.sp: - return '<' - case bad: - return '!' + hexdumpWords(lo, hi-lo, func(p uintptr, m hexdumpMarker) { + if p == frame.fp { + m.start() + println("FP") + } + if p == frame.sp { + m.start() + println("SP") + } + if p == bad { + m.start() + println("bad") } - return 0 }) } |
