Diffstat (limited to 'src/runtime/malloc_stubs.go')
 src/runtime/malloc_stubs.go | 586 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 586 insertions(+), 0 deletions(-)
diff --git a/src/runtime/malloc_stubs.go b/src/runtime/malloc_stubs.go
new file mode 100644
index 0000000000..7fd1444189
--- /dev/null
+++ b/src/runtime/malloc_stubs.go
@@ -0,0 +1,586 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains stub functions that are not meant to be called directly,
+// but that will be assembled using the inlining logic in runtime/_mkmalloc
+// to produce a full mallocgc function specialized for a span class or, in
+// the case of the tiny allocator, for a specific size.
+//
+// To assemble a mallocgc function, the mallocStub function is cloned, and the call to
+// inlinedMalloc is replaced with the inlined body of smallScanNoHeaderStub,
+// smallNoScanStub, or tinyStub, depending on the parameters being specialized.
+//
+// The size_ (tiny case only) and elemsize_, sizeclass_, and noscanint_ (all three
+// cases) identifiers are replaced with the concrete values being specialized for.
+// The nextFreeFastStub, nextFreeFastTiny, heapSetTypeNoHeaderStub, and writeHeapBitsSmallStub
+// functions are also inlined by _mkmalloc.
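+//
+// For example (an illustrative sketch, not actual _mkmalloc output, and with a
+// hypothetical name), a specialization for the 16-byte noscan size class might
+// conceptually look like:
+//
+//	func mallocgcSmallNoScanSizeClass2(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
+//		// mallocStub's body, with the inlinedMalloc call replaced by the
+//		// inlined body of smallNoScanStub, and with sizeclass_ -> 2,
+//		// elemsize_ -> 16, and noscanint_ -> 1 substituted throughout.
+//	}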
+
+package runtime
+
+import (
+ "internal/goarch"
+ "internal/runtime/sys"
+ "unsafe"
+)
+
+// These identifiers will all be replaced by the inliner, so their values don't
+// really matter: they just need to be set so that the stub functions, which
+// will never be used on their own, can compile. elemsize_ can't be set to
+// zero because we divide by it in nextFreeFastTiny, and the compiler would
+// complain about a division by zero. Its replacement value is always greater
+// than zero.
+const elemsize_ = 8
+const sizeclass_ = 0
+const noscanint_ = 0
+const size_ = 0
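+
+// For example (hypothetical values): specializing for the 16-byte noscan size
+// class substitutes elemsize_ = 16, sizeclass_ = 2, and noscanint_ = 1, while
+// a tiny specialization for 4-byte requests would also substitute size_ = 4
+// (with elemsize_ presumably 16, the tiny block size).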
+
+func malloc0(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
+ if doubleCheckMalloc {
+ if gcphase == _GCmarktermination {
+ throw("mallocgc called with gcphase == _GCmarktermination")
+ }
+ }
+
+ // Short-circuit zero-sized allocation requests.
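+	// Every zero-sized allocation returns the same address, &zerobase.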
+ return unsafe.Pointer(&zerobase)
+}
+
+func mallocPanic(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
+ panic("not defined for sizeclass")
+}
+
+func mallocStub(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
+ if doubleCheckMalloc {
+ if gcphase == _GCmarktermination {
+ throw("mallocgc called with gcphase == _GCmarktermination")
+ }
+ }
+
+ // It's possible for any malloc to trigger sweeping, which may in
+ // turn queue finalizers. Record this dynamic lock edge.
+ // N.B. Compiled away if lockrank experiment is not enabled.
+ lockRankMayQueueFinalizer()
+
+ // Pre-malloc debug hooks.
+ if debug.malloc {
+ if x := preMallocgcDebug(size, typ); x != nil {
+ return x
+ }
+ }
+
+ // Assist the GC if needed.
+ if gcBlackenEnabled != 0 {
+ deductAssistCredit(size)
+ }
+
+ // Actually do the allocation.
+ x, elemsize := inlinedMalloc(size, typ, needzero)
+
+ // Adjust our GC assist debt to account for internal fragmentation.
+ if gcBlackenEnabled != 0 && elemsize != 0 {
+ if assistG := getg().m.curg; assistG != nil {
+ assistG.gcAssistBytes -= int64(elemsize - size)
+ }
+ }
+
+ // Post-malloc debug hooks.
+ if debug.malloc {
+ postMallocgcDebug(x, elemsize, typ)
+ }
+ return x
+}
+
+// inlinedMalloc is never called. It is defined just so that the compiler can compile
+// the mallocStub function, which is also never called directly, but is instead used as
+// a template to generate a size-specialized malloc function. The call to inlinedMalloc
+// in mallocStub is replaced with the inlined body of smallScanNoHeaderStub,
+// smallNoScanStub, or tinyStub when generating the size-specialized malloc function.
+// See the comment at the top of this file for more information.
+func inlinedMalloc(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
+ return unsafe.Pointer(uintptr(0)), 0
+}
+
+func doubleCheckSmallScanNoHeader(size uintptr, typ *_type, mp *m) {
+ if mp.mallocing != 0 {
+ throw("malloc deadlock")
+ }
+ if mp.gsignal == getg() {
+ throw("malloc during signal")
+ }
+ if typ == nil || !typ.Pointers() {
+ throw("noscan allocated in scan-only path")
+ }
+ if !heapBitsInSpan(size) {
+		throw("heap bits not in span for no-header path")
+ }
+}
+
+func smallScanNoHeaderStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
+ const sizeclass = sizeclass_
+ const elemsize = elemsize_
+
+ // Set mp.mallocing to keep from being preempted by GC.
+ mp := acquirem()
+ if doubleCheckMalloc {
+ doubleCheckSmallScanNoHeader(size, typ, mp)
+ }
+ mp.mallocing = 1
+
+ checkGCTrigger := false
+ c := getMCache(mp)
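+	// A span class packs the size class and the noscan bit as (sizeclass << 1) | noscan;
+	// e.g. size class 2 with noscanint_ == 0 (scannable) yields spc == 4.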
+ const spc = spanClass(sizeclass<<1) | spanClass(noscanint_)
+ span := c.alloc[spc]
+ v := nextFreeFastStub(span)
+ if v == 0 {
+ v, span, checkGCTrigger = c.nextFree(spc)
+ }
+ x := unsafe.Pointer(v)
+ if span.needzero != 0 {
+ memclrNoHeapPointers(x, elemsize)
+ }
+ if goarch.PtrSize == 8 && sizeclass == 1 {
+ // initHeapBits already set the pointer bits for the 8-byte sizeclass
+ // on 64-bit platforms.
+ c.scanAlloc += 8
+ } else {
+ dataSize := size // make the inliner happy
+ x := uintptr(x)
+ scanSize := heapSetTypeNoHeaderStub(x, dataSize, typ, span)
+ c.scanAlloc += scanSize
+ }
+
+ // Ensure that the stores above that initialize x to
+ // type-safe memory and set the heap bits occur before
+ // the caller can make x observable to the garbage
+ // collector. Otherwise, on weakly ordered machines,
+ // the garbage collector could follow a pointer to x,
+ // but see uninitialized memory or stale heap bits.
+ publicationBarrier()
+
+ if writeBarrier.enabled {
+ // Allocate black during GC.
+ // All slots hold nil so no scanning is needed.
+ // This may be racing with GC so do it atomically if there can be
+ // a race marking the bit.
+ gcmarknewobject(span, uintptr(x))
+ } else {
+ // Track the last free index before the mark phase. This field
+ // is only used by the garbage collector. During the mark phase
+ // this is used by the conservative scanner to filter out objects
+ // that are both free and recently-allocated. It's safe to do that
+ // because we allocate-black if the GC is enabled. The conservative
+ // scanner produces pointers out of thin air, so without additional
+ // synchronization it might otherwise observe a partially-initialized
+ // object, which could crash the program.
+ span.freeIndexForScan = span.freeindex
+ }
+
+ // Note cache c only valid while m acquired; see #47302
+ //
+ // N.B. Use the full size because that matches how the GC
+ // will update the mem profile on the "free" side.
+ //
+ // TODO(mknyszek): We should really count the header as part
+ // of gc_sys or something. The code below just pretends it is
+ // internal fragmentation and matches the GC's accounting by
+ // using the whole allocation slot.
+ c.nextSample -= int64(elemsize)
+ if c.nextSample < 0 || MemProfileRate != c.memProfRate {
+ profilealloc(mp, x, elemsize)
+ }
+ mp.mallocing = 0
+ releasem(mp)
+
+ if checkGCTrigger {
+ if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
+ gcStart(t)
+ }
+ }
+
+ return x, elemsize
+}
+
+func doubleCheckSmallNoScan(typ *_type, mp *m) {
+ if mp.mallocing != 0 {
+ throw("malloc deadlock")
+ }
+ if mp.gsignal == getg() {
+ throw("malloc during signal")
+ }
+ if typ != nil && typ.Pointers() {
+ throw("expected noscan type for noscan alloc")
+ }
+}
+
+func smallNoScanStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
+	// TODO(matloob): Add functionality to mkmalloc to allow inlining non-constant
+	// sizeclass_ and elemsize_ values (instead setting them to the expressions that
+	// look up the size class and elemsize). We'd also need to teach mkmalloc that
+	// values derived from these (specifically spc below) should become vars. That
+	// would allow us to generate mallocgcSmallNoScan itself, so that its code can't
+	// diverge from the generated functions.
+ const sizeclass = sizeclass_
+ const elemsize = elemsize_
+
+ // Set mp.mallocing to keep from being preempted by GC.
+ mp := acquirem()
+ if doubleCheckMalloc {
+ doubleCheckSmallNoScan(typ, mp)
+ }
+ mp.mallocing = 1
+
+ checkGCTrigger := false
+ c := getMCache(mp)
+ const spc = spanClass(sizeclass<<1) | spanClass(noscanint_)
+ span := c.alloc[spc]
+ v := nextFreeFastStub(span)
+ if v == 0 {
+ v, span, checkGCTrigger = c.nextFree(spc)
+ }
+ x := unsafe.Pointer(v)
+ if needzero && span.needzero != 0 {
+ memclrNoHeapPointers(x, elemsize)
+ }
+
+ // Ensure that the stores above that initialize x to
+ // type-safe memory and set the heap bits occur before
+ // the caller can make x observable to the garbage
+ // collector. Otherwise, on weakly ordered machines,
+ // the garbage collector could follow a pointer to x,
+ // but see uninitialized memory or stale heap bits.
+ publicationBarrier()
+
+ if writeBarrier.enabled {
+ // Allocate black during GC.
+ // All slots hold nil so no scanning is needed.
+ // This may be racing with GC so do it atomically if there can be
+ // a race marking the bit.
+ gcmarknewobject(span, uintptr(x))
+ } else {
+ // Track the last free index before the mark phase. This field
+ // is only used by the garbage collector. During the mark phase
+ // this is used by the conservative scanner to filter out objects
+ // that are both free and recently-allocated. It's safe to do that
+ // because we allocate-black if the GC is enabled. The conservative
+ // scanner produces pointers out of thin air, so without additional
+ // synchronization it might otherwise observe a partially-initialized
+ // object, which could crash the program.
+ span.freeIndexForScan = span.freeindex
+ }
+
+ // Note cache c only valid while m acquired; see #47302
+ //
+ // N.B. Use the full size because that matches how the GC
+ // will update the mem profile on the "free" side.
+ //
+ // TODO(mknyszek): We should really count the header as part
+ // of gc_sys or something. The code below just pretends it is
+ // internal fragmentation and matches the GC's accounting by
+ // using the whole allocation slot.
+ c.nextSample -= int64(elemsize)
+ if c.nextSample < 0 || MemProfileRate != c.memProfRate {
+ profilealloc(mp, x, elemsize)
+ }
+ mp.mallocing = 0
+ releasem(mp)
+
+ if checkGCTrigger {
+ if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
+ gcStart(t)
+ }
+ }
+ return x, elemsize
+}
+
+func doubleCheckTiny(size uintptr, typ *_type, mp *m) {
+ if mp.mallocing != 0 {
+ throw("malloc deadlock")
+ }
+ if mp.gsignal == getg() {
+ throw("malloc during signal")
+ }
+ if typ != nil && typ.Pointers() {
+ throw("expected noscan for tiny alloc")
+ }
+}
+
+func tinyStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
+ const constsize = size_
+ const elemsize = elemsize_
+
+ // Set mp.mallocing to keep from being preempted by GC.
+ mp := acquirem()
+ if doubleCheckMalloc {
+ doubleCheckTiny(constsize, typ, mp)
+ }
+ mp.mallocing = 1
+
+ // Tiny allocator.
+ //
+	// The tiny allocator combines several tiny allocation requests
+	// into a single memory block. The resulting memory block
+	// is freed when all subobjects are unreachable. The subobjects
+	// must be noscan (have no pointers); this ensures that
+	// the amount of potentially wasted memory is bounded.
+	//
+	// The size of the memory block used for combining (maxTinySize) is tunable.
+	// The current setting is 16 bytes, which corresponds to 2x worst-case memory
+	// wastage (when all but one of the subobjects are unreachable).
+	// 8 bytes would result in no wastage at all, but provides fewer
+	// opportunities for combining.
+	// 32 bytes provides more opportunities for combining,
+	// but can lead to 4x worst-case wastage.
+	// The best-case saving is 8x regardless of block size.
+	//
+	// Objects obtained from the tiny allocator must not be freed explicitly,
+	// so when an object will be freed explicitly, we ensure that
+	// its size >= maxTinySize.
+	//
+	// SetFinalizer has a special case for objects potentially coming
+	// from the tiny allocator; in that case it allows setting finalizers
+	// for an inner byte of a memory block.
+	//
+	// The main targets of the tiny allocator are small strings and
+	// standalone escaping variables. On a JSON benchmark
+	// the allocator reduces the number of allocations by ~12% and
+	// the heap size by ~20%.
+ c := getMCache(mp)
+ off := c.tinyoffset
+ // Align tiny pointer for required (conservative) alignment.
+ if constsize&7 == 0 {
+ off = alignUp(off, 8)
+ } else if goarch.PtrSize == 4 && constsize == 12 {
+		// Conservatively align 12-byte objects to 8 bytes on 32-bit
+		// systems so that objects whose first field is a 64-bit
+		// value are aligned to 8 bytes and do not cause a fault on
+		// atomic access. See issue 37262.
+ // TODO(mknyszek): Remove this workaround if/when issue 36606
+ // is resolved.
+ off = alignUp(off, 8)
+ } else if constsize&3 == 0 {
+ off = alignUp(off, 4)
+ } else if constsize&1 == 0 {
+ off = alignUp(off, 2)
+ }
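+	// E.g. with off == 5 and constsize == 4, off is rounded up to 8 and the
+	// object occupies bytes 8-11 of the 16-byte tiny block.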
+ if off+constsize <= maxTinySize && c.tiny != 0 {
+ // The object fits into existing tiny block.
+ x := unsafe.Pointer(c.tiny + off)
+ c.tinyoffset = off + constsize
+ c.tinyAllocs++
+ mp.mallocing = 0
+ releasem(mp)
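+		// Report an elemsize of 0: no new memory was allocated, so the
+		// caller's GC assist adjustment (which checks for a nonzero
+		// elemsize) is skipped.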
+ return x, 0
+ }
+ // Allocate a new maxTinySize block.
+ checkGCTrigger := false
+ span := c.alloc[tinySpanClass]
+ v := nextFreeFastTiny(span)
+ if v == 0 {
+ v, span, checkGCTrigger = c.nextFree(tinySpanClass)
+ }
+ x := unsafe.Pointer(v)
+ (*[2]uint64)(x)[0] = 0 // Always zero
+ (*[2]uint64)(x)[1] = 0
+	// See if we need to replace the existing tiny block with the new one
+	// based on the amount of remaining free space.
+ if !raceenabled && (constsize < c.tinyoffset || c.tiny == 0) {
+ // Note: disabled when race detector is on, see comment near end of this function.
+ c.tiny = uintptr(x)
+ c.tinyoffset = constsize
+ }
+
+ // Ensure that the stores above that initialize x to
+ // type-safe memory and set the heap bits occur before
+ // the caller can make x observable to the garbage
+ // collector. Otherwise, on weakly ordered machines,
+ // the garbage collector could follow a pointer to x,
+ // but see uninitialized memory or stale heap bits.
+ publicationBarrier()
+
+ if writeBarrier.enabled {
+ // Allocate black during GC.
+ // All slots hold nil so no scanning is needed.
+ // This may be racing with GC so do it atomically if there can be
+ // a race marking the bit.
+ gcmarknewobject(span, uintptr(x))
+ } else {
+ // Track the last free index before the mark phase. This field
+ // is only used by the garbage collector. During the mark phase
+ // this is used by the conservative scanner to filter out objects
+ // that are both free and recently-allocated. It's safe to do that
+ // because we allocate-black if the GC is enabled. The conservative
+ // scanner produces pointers out of thin air, so without additional
+ // synchronization it might otherwise observe a partially-initialized
+ // object, which could crash the program.
+ span.freeIndexForScan = span.freeindex
+ }
+
+ // Note cache c only valid while m acquired; see #47302
+ //
+ // N.B. Use the full size because that matches how the GC
+ // will update the mem profile on the "free" side.
+ //
+ // TODO(mknyszek): We should really count the header as part
+ // of gc_sys or something. The code below just pretends it is
+ // internal fragmentation and matches the GC's accounting by
+ // using the whole allocation slot.
+ c.nextSample -= int64(elemsize)
+ if c.nextSample < 0 || MemProfileRate != c.memProfRate {
+ profilealloc(mp, x, elemsize)
+ }
+ mp.mallocing = 0
+ releasem(mp)
+
+ if checkGCTrigger {
+ if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
+ gcStart(t)
+ }
+ }
+
+ if raceenabled {
+ // Pad tinysize allocations so they are aligned with the end
+ // of the tinyalloc region. This ensures that any arithmetic
+ // that goes off the top end of the object will be detectable
+ // by checkptr (issue 38872).
+ // Note that we disable tinyalloc when raceenabled for this to work.
+ // TODO: This padding is only performed when the race detector
+ // is enabled. It would be nice to enable it if any package
+ // was compiled with checkptr, but there's no easy way to
+ // detect that (especially at compile time).
+ // TODO: enable this padding for all allocations, not just
+ // tinyalloc ones. It's tricky because of pointer maps.
+ // Maybe just all noscan objects?
+ x = add(x, elemsize-constsize)
+ }
+ return x, elemsize
+}
+
+// TODO(matloob): Should we let the Go compiler inline this instead of using mkmalloc?
+// We won't be able to use elemsize_, but that's probably ok.
+func nextFreeFastTiny(span *mspan) gclinkptr {
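+	// nelems is a compile-time constant here because elemsize_ is substituted
+	// by _mkmalloc; that is what distinguishes this variant from
+	// nextFreeFastStub, which reads span.nelems at run time.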
+ const nbytes = 8192
+ const nelems = uint16((nbytes - unsafe.Sizeof(spanInlineMarkBits{})) / elemsize_)
+ var nextFreeFastResult gclinkptr
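+	// allocCache holds the complement of the span's allocation bits starting
+	// at freeindex, so a 1 bit means "free" and TrailingZeros64 finds the
+	// first free object directly.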
+ if span.allocCache != 0 {
+ theBit := sys.TrailingZeros64(span.allocCache) // Is there a free object in the allocCache?
+ result := span.freeindex + uint16(theBit)
+ if result < nelems {
+ freeidx := result + 1
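+			// If the next free index lands on a 64-object boundary (other than
+			// the end of the span), the allocCache window is exhausted and must
+			// be refilled, so take the slow path instead.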
+ if !(freeidx%64 == 0 && freeidx != nelems) {
+ span.allocCache >>= uint(theBit + 1)
+ span.freeindex = freeidx
+ span.allocCount++
+ nextFreeFastResult = gclinkptr(uintptr(result)*elemsize_ + span.base())
+ }
+ }
+ }
+ return nextFreeFastResult
+}
+
+func nextFreeFastStub(span *mspan) gclinkptr {
+ var nextFreeFastResult gclinkptr
+ if span.allocCache != 0 {
+ theBit := sys.TrailingZeros64(span.allocCache) // Is there a free object in the allocCache?
+ result := span.freeindex + uint16(theBit)
+ if result < span.nelems {
+ freeidx := result + 1
+ if !(freeidx%64 == 0 && freeidx != span.nelems) {
+ span.allocCache >>= uint(theBit + 1)
+ span.freeindex = freeidx
+ span.allocCount++
+ nextFreeFastResult = gclinkptr(uintptr(result)*elemsize_ + span.base())
+ }
+ }
+ }
+ return nextFreeFastResult
+}
+
+func heapSetTypeNoHeaderStub(x, dataSize uintptr, typ *_type, span *mspan) uintptr {
+ if doubleCheckHeapSetType && (!heapBitsInSpan(dataSize) || !heapBitsInSpan(elemsize_)) {
+ throw("tried to write heap bits, but no heap bits in span")
+ }
+ scanSize := writeHeapBitsSmallStub(span, x, dataSize, typ)
+ if doubleCheckHeapSetType {
+ doubleCheckHeapType(x, dataSize, typ, nil, span)
+ }
+ return scanSize
+}
+
+// writeHeapBitsSmallStub writes the heap bits for small objects whose ptr/scalar data is
+// stored as a bitmap at the end of the span.
+//
+// Assumes dataSize is <= ptrBits*goarch.PtrSize. x must be a pointer into the span.
+// heapBitsInSpan(dataSize) must be true. dataSize must be >= typ.Size_.
+//
+//go:nosplit
+func writeHeapBitsSmallStub(span *mspan, x, dataSize uintptr, typ *_type) uintptr {
+ // The objects here are always really small, so a single load is sufficient.
+ src0 := readUintptr(getGCMask(typ))
+
+ const elemsize = elemsize_
+
+ // Create repetitions of the bitmap if we have a small slice backing store.
+ scanSize := typ.PtrBytes
+ src := src0
+ if typ.Size_ == goarch.PtrSize {
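+		// The type is a single pointer word, so every word of the allocation
+		// holds a pointer; e.g. dataSize == 3*goarch.PtrSize yields the mask 0b111.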
+ src = (1 << (dataSize / goarch.PtrSize)) - 1
+ } else {
+ // N.B. We rely on dataSize being an exact multiple of the type size.
+ // The alternative is to be defensive and mask out src to the length
+ // of dataSize. The purpose is to save on one additional masking operation.
+ if doubleCheckHeapSetType && !asanenabled && dataSize%typ.Size_ != 0 {
+ throw("runtime: (*mspan).writeHeapBitsSmall: dataSize is not a multiple of typ.Size_")
+ }
+ for i := typ.Size_; i < dataSize; i += typ.Size_ {
+ src |= src0 << (i / goarch.PtrSize)
+ scanSize += typ.Size_
+ }
+ }
+
+	// Since we're never writing more than one uintptr's worth of bits, we'll
+	// do either one or two writes.
+ dstBase, _ := spanHeapBitsRange(span.base(), pageSize, elemsize)
+ dst := unsafe.Pointer(dstBase)
+ o := (x - span.base()) / goarch.PtrSize
+ i := o / ptrBits
+ j := o % ptrBits
+ const bits uintptr = elemsize / goarch.PtrSize
+	// In the if statement below, we have to do two uintptr writes if the bits
+	// we need to write straddle two different memory words. But if
+	// the number of bits we're writing divides evenly into the number of bits
+	// in the uintptr we're writing, this can never happen. Since bitsIsPowerOfTwo
+	// is a compile-time constant in the generated code, when the size is
+	// a power of two less than or equal to ptrBits, the compiler can remove the
+	// 'two writes' branch of the if statement and always do a single write
+	// without the check.
+ const bitsIsPowerOfTwo = bits&(bits-1) == 0
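+	// E.g. (illustrative) a 48-byte elemsize on a 64-bit platform gives bits == 6,
+	// which is not a power of two; with j == 60, j+bits > ptrBits and the bits
+	// straddle two words, requiring both writes below.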
+ if bits > ptrBits || (!bitsIsPowerOfTwo && j+bits > ptrBits) {
+ // Two writes.
+ bits0 := ptrBits - j
+ bits1 := bits - bits0
+ dst0 := (*uintptr)(add(dst, (i+0)*goarch.PtrSize))
+ dst1 := (*uintptr)(add(dst, (i+1)*goarch.PtrSize))
+ *dst0 = (*dst0)&(^uintptr(0)>>bits0) | (src << j)
+ *dst1 = (*dst1)&^((1<<bits1)-1) | (src >> bits0)
+ } else {
+ // One write.
+ dst := (*uintptr)(add(dst, i*goarch.PtrSize))
+		*dst = (*dst)&^(((1<<(min(bits, ptrBits)))-1)<<j) | (src << j) // Take the min so this compiles on 32-bit platforms; if bits > ptrBits we always take the other branch anyway.
+ }
+
+ const doubleCheck = false
+ if doubleCheck {
+ writeHeapBitsDoubleCheck(span, x, dataSize, src, src0, i, j, bits, typ)
+ }
+ return scanSize
+}
+
+func writeHeapBitsDoubleCheck(span *mspan, x, dataSize, src, src0, i, j, bits uintptr, typ *_type) {
+ srcRead := span.heapBitsSmallForAddr(x)
+ if srcRead != src {
+ print("runtime: x=", hex(x), " i=", i, " j=", j, " bits=", bits, "\n")
+ print("runtime: dataSize=", dataSize, " typ.Size_=", typ.Size_, " typ.PtrBytes=", typ.PtrBytes, "\n")
+ print("runtime: src0=", hex(src0), " src=", hex(src), " srcRead=", hex(srcRead), "\n")
+ throw("bad pointer bits written for small object")
+ }
+}