diff options
Diffstat (limited to 'src/runtime/malloc_stubs.go')
| -rw-r--r-- | src/runtime/malloc_stubs.go | 586 |
1 file changed, 586 insertions, 0 deletions
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains stub functions that are not meant to be called directly,
// but that will be assembled together using the inlining logic in runtime/_mkmalloc
// to produce a full mallocgc function that's specialized for a span class
// or specific size in the case of the tiny allocator.
//
// To assemble a mallocgc function, the mallocStub function is cloned, and the call to
// inlinedMalloc is replaced with the inlined body of smallScanNoHeaderStub,
// smallNoScanStub or tinyStub, depending on the parameters being specialized.
//
// The size_ (for the tiny case) and elemsize_, sizeclass_, and noscanint_ (for all three cases)
// identifiers are replaced with the value of the parameter in the specialized case.
// The nextFreeFastStub, nextFreeFastTiny, heapSetTypeNoHeaderStub, and writeHeapBitsSmallStub
// functions are also inlined by _mkmalloc.

package runtime

import (
	"internal/goarch"
	"internal/runtime/sys"
	"unsafe"
)

// These identifiers will all be replaced by the inliner. So their values don't
// really matter: they just need to be set so that the stub functions, which
// will never be used on their own, can compile. elemsize_ can't be set to
// zero because we divide by it in nextFreeFastTiny, and the compiler would
// complain about a division by zero. Its replaced value will always be greater
// than zero.
const elemsize_ = 8
const sizeclass_ = 0
const noscanint_ = 0
const size_ = 0

// malloc0 is the specialization used for zero-sized allocation requests.
// All such allocations return the address of the shared zerobase symbol.
func malloc0(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
	if doubleCheckMalloc {
		if gcphase == _GCmarktermination {
			throw("mallocgc called with gcphase == _GCmarktermination")
		}
	}

	// Short-circuit zero-sized allocation requests.
	return unsafe.Pointer(&zerobase)
}

// mallocPanic is the placeholder specialization for size classes that have
// no generated malloc function; calling it always panics.
func mallocPanic(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
	panic("not defined for sizeclass")
}

// mallocStub is the outer template for every generated specialized mallocgc:
// it is cloned by _mkmalloc and the inlinedMalloc call below is replaced with
// the inlined body of one of the allocation stubs. See the comment at the top
// of this file.
func mallocStub(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
	if doubleCheckMalloc {
		if gcphase == _GCmarktermination {
			throw("mallocgc called with gcphase == _GCmarktermination")
		}
	}

	// It's possible for any malloc to trigger sweeping, which may in
	// turn queue finalizers. Record this dynamic lock edge.
	// N.B. Compiled away if lockrank experiment is not enabled.
	lockRankMayQueueFinalizer()

	// Pre-malloc debug hooks.
	if debug.malloc {
		if x := preMallocgcDebug(size, typ); x != nil {
			return x
		}
	}

	// Assist the GC if needed.
	if gcBlackenEnabled != 0 {
		deductAssistCredit(size)
	}

	// Actually do the allocation.
	x, elemsize := inlinedMalloc(size, typ, needzero)

	// Adjust our GC assist debt to account for internal fragmentation.
	// elemsize == 0 signals a case (e.g. a tiny alloc that fit in an
	// existing tiny block) where no adjustment should be made.
	if gcBlackenEnabled != 0 && elemsize != 0 {
		if assistG := getg().m.curg; assistG != nil {
			assistG.gcAssistBytes -= int64(elemsize - size)
		}
	}

	// Post-malloc debug hooks.
	if debug.malloc {
		postMallocgcDebug(x, elemsize, typ)
	}
	return x
}

// inlinedMalloc will never be called. It is defined just so that the compiler can compile
// the mallocStub function, which will also never be called, but instead used as a template
// to generate a size-specialized malloc function. The call to inlinedMalloc in mallocStub
// will be replaced with the inlined body of smallScanNoHeaderStub, smallNoScanStub, or tinyStub
// when generating the size-specialized malloc function. See the comment at the top of this
// file for more information.
func inlinedMalloc(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
	return unsafe.Pointer(uintptr(0)), 0
}

// doubleCheckSmallScanNoHeader validates the invariants of the small,
// scannable, no-malloc-header allocation path. Only called when
// doubleCheckMalloc is set.
func doubleCheckSmallScanNoHeader(size uintptr, typ *_type, mp *m) {
	if mp.mallocing != 0 {
		throw("malloc deadlock")
	}
	if mp.gsignal == getg() {
		throw("malloc during signal")
	}
	if typ == nil || !typ.Pointers() {
		throw("noscan allocated in scan-only path")
	}
	if !heapBitsInSpan(size) {
		throw("heap bits in not in span for non-header-only path")
	}
}

// smallScanNoHeaderStub is the allocation template for small, scannable
// (pointer-containing) objects whose pointer/scalar bitmap is stored at the
// end of the span rather than in a malloc header. Returns the allocated
// pointer and the slot's element size.
func smallScanNoHeaderStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
	const sizeclass = sizeclass_
	const elemsize = elemsize_

	// Set mp.mallocing to keep from being preempted by GC.
	mp := acquirem()
	if doubleCheckMalloc {
		doubleCheckSmallScanNoHeader(size, typ, mp)
	}
	mp.mallocing = 1

	checkGCTrigger := false
	c := getMCache(mp)
	const spc = spanClass(sizeclass<<1) | spanClass(noscanint_)
	span := c.alloc[spc]
	v := nextFreeFastStub(span)
	if v == 0 {
		v, span, checkGCTrigger = c.nextFree(spc)
	}
	x := unsafe.Pointer(v)
	if span.needzero != 0 {
		memclrNoHeapPointers(x, elemsize)
	}
	if goarch.PtrSize == 8 && sizeclass == 1 {
		// initHeapBits already set the pointer bits for the 8-byte sizeclass
		// on 64-bit platforms.
		c.scanAlloc += 8
	} else {
		dataSize := size // make the inliner happy
		x := uintptr(x)
		scanSize := heapSetTypeNoHeaderStub(x, dataSize, typ, span)
		c.scanAlloc += scanSize
	}

	// Ensure that the stores above that initialize x to
	// type-safe memory and set the heap bits occur before
	// the caller can make x observable to the garbage
	// collector. Otherwise, on weakly ordered machines,
	// the garbage collector could follow a pointer to x,
	// but see uninitialized memory or stale heap bits.
	publicationBarrier()

	if writeBarrier.enabled {
		// Allocate black during GC.
		// All slots hold nil so no scanning is needed.
		// This may be racing with GC so do it atomically if there can be
		// a race marking the bit.
		gcmarknewobject(span, uintptr(x))
	} else {
		// Track the last free index before the mark phase. This field
		// is only used by the garbage collector. During the mark phase
		// this is used by the conservative scanner to filter out objects
		// that are both free and recently-allocated. It's safe to do that
		// because we allocate-black if the GC is enabled. The conservative
		// scanner produces pointers out of thin air, so without additional
		// synchronization it might otherwise observe a partially-initialized
		// object, which could crash the program.
		span.freeIndexForScan = span.freeindex
	}

	// Note cache c only valid while m acquired; see #47302
	//
	// N.B. Use the full size because that matches how the GC
	// will update the mem profile on the "free" side.
	//
	// TODO(mknyszek): We should really count the header as part
	// of gc_sys or something. The code below just pretends it is
	// internal fragmentation and matches the GC's accounting by
	// using the whole allocation slot.
	c.nextSample -= int64(elemsize)
	if c.nextSample < 0 || MemProfileRate != c.memProfRate {
		profilealloc(mp, x, elemsize)
	}
	mp.mallocing = 0
	releasem(mp)

	if checkGCTrigger {
		if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
			gcStart(t)
		}
	}

	return x, elemsize
}

// doubleCheckSmallNoScan validates the invariants of the small noscan
// allocation path. Only called when doubleCheckMalloc is set.
func doubleCheckSmallNoScan(typ *_type, mp *m) {
	if mp.mallocing != 0 {
		throw("malloc deadlock")
	}
	if mp.gsignal == getg() {
		throw("malloc during signal")
	}
	if typ != nil && typ.Pointers() {
		throw("expected noscan type for noscan alloc")
	}
}

// smallNoScanStub is the allocation template for small objects that contain
// no pointers (noscan). Returns the allocated pointer and the slot's element
// size.
func smallNoScanStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
	// TODO(matloob): Add functionality to mkmalloc to allow us to inline a non-constant
	// sizeclass_ and elemsize_ value (instead just set to the expressions to look up the size class
	// and elemsize. We'd also need to teach mkmalloc that values that are touched by these (specifically
	// spc below) should turn into vars. This would allow us to generate mallocgcSmallNoScan itself,
	// so that its code could not diverge from the generated functions.
	const sizeclass = sizeclass_
	const elemsize = elemsize_

	// Set mp.mallocing to keep from being preempted by GC.
	mp := acquirem()
	if doubleCheckMalloc {
		doubleCheckSmallNoScan(typ, mp)
	}
	mp.mallocing = 1

	checkGCTrigger := false
	c := getMCache(mp)
	const spc = spanClass(sizeclass<<1) | spanClass(noscanint_)
	span := c.alloc[spc]
	v := nextFreeFastStub(span)
	if v == 0 {
		v, span, checkGCTrigger = c.nextFree(spc)
	}
	x := unsafe.Pointer(v)
	if needzero && span.needzero != 0 {
		memclrNoHeapPointers(x, elemsize)
	}

	// Ensure that the stores above that initialize x to
	// type-safe memory and set the heap bits occur before
	// the caller can make x observable to the garbage
	// collector. Otherwise, on weakly ordered machines,
	// the garbage collector could follow a pointer to x,
	// but see uninitialized memory or stale heap bits.
	publicationBarrier()

	if writeBarrier.enabled {
		// Allocate black during GC.
		// All slots hold nil so no scanning is needed.
		// This may be racing with GC so do it atomically if there can be
		// a race marking the bit.
		gcmarknewobject(span, uintptr(x))
	} else {
		// Track the last free index before the mark phase. This field
		// is only used by the garbage collector. During the mark phase
		// this is used by the conservative scanner to filter out objects
		// that are both free and recently-allocated. It's safe to do that
		// because we allocate-black if the GC is enabled. The conservative
		// scanner produces pointers out of thin air, so without additional
		// synchronization it might otherwise observe a partially-initialized
		// object, which could crash the program.
		span.freeIndexForScan = span.freeindex
	}

	// Note cache c only valid while m acquired; see #47302
	//
	// N.B. Use the full size because that matches how the GC
	// will update the mem profile on the "free" side.
	//
	// TODO(mknyszek): We should really count the header as part
	// of gc_sys or something. The code below just pretends it is
	// internal fragmentation and matches the GC's accounting by
	// using the whole allocation slot.
	c.nextSample -= int64(elemsize)
	if c.nextSample < 0 || MemProfileRate != c.memProfRate {
		profilealloc(mp, x, elemsize)
	}
	mp.mallocing = 0
	releasem(mp)

	if checkGCTrigger {
		if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
			gcStart(t)
		}
	}
	return x, elemsize
}

// doubleCheckTiny validates the invariants of the tiny allocation path.
// Only called when doubleCheckMalloc is set.
func doubleCheckTiny(size uintptr, typ *_type, mp *m) {
	if mp.mallocing != 0 {
		throw("malloc deadlock")
	}
	if mp.gsignal == getg() {
		throw("malloc during signal")
	}
	if typ != nil && typ.Pointers() {
		throw("expected noscan for tiny alloc")
	}
}

// tinyStub is the allocation template for the tiny allocator: noscan
// allocations of a fixed size (size_) small enough to be packed together
// into a shared maxTinySize block. Returns the allocated pointer and the
// element size (0 when the request was satisfied from an existing tiny
// block, so mallocStub skips the assist-debt adjustment).
func tinyStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
	const constsize = size_
	const elemsize = elemsize_

	// Set mp.mallocing to keep from being preempted by GC.
	mp := acquirem()
	if doubleCheckMalloc {
		doubleCheckTiny(constsize, typ, mp)
	}
	mp.mallocing = 1

	// Tiny allocator.
	//
	// Tiny allocator combines several tiny allocation requests
	// into a single memory block. The resulting memory block
	// is freed when all subobjects are unreachable. The subobjects
	// must be noscan (don't have pointers), this ensures that
	// the amount of potentially wasted memory is bounded.
	//
	// Size of the memory block used for combining (maxTinySize) is tunable.
	// Current setting is 16 bytes, which relates to 2x worst case memory
	// wastage (when all but one subobjects are unreachable).
	// 8 bytes would result in no wastage at all, but provides less
	// opportunities for combining.
	// 32 bytes provides more opportunities for combining,
	// but can lead to 4x worst case wastage.
	// The best case winning is 8x regardless of block size.
	//
	// Objects obtained from tiny allocator must not be freed explicitly.
	// So when an object will be freed explicitly, we ensure that
	// its size >= maxTinySize.
	//
	// SetFinalizer has a special case for objects potentially coming
	// from tiny allocator, in such a case it allows setting finalizers
	// for an inner byte of a memory block.
	//
	// The main targets of tiny allocator are small strings and
	// standalone escaping variables. On a json benchmark
	// the allocator reduces number of allocations by ~12% and
	// reduces heap size by ~20%.
	c := getMCache(mp)
	off := c.tinyoffset
	// Align tiny pointer for required (conservative) alignment.
	if constsize&7 == 0 {
		off = alignUp(off, 8)
	} else if goarch.PtrSize == 4 && constsize == 12 {
		// Conservatively align 12-byte objects to 8 bytes on 32-bit
		// systems so that objects whose first field is a 64-bit
		// value is aligned to 8 bytes and does not cause a fault on
		// atomic access. See issue 37262.
		// TODO(mknyszek): Remove this workaround if/when issue 36606
		// is resolved.
		off = alignUp(off, 8)
	} else if constsize&3 == 0 {
		off = alignUp(off, 4)
	} else if constsize&1 == 0 {
		off = alignUp(off, 2)
	}
	if off+constsize <= maxTinySize && c.tiny != 0 {
		// The object fits into existing tiny block.
		x := unsafe.Pointer(c.tiny + off)
		c.tinyoffset = off + constsize
		c.tinyAllocs++
		mp.mallocing = 0
		releasem(mp)
		return x, 0
	}
	// Allocate a new maxTinySize block.
	checkGCTrigger := false
	span := c.alloc[tinySpanClass]
	v := nextFreeFastTiny(span)
	if v == 0 {
		v, span, checkGCTrigger = c.nextFree(tinySpanClass)
	}
	x := unsafe.Pointer(v)
	(*[2]uint64)(x)[0] = 0 // Always zero
	(*[2]uint64)(x)[1] = 0
	// See if we need to replace the existing tiny block with the new one
	// based on amount of remaining free space.
	if !raceenabled && (constsize < c.tinyoffset || c.tiny == 0) {
		// Note: disabled when race detector is on, see comment near end of this function.
		c.tiny = uintptr(x)
		c.tinyoffset = constsize
	}

	// Ensure that the stores above that initialize x to
	// type-safe memory and set the heap bits occur before
	// the caller can make x observable to the garbage
	// collector. Otherwise, on weakly ordered machines,
	// the garbage collector could follow a pointer to x,
	// but see uninitialized memory or stale heap bits.
	publicationBarrier()

	if writeBarrier.enabled {
		// Allocate black during GC.
		// All slots hold nil so no scanning is needed.
		// This may be racing with GC so do it atomically if there can be
		// a race marking the bit.
		gcmarknewobject(span, uintptr(x))
	} else {
		// Track the last free index before the mark phase. This field
		// is only used by the garbage collector. During the mark phase
		// this is used by the conservative scanner to filter out objects
		// that are both free and recently-allocated. It's safe to do that
		// because we allocate-black if the GC is enabled. The conservative
		// scanner produces pointers out of thin air, so without additional
		// synchronization it might otherwise observe a partially-initialized
		// object, which could crash the program.
		span.freeIndexForScan = span.freeindex
	}

	// Note cache c only valid while m acquired; see #47302
	//
	// N.B. Use the full size because that matches how the GC
	// will update the mem profile on the "free" side.
	//
	// TODO(mknyszek): We should really count the header as part
	// of gc_sys or something. The code below just pretends it is
	// internal fragmentation and matches the GC's accounting by
	// using the whole allocation slot.
	c.nextSample -= int64(elemsize)
	if c.nextSample < 0 || MemProfileRate != c.memProfRate {
		profilealloc(mp, x, elemsize)
	}
	mp.mallocing = 0
	releasem(mp)

	if checkGCTrigger {
		if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
			gcStart(t)
		}
	}

	if raceenabled {
		// Pad tinysize allocations so they are aligned with the end
		// of the tinyalloc region. This ensures that any arithmetic
		// that goes off the top end of the object will be detectable
		// by checkptr (issue 38872).
		// Note that we disable tinyalloc when raceenabled for this to work.
		// TODO: This padding is only performed when the race detector
		// is enabled. It would be nice to enable it if any package
		// was compiled with checkptr, but there's no easy way to
		// detect that (especially at compile time).
		// TODO: enable this padding for all allocations, not just
		// tinyalloc ones. It's tricky because of pointer maps.
		// Maybe just all noscan objects?
		x = add(x, elemsize-constsize)
	}
	return x, elemsize
}

// nextFreeFastTiny is the fast-path free-slot lookup for the tiny span
// class, with the span's element count computed from the constant
// elemsize_ at generation time. Returns 0 when the allocCache has no
// usable free object (the caller then takes the slow path, c.nextFree).
//
// TODO(matloob): Should we let the go compiler inline this instead of using mkmalloc?
// We won't be able to use elemsize_ but that's probably ok.
func nextFreeFastTiny(span *mspan) gclinkptr {
	const nbytes = 8192
	const nelems = uint16((nbytes - unsafe.Sizeof(spanInlineMarkBits{})) / elemsize_)
	var nextFreeFastResult gclinkptr
	if span.allocCache != 0 {
		theBit := sys.TrailingZeros64(span.allocCache) // Is there a free object in the allocCache?
		result := span.freeindex + uint16(theBit)
		if result < nelems {
			freeidx := result + 1
			if !(freeidx%64 == 0 && freeidx != nelems) {
				span.allocCache >>= uint(theBit + 1)
				span.freeindex = freeidx
				span.allocCount++
				nextFreeFastResult = gclinkptr(uintptr(result)*elemsize_ + span.base())
			}
		}
	}
	return nextFreeFastResult
}

// nextFreeFastStub is the fast-path free-slot lookup for the small
// allocation stubs, using the span's own nelems field. Returns 0 when the
// allocCache has no usable free object (the caller then takes the slow
// path, c.nextFree).
func nextFreeFastStub(span *mspan) gclinkptr {
	var nextFreeFastResult gclinkptr
	if span.allocCache != 0 {
		theBit := sys.TrailingZeros64(span.allocCache) // Is there a free object in the allocCache?
		result := span.freeindex + uint16(theBit)
		if result < span.nelems {
			freeidx := result + 1
			if !(freeidx%64 == 0 && freeidx != span.nelems) {
				span.allocCache >>= uint(theBit + 1)
				span.freeindex = freeidx
				span.allocCount++
				nextFreeFastResult = gclinkptr(uintptr(result)*elemsize_ + span.base())
			}
		}
	}
	return nextFreeFastResult
}

// heapSetTypeNoHeaderStub records the heap pointer bits for an object at x
// of size dataSize whose bitmap lives at the end of the span (no malloc
// header), returning the number of bytes to be scanned by the GC.
func heapSetTypeNoHeaderStub(x, dataSize uintptr, typ *_type, span *mspan) uintptr {
	if doubleCheckHeapSetType && (!heapBitsInSpan(dataSize) || !heapBitsInSpan(elemsize_)) {
		throw("tried to write heap bits, but no heap bits in span")
	}
	scanSize := writeHeapBitsSmallStub(span, x, dataSize, typ)
	if doubleCheckHeapSetType {
		doubleCheckHeapType(x, dataSize, typ, nil, span)
	}
	return scanSize
}

// writeHeapBitsSmallStub writes the heap bits for small objects whose ptr/scalar data is
// stored as a bitmap at the end of the span.
//
// Assumes dataSize is <= ptrBits*goarch.PtrSize. x must be a pointer into the span.
// heapBitsInSpan(dataSize) must be true. dataSize must be >= typ.Size_.
//
//go:nosplit
func writeHeapBitsSmallStub(span *mspan, x, dataSize uintptr, typ *_type) uintptr {
	// The objects here are always really small, so a single load is sufficient.
	src0 := readUintptr(getGCMask(typ))

	const elemsize = elemsize_

	// Create repetitions of the bitmap if we have a small slice backing store.
	scanSize := typ.PtrBytes
	src := src0
	if typ.Size_ == goarch.PtrSize {
		src = (1 << (dataSize / goarch.PtrSize)) - 1
	} else {
		// N.B. We rely on dataSize being an exact multiple of the type size.
		// The alternative is to be defensive and mask out src to the length
		// of dataSize. The purpose is to save on one additional masking operation.
		if doubleCheckHeapSetType && !asanenabled && dataSize%typ.Size_ != 0 {
			throw("runtime: (*mspan).writeHeapBitsSmall: dataSize is not a multiple of typ.Size_")
		}
		for i := typ.Size_; i < dataSize; i += typ.Size_ {
			src |= src0 << (i / goarch.PtrSize)
			scanSize += typ.Size_
		}
	}

	// Since we're never writing more than one uintptr's worth of bits, we're either going
	// to do one or two writes.
	dstBase, _ := spanHeapBitsRange(span.base(), pageSize, elemsize)
	dst := unsafe.Pointer(dstBase)
	o := (x - span.base()) / goarch.PtrSize
	i := o / ptrBits
	j := o % ptrBits
	const bits uintptr = elemsize / goarch.PtrSize
	// In the if statement below, we have to do two uintptr writes if the bits
	// we need to write straddle across two different memory locations. But if
	// the number of bits we're writing divides evenly into the number of bits
	// in the uintptr we're writing, this can never happen. Since bitsIsPowerOfTwo
	// is a compile-time constant in the generated code, in the case where the size is
	// a power of two less than or equal to ptrBits, the compiler can remove the
	// 'two writes' branch of the if statement and always do only one write without
	// the check.
	const bitsIsPowerOfTwo = bits&(bits-1) == 0
	if bits > ptrBits || (!bitsIsPowerOfTwo && j+bits > ptrBits) {
		// Two writes.
		bits0 := ptrBits - j
		bits1 := bits - bits0
		dst0 := (*uintptr)(add(dst, (i+0)*goarch.PtrSize))
		dst1 := (*uintptr)(add(dst, (i+1)*goarch.PtrSize))
		*dst0 = (*dst0)&(^uintptr(0)>>bits0) | (src << j)
		*dst1 = (*dst1)&^((1<<bits1)-1) | (src >> bits0)
	} else {
		// One write.
		dst := (*uintptr)(add(dst, i*goarch.PtrSize))
		*dst = (*dst)&^(((1<<(min(bits, ptrBits)))-1)<<j) | (src << j) // We're taking the min so this compiles on 32 bit platforms. But if bits > ptrbits we always take the other branch
	}

	const doubleCheck = false
	if doubleCheck {
		writeHeapBitsDoubleCheck(span, x, dataSize, src, src0, i, j, bits, typ)
	}
	return scanSize
}

// writeHeapBitsDoubleCheck re-reads the heap bits just written for the
// object at x and throws if they do not match the bits that were computed.
func writeHeapBitsDoubleCheck(span *mspan, x, dataSize, src, src0, i, j, bits uintptr, typ *_type) {
	srcRead := span.heapBitsSmallForAddr(x)
	if srcRead != src {
		print("runtime: x=", hex(x), " i=", i, " j=", j, " bits=", bits, "\n")
		print("runtime: dataSize=", dataSize, " typ.Size_=", typ.Size_, " typ.PtrBytes=", typ.PtrBytes, "\n")
		print("runtime: src0=", hex(src0), " src=", hex(src), " srcRead=", hex(srcRead), "\n")
		throw("bad pointer bits written for small object")
	}
}
