diff options
| author | Michael Anthony Knyszek <mknyszek@google.com> | 2023-11-14 22:05:53 +0000 |
|---|---|---|
| committer | Michael Knyszek <mknyszek@google.com> | 2023-11-16 05:53:55 +0000 |
| commit | d6ef98b8fa4851f025779ef4ade084d63290de2a (patch) | |
| tree | 75ff4789e138894df856523f34ef25bb2001edb4 /src/runtime/mbitmap_allocheaders.go | |
| parent | 17eb0a2bac79eda8dc71d628474989d05d9755c5 (diff) | |
| download | go-d6ef98b8fa4851f025779ef4ade084d63290de2a.tar.xz | |
runtime: optimize bulkBarrierPreWrite with allocheaders
Currently bulkBarrierPreWrite follows a fairly slow path wherein it
calls typePointersOf, which ends up calling into fastForward. This does
some fairly heavy computation to move the iterator forward without any
assumptions about where it lands at all. It needs to be completely
general to support splitting at arbitrary boundaries, for example for
scanning oblets.
This means that copying objects during the GC mark phase is fairly
expensive, and is a regression from before allocheaders.
However, in almost all cases bulkBarrierPreWrite and
bulkBarrierPreWriteSrcOnly have perfect type information. We can do a
lot better in these cases because we're starting on a type-size
boundary, which is exactly what the iterator is built around.
This change adds the typePointersOfType method which produces a
typePointers iterator from a pointer and a type. This change
significantly improves the performance of these bulk write barriers,
eliminating some performance regressions that were noticed on the perf
dashboard.
There are still just a couple cases where we have to use the more
general typePointersOf calls, but they're fairly rare; most bulk
barriers have perfect type information.
This change is tested by the GCInfo tests in the runtime and the GCBits
tests in the reflect package via an additional check in getgcmask.
Results for tile38 before and after allocheaders. There was previous a
regression in the p90, now it's gone. Also, the overall win has been
boosted slightly.
tile38 $ benchstat noallocheaders.results allocheaders.results
name old time/op new time/op delta
Tile38QueryLoad 481µs ± 1% 468µs ± 1% -2.71% (p=0.000 n=10+10)
name old average-RSS-bytes new average-RSS-bytes delta
Tile38QueryLoad 6.32GB ± 1% 6.23GB ± 0% -1.38% (p=0.000 n=9+8)
name old peak-RSS-bytes new peak-RSS-bytes delta
Tile38QueryLoad 6.49GB ± 1% 6.40GB ± 1% -1.38% (p=0.002 n=10+10)
name old peak-VM-bytes new peak-VM-bytes delta
Tile38QueryLoad 7.72GB ± 1% 7.64GB ± 1% -1.07% (p=0.007 n=10+10)
name old p50-latency-ns new p50-latency-ns delta
Tile38QueryLoad 212k ± 1% 205k ± 0% -3.02% (p=0.000 n=10+9)
name old p90-latency-ns new p90-latency-ns delta
Tile38QueryLoad 622k ± 1% 616k ± 1% -1.03% (p=0.005 n=10+10)
name old p99-latency-ns new p99-latency-ns delta
Tile38QueryLoad 4.55M ± 2% 4.39M ± 2% -3.51% (p=0.000 n=10+10)
name old ops/s new ops/s delta
Tile38QueryLoad 12.5k ± 1% 12.8k ± 1% +2.78% (p=0.000 n=10+10)
Change-Id: I0a48f848eae8777d0fd6769c3a1fe449f8d9d0a6
Reviewed-on: https://go-review.googlesource.com/c/go/+/542219
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Diffstat (limited to 'src/runtime/mbitmap_allocheaders.go')
| -rw-r--r-- | src/runtime/mbitmap_allocheaders.go | 194 |
1 files changed, 182 insertions, 12 deletions
diff --git a/src/runtime/mbitmap_allocheaders.go b/src/runtime/mbitmap_allocheaders.go index 319d71f92f..03cec5ffcc 100644 --- a/src/runtime/mbitmap_allocheaders.go +++ b/src/runtime/mbitmap_allocheaders.go @@ -58,6 +58,7 @@ package runtime import ( + "internal/abi" "internal/goarch" "runtime/internal/sys" "unsafe" @@ -200,6 +201,29 @@ func (span *mspan) typePointersOfUnchecked(addr uintptr) typePointers { return typePointers{elem: addr, addr: addr, mask: readUintptr(gcdata), typ: typ} } +// typePointersOfType is like typePointersOf, but assumes addr points to one or more +// contiguous instances of the provided type. The provided type must not be nil and +// it must not have its type metadata encoded as a gcprog. +// +// It returns an iterator that tiles typ.GCData starting from addr. It's the caller's +// responsibility to limit iteration. +// +// nosplit because its callers are nosplit and require all their callees to be nosplit. +// +//go:nosplit +func (span *mspan) typePointersOfType(typ *abi.Type, addr uintptr) typePointers { + const doubleCheck = false + if doubleCheck && (typ == nil || typ.Kind_&kindGCProg != 0) { + throw("bad type passed to typePointersOfType") + } + if span.spanclass.noscan() { + return typePointers{} + } + // Since we have the type, pretend we have a header. + gcdata := typ.GCData + return typePointers{elem: addr, addr: addr, mask: readUintptr(gcdata), typ: typ} +} + // nextFast is the fast path of next. nextFast is written to be inlineable and, // as the name implies, fast. // @@ -368,15 +392,30 @@ func (span *mspan) objBase(addr uintptr) uintptr { // The caller is also responsible for cgo pointer checks if this // may be writing Go pointers into non-Go memory. // -// The pointer bitmap is not maintained for allocations containing +// Pointer data is not maintained for allocations containing // no pointers at all; any caller of bulkBarrierPreWrite must first // make sure the underlying allocation contains pointers, usually // by checking typ.PtrBytes. // +// The typ argument is the type of the space at src and dst (and the +// element type if src and dst refer to arrays) and it is optional. +// If typ is nil, the barrier will still behave as expected and typ +// is used purely as an optimization. However, it must be used with +// care. +// +// If typ is not nil, then src and dst must point to one or more values +// of type typ. The caller must ensure that the ranges [src, src+size) +// and [dst, dst+size) refer to one or more whole values of type src and +// dst (leaving off the pointerless tail of the space is OK). If this +// precondition is not followed, this function will fail to scan the +// right pointers. +// +// When in doubt, pass nil for typ. That is safe and will always work. +// // Callers must perform cgo checks if goexperiment.CgoCheck2. // //go:nosplit -func bulkBarrierPreWrite(dst, src, size uintptr) { +func bulkBarrierPreWrite(dst, src, size uintptr, typ *abi.Type) { if (dst|src|size)&(goarch.PtrSize-1) != 0 { throw("bulkBarrierPreWrite: unaligned arguments") } @@ -411,7 +450,18 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { } buf := &getg().m.p.ptr().wbBuf - tp := s.typePointersOf(dst, size) + // Double-check that the bitmaps generated in the two possible paths match. + const doubleCheck = false + if doubleCheck { + doubleCheckTypePointersOfType(s, typ, dst, size) + } + + var tp typePointers + if typ != nil && typ.Kind_&kindGCProg == 0 { + tp = s.typePointersOfType(typ, dst) + } else { + tp = s.typePointersOf(dst, size) + } if src == 0 { for { var addr uintptr @@ -446,8 +496,12 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { // This is used for special cases where e.g. dst was just // created and zeroed with malloc. // +// The type of the space can be provided purely as an optimization. +// See bulkBarrierPreWrite's comment for more details -- use this +// optimization with great care. +// //go:nosplit -func bulkBarrierPreWriteSrcOnly(dst, src, size uintptr) { +func bulkBarrierPreWriteSrcOnly(dst, src, size uintptr, typ *abi.Type) { if (dst|src|size)&(goarch.PtrSize-1) != 0 { throw("bulkBarrierPreWrite: unaligned arguments") } @@ -455,7 +509,20 @@ func bulkBarrierPreWriteSrcOnly(dst, src, size uintptr) { return } buf := &getg().m.p.ptr().wbBuf - tp := spanOf(dst).typePointersOf(dst, size) + s := spanOf(dst) + + // Double-check that the bitmaps generated in the two possible paths match. + const doubleCheck = false + if doubleCheck { + doubleCheckTypePointersOfType(s, typ, dst, size) + } + + var tp typePointers + if typ != nil && typ.Kind_&kindGCProg == 0 { + tp = s.typePointersOfType(typ, dst) + } else { + tp = s.typePointersOf(dst, size) + } for { var addr uintptr if tp, addr = tp.next(dst + size); addr == 0 { @@ -993,6 +1060,52 @@ func doubleCheckHeapPointersInterior(x, interior, size, dataSize uintptr, typ *_ throw("heapSetType: pointer entry not correct") } +//go:nosplit +func doubleCheckTypePointersOfType(s *mspan, typ *_type, addr, size uintptr) { + if typ == nil || typ.Kind_&kindGCProg != 0 { + return + } + if typ.Kind_&kindMask == kindInterface { + // Interfaces are unfortunately inconsistently handled + // when it comes to the type pointer, so it's easy to + // produce a lot of false positives here. + return + } + tp0 := s.typePointersOfType(typ, addr) + tp1 := s.typePointersOf(addr, size) + failed := false + for { + var addr0, addr1 uintptr + tp0, addr0 = tp0.next(addr + size) + tp1, addr1 = tp1.next(addr + size) + if addr0 != addr1 { + failed = true + break + } + if addr0 == 0 { + break + } + } + if failed { + tp0 := s.typePointersOfType(typ, addr) + tp1 := s.typePointersOf(addr, size) + print("runtime: addr=", hex(addr), " size=", size, "\n") + print("runtime: type=", toRType(typ).string(), "\n") + dumpTypePointers(tp0) + dumpTypePointers(tp1) + for { + var addr0, addr1 uintptr + tp0, addr0 = tp0.next(addr + size) + tp1, addr1 = tp1.next(addr + size) + print("runtime: ", hex(addr0), " ", hex(addr1), "\n") + if addr0 == 0 && addr1 == 0 { + break + } + } + throw("mismatch between typePointersOfType and typePointersOf") + } +} + func dumpTypePointers(tp typePointers) { print("runtime: tp.elem=", hex(tp.elem), " tp.typ=", unsafe.Pointer(tp.typ), "\n") print("runtime: tp.addr=", hex(tp.addr), " tp.mask=") @@ -1015,12 +1128,19 @@ func getgcmask(ep any) (mask []byte) { e := *efaceOf(&ep) p := e.data t := e._type + + var et *_type + if t.Kind_&kindMask != kindPtr { + throw("bad argument to getgcmask: expected type to be a pointer to the value type whose mask is being queried") + } + et = (*ptrtype)(unsafe.Pointer(t)).Elem + // data or bss for _, datap := range activeModules() { // data if datap.data <= uintptr(p) && uintptr(p) < datap.edata { bitmap := datap.gcdatamask.bytedata - n := (*ptrtype)(unsafe.Pointer(t)).Elem.Size_ + n := et.Size_ mask = make([]byte, n/goarch.PtrSize) for i := uintptr(0); i < n; i += goarch.PtrSize { off := (uintptr(p) + i - datap.data) / goarch.PtrSize @@ -1032,7 +1152,7 @@ func getgcmask(ep any) (mask []byte) { // bss if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss { bitmap := datap.gcbssmask.bytedata - n := (*ptrtype)(unsafe.Pointer(t)).Elem.Size_ + n := et.Size_ mask = make([]byte, n/goarch.PtrSize) for i := uintptr(0); i < n; i += goarch.PtrSize { off := (uintptr(p) + i - datap.bss) / goarch.PtrSize @@ -1056,13 +1176,13 @@ func getgcmask(ep any) (mask []byte) { base = tp.addr // Unroll the full bitmap the GC would actually observe. - mask = make([]byte, (limit-base)/goarch.PtrSize) + maskFromHeap := make([]byte, (limit-base)/goarch.PtrSize) for { var addr uintptr if tp, addr = tp.next(limit); addr == 0 { break } - mask[(addr-base)/goarch.PtrSize] = 1 + maskFromHeap[(addr-base)/goarch.PtrSize] = 1 } // Double-check that every part of the ptr/scalar we're not @@ -1074,11 +1194,61 @@ func getgcmask(ep any) (mask []byte) { } } - // Callers expect this mask to end at the last pointer. - for len(mask) > 0 && mask[len(mask)-1] == 0 { - mask = mask[:len(mask)-1] + // Callers (and a check we're about to run) expects this mask + // to end at the last pointer. + for len(maskFromHeap) > 0 && maskFromHeap[len(maskFromHeap)-1] == 0 { + maskFromHeap = maskFromHeap[:len(maskFromHeap)-1] } + if et.Kind_&kindGCProg == 0 { + // Unroll again, but this time from the type information. + maskFromType := make([]byte, (limit-base)/goarch.PtrSize) + tp = s.typePointersOfType(et, base) + for { + var addr uintptr + if tp, addr = tp.next(limit); addr == 0 { + break + } + maskFromType[(addr-base)/goarch.PtrSize] = 1 + } + + // Validate that the prefix of maskFromType is equal to + // maskFromHeap. maskFromType may contain more pointers than + // maskFromHeap produces because maskFromHeap may be able to + // get exact type information for certain classes of objects. + // With maskFromType, we're always just tiling the type bitmap + // through to the elemsize. + // + // It's OK if maskFromType has pointers in elemsize that extend + // past the actual populated space; we checked above that all + // that space is zeroed, so just the GC will just see nil pointers. + differs := false + for i := range maskFromHeap { + if maskFromHeap[i] != maskFromType[i] { + differs = true + break + } + } + + if differs { + print("runtime: heap mask=") + for _, b := range maskFromHeap { + print(b) + } + println() + print("runtime: type mask=") + for _, b := range maskFromType { + print(b) + } + println() + print("runtime: type=", toRType(et).string(), "\n") + throw("found two different masks from two different methods") + } + } + + // Select the heap mask to return. We may not have a type mask. + mask = maskFromHeap + // Make sure we keep ep alive. We may have stopped referencing // ep's data pointer sometime before this point and it's possible // for that memory to get freed. |
