aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/mpallocbits.go
diff options
context:
space:
mode:
authorKeith Randall <khr@golang.org>2020-07-03 11:28:50 -0700
committerKeith Randall <khr@golang.org>2020-08-17 21:58:31 +0000
commit4e5ed83e8d2fbbbc8f6524f40ab3b6733dc57a38 (patch)
treeb2337198172af9cd4b279c31d9757e822c68a2bc /src/runtime/mpallocbits.go
parent88c094c96a164aef2134e548d495c4bc14dc4687 (diff)
downloadgo-4e5ed83e8d2fbbbc8f6524f40ab3b6733dc57a38.tar.xz
runtime: use bit-parallel operations to compute heap bit summaries
The new implementation is much faster in all cases. name old time/op new time/op delta PallocBitsSummarize/Unpacked00-16 142ns ± 1% 7ns ± 2% -94.75% (p=0.000 n=10+9) PallocBitsSummarize/UnpackedFFFFFFFFFFFFFFFF-16 172ns ± 0% 24ns ± 0% -86.02% (p=0.000 n=9+9) PallocBitsSummarize/UnpackedAA-16 145ns ± 0% 32ns ± 0% -78.16% (p=0.000 n=8+10) PallocBitsSummarize/UnpackedAAAAAAAAAAAAAAAA-16 172ns ± 0% 33ns ± 0% -80.95% (p=0.000 n=9+9) PallocBitsSummarize/Unpacked80000000AAAAAAAA-16 162ns ± 1% 60ns ± 0% -62.69% (p=0.000 n=10+9) PallocBitsSummarize/UnpackedAAAAAAAA00000001-16 163ns ± 0% 68ns ± 1% -58.47% (p=0.000 n=8+10) PallocBitsSummarize/UnpackedBBBBBBBBBBBBBBBB-16 172ns ± 0% 35ns ± 0% -79.70% (p=0.000 n=9+9) PallocBitsSummarize/Unpacked80000000BBBBBBBB-16 161ns ± 0% 63ns ± 0% -60.61% (p=0.000 n=8+10) PallocBitsSummarize/UnpackedBBBBBBBB00000001-16 163ns ± 0% 60ns ± 0% -63.14% (p=0.000 n=9+10) PallocBitsSummarize/UnpackedCCCCCCCCCCCCCCCC-16 172ns ± 0% 39ns ± 0% -77.41% (p=0.000 n=7+10) PallocBitsSummarize/Unpacked4444444444444444-16 172ns ± 0% 39ns ± 0% -77.42% (p=0.000 n=7+10) PallocBitsSummarize/Unpacked4040404040404040-16 173ns ± 2% 51ns ± 1% -70.55% (p=0.000 n=10+10) PallocBitsSummarize/Unpacked4000400040004000-16 160ns ± 1% 53ns ± 0% -66.78% (p=0.000 n=10+10) PallocBitsSummarize/Unpacked1000404044CCAAFF-16 169ns ± 1% 59ns ± 1% -65.28% (p=0.000 n=10+10) Change-Id: I94daa645b76a9cf9c93edeb2058d7132216fcb72 Reviewed-on: https://go-review.googlesource.com/c/go/+/240900 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
Diffstat (limited to 'src/runtime/mpallocbits.go')
-rw-r--r--src/runtime/mpallocbits.go145
1 files changed, 83 insertions, 62 deletions
diff --git a/src/runtime/mpallocbits.go b/src/runtime/mpallocbits.go
index a8011341bc..ff79bfbc1a 100644
--- a/src/runtime/mpallocbits.go
+++ b/src/runtime/mpallocbits.go
@@ -120,78 +120,99 @@ func (b *pageBits) popcntRange(i, n uint) (s uint) {
// sake of documentation, 0s are free pages and 1s are allocated pages.
type pallocBits pageBits
-// consec8tab is a table containing the number of consecutive
-// zero bits for any uint8 value.
-//
-// The table is generated by calling consec8(i) for each
-// possible uint8 value, which is defined as:
-//
-// // consec8 counts the maximum number of consecutive 0 bits
-// // in a uint8.
-// func consec8(n uint8) int {
-// n = ^n
-// i := 0
-// for n != 0 {
-// n &= (n << 1)
-// i++
-// }
-// return i
-// }
-var consec8tab = [256]uint{
- 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
- 4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
- 6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
- 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
- 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
- 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
- 7, 6, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
- 4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
- 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
- 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
- 6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
- 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
- 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
- 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 0,
-}
-
// summarize returns a packed summary of the bitmap in pallocBits.
func (b *pallocBits) summarize() pallocSum {
- // TODO(mknyszek): There may be something more clever to be done
- // here to make the summarize operation more efficient. For example,
- // we can compute start and end with 64-bit wide operations easily,
- // but max is a bit more complex. Perhaps there exists some way to
- // leverage the 64-bit start and end to our advantage?
- var start, max, end uint
+ var start, max, cur uint
+ const notSetYet = ^uint(0) // sentinel for start value
+ start = notSetYet
for i := 0; i < len(b); i++ {
- a := b[i]
- for j := 0; j < 64; j += 8 {
- k := uint8(a >> j)
+ x := b[i]
+ if x == 0 {
+ cur += 64
+ continue
+ }
+ t := uint(sys.TrailingZeros64(x))
+ l := uint(sys.LeadingZeros64(x))
- // Compute start.
- si := uint(sys.TrailingZeros8(k))
- if start == uint(i*64+j) {
- start += si
- }
+ // Finish any region spanning the uint64s
+ cur += t
+ if start == notSetYet {
+ start = cur
+ }
+ if cur > max {
+ max = cur
+ }
+ // Final region that might span to next uint64
+ cur = l
+ }
+ if start == notSetYet {
+ // Made it all the way through without finding a single 1 bit.
+ const n = uint(64 * len(b))
+ return packPallocSum(n, n, n)
+ }
+ if cur > max {
+ max = cur
+ }
+ if max >= 64-2 {
+ // There is no way an internal run of zeros could beat max.
+ return packPallocSum(start, max, cur)
+ }
+ // Now look inside each uint64 for runs of zeros.
+ // All uint64s must be nonzero, or we would have aborted above.
+outer:
+ for i := 0; i < len(b); i++ {
+ x := b[i]
- // Compute max.
- if end+si > max {
- max = end + si
- }
- if mi := consec8tab[k]; mi > max {
- max = mi
+ // Look inside this uint64. We have a pattern like
+ // 000000 1xxxxx1 000000
+ // We need to look inside the 1xxxxx1 for any contiguous
+ // region of zeros.
+
+ // We already know the trailing zeros are no larger than max. Remove them.
+ x >>= sys.TrailingZeros64(x) & 63
+ if x&(x+1) == 0 { // no more zeros (except at the top).
+ continue
+ }
+
+ // Strategy: shrink all runs of zeros by max. If any runs of zero
+ // remain, then we've identified a larger maxiumum zero run.
+ p := max // number of zeros we still need to shrink by.
+ k := uint(1) // current minimum length of runs of ones in x.
+ for {
+ // Shrink all runs of zeros by p places (except the top zeros).
+ for p > 0 {
+ if p <= k {
+ // Shift p ones down into the top of each run of zeros.
+ x |= x >> (p & 63)
+ if x&(x+1) == 0 { // no more zeros (except at the top).
+ continue outer
+ }
+ break
+ }
+ // Shift k ones down into the top of each run of zeros.
+ x |= x >> (k & 63)
+ if x&(x+1) == 0 { // no more zeros (except at the top).
+ continue outer
+ }
+ p -= k
+ // We've just doubled the minimum length of 1-runs.
+ // This allows us to shift farther in the next iteration.
+ k *= 2
}
- // Compute end.
- if k == 0 {
- end += 8
- } else {
- end = uint(sys.LeadingZeros8(k))
+ // The length of the lowest-order zero run is an increment to our maximum.
+ j := uint(sys.TrailingZeros64(^x)) // count contiguous trailing ones
+ x >>= j & 63 // remove trailing ones
+ j = uint(sys.TrailingZeros64(x)) // count contiguous trailing zeros
+ x >>= j & 63 // remove zeros
+ max += j // we have a new maximum!
+ if x&(x+1) == 0 { // no more zeros (except at the top).
+ continue outer
}
+ p = j // remove j more zeros from each zero run.
}
}
- return packPallocSum(start, max, end)
+ return packPallocSum(start, max, cur)
}
// find searches for npages contiguous free pages in pallocBits and returns