diff options
| author | Michael Pratt <mpratt@google.com> | 2024-11-04 12:41:33 -0500 |
|---|---|---|
| committer | Gopher Robot <gobot@golang.org> | 2024-11-20 22:41:14 +0000 |
| commit | dce30a19200d06b778fa90c96231b3cf71fe72d8 (patch) | |
| tree | 539ee3c413bb3c2b1b0acc756660792ba92fecf5 /src/internal/runtime | |
| parent | b8ba5b440b1f84920e80852ec984520046adaf3a (diff) | |
| download | go-dce30a19200d06b778fa90c96231b3cf71fe72d8.tar.xz | |
cmd/compile: intrinsify swissmap match calls with SIMD on amd64
Use similar SIMD operations to the ones used in Abseil. We are still
using 8-slot groups (even though the XMM registers could handle 16-slot
groups) to keep the implementation simpler (no changes to the memory
layout of maps).
Still, the implementations of matchH2 and matchEmpty are shorter than
the portable version using standard arithmetic operations. They also
return a packed bitset, which avoids the need to shift in bitset.first.
That said, the packed bitset is a downside in cognitive complexity, as
we have to think about two different possible representations. This
doesn't leak out of the API, but we do need to intrinsify bitset to
switch to a compatible implementation.
The compiler's intrinsics don't support intrinsifying methods, so the
implementations move to free functions.
This makes operations between 0-3% faster on my machine. e.g.,
MapGetHit/impl=runtimeMap/t=Int64/len=6-12 12.34n ± 1% 11.42n ± 1% -7.46% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=12-12 15.14n ± 2% 14.88n ± 1% -1.72% (p=0.009 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=18-12 15.04n ± 6% 14.66n ± 2% -2.53% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=24-12 15.80n ± 1% 15.48n ± 3% ~ (p=0.444 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=30-12 15.55n ± 4% 14.77n ± 3% -5.02% (p=0.004 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=64-12 15.26n ± 1% 15.05n ± 1% ~ (p=0.055 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=128-12 15.34n ± 1% 15.02n ± 2% -2.09% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=256-12 15.42n ± 1% 15.15n ± 1% -1.75% (p=0.001 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=512-12 15.48n ± 1% 15.18n ± 1% -1.94% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=1024-12 17.38n ± 1% 17.05n ± 1% -1.90% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=2048-12 17.96n ± 0% 17.59n ± 1% -2.06% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=4096-12 18.36n ± 1% 18.18n ± 1% -0.98% (p=0.013 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=8192-12 18.75n ± 0% 18.31n ± 1% -2.35% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=65536-12 26.25n ± 0% 25.95n ± 1% -1.14% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=262144-12 44.24n ± 1% 44.06n ± 1% ~ (p=0.181 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=1048576-12 85.02n ± 0% 85.35n ± 0% +0.39% (p=0.032 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=4194304-12 98.87n ± 1% 98.85n ± 1% ~ (p=0.799 n=25)
For #54766.
Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-amd64-goamd64v3
Change-Id: Ic1b852f02744404122cb3672900fd95f4625905e
Reviewed-on: https://go-review.googlesource.com/c/go/+/626277
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Michael Pratt <mpratt@google.com>
Reviewed-by: Keith Randall <khr@google.com>
Diffstat (limited to 'src/internal/runtime')
| -rw-r--r-- | src/internal/runtime/maps/group.go | 97 | ||||
| -rw-r--r-- | src/internal/runtime/maps/runtime_fast32_swiss.go | 8 | ||||
| -rw-r--r-- | src/internal/runtime/maps/runtime_fast64_swiss.go | 8 |
3 files changed, 97 insertions, 16 deletions
diff --git a/src/internal/runtime/maps/group.go b/src/internal/runtime/maps/group.go index 59df1fb25a..f3e9d4d12b 100644 --- a/src/internal/runtime/maps/group.go +++ b/src/internal/runtime/maps/group.go @@ -30,33 +30,82 @@ const ( // bitset represents a set of slots within a group. // -// The underlying representation uses one byte per slot, where each byte is +// The underlying representation depends on GOARCH. +// +// On AMD64, bitset uses one bit per slot, where the bit is set if the slot is +// part of the set. All of the ctrlGroup.match* methods are replaced with +// intrinsics that return this packed representation. +// +// On other architectures, bitset uses one byte per slot, where each byte is // either 0x80 if the slot is part of the set or 0x00 otherwise. This makes it -// convenient to calculate for an entire group at once (e.g. see matchEmpty). +// convenient to calculate for an entire group at once using standard +// arithemetic instructions. type bitset uint64 -// first assumes that only the MSB of each control byte can be set (e.g. bitset -// is the result of matchEmpty or similar) and returns the relative index of the -// first control byte in the group that has the MSB set. +// first returns the relative index of the first control byte in the group that +// is in the set. // -// Returns abi.SwissMapGroupSlots if the bitset is empty. +// Preconditions: b is not 0 (empty). func (b bitset) first() uintptr { + return bitsetFirst(b) +} + +// Portable implementation of first. +// +// On AMD64, this is replaced with an intrisic that simply does +// TrailingZeros64. There is no need to shift as the bitset is packed. +func bitsetFirst(b bitset) uintptr { return uintptr(sys.TrailingZeros64(uint64(b))) >> 3 } -// removeFirst removes the first set bit (that is, resets the least significant +// removeFirst clears the first set bit (that is, resets the least significant // set bit to 0). 
func (b bitset) removeFirst() bitset { return b & (b - 1) } -// removeBelow removes all set bits below slot i (non-inclusive). +// removeBelow clears all set bits below slot i (non-inclusive). func (b bitset) removeBelow(i uintptr) bitset { + return bitsetRemoveBelow(b, i) +} + +// Portable implementation of removeBelow. +// +// On AMD64, this is replaced with an intrisic that clears the lower i bits. +func bitsetRemoveBelow(b bitset, i uintptr) bitset { // Clear all bits below slot i's byte. mask := (uint64(1) << (8 * uint64(i))) - 1 return b &^ bitset(mask) } +// lowestSet returns true if the bit is set for the lowest index in the bitset. +// +// This is intended for use with shiftOutLowest to loop over all entries in the +// bitset regardless of whether they are set. +func (b bitset) lowestSet() bool { + return bitsetLowestSet(b) +} + +// Portable implementation of lowestSet. +// +// On AMD64, this is replaced with an intrisic that checks the lowest bit. +func bitsetLowestSet(b bitset) bool { + return b&(1<<7) != 0 +} + +// shiftOutLowest shifts the lowest entry out of the bitset. Afterwards, the +// lowest entry in the bitset corresponds to the next slot. +func (b bitset) shiftOutLowest() bitset { + return bitsetShiftOutLowest(b) +} + +// Portable implementation of shiftOutLowest. +// +// On AMD64, this is replaced with an intrisic that shifts a single bit. +func bitsetShiftOutLowest(b bitset) bitset { + return b >> 8 +} + // Each slot in the hash table has a control byte which can have one of three // states: empty, deleted, and full. They have the following bit patterns: // @@ -96,6 +145,14 @@ func (g *ctrlGroup) setEmpty() { // matchH2 returns the set of slots which are full and for which the 7-bit hash // matches the given value. May return false positives. func (g ctrlGroup) matchH2(h uintptr) bitset { + return ctrlGroupMatchH2(g, h) +} + +// Portable implementation of matchH2. 
+// +// Note: On AMD64, this is an intrinsic implemented with SIMD instructions. See +// note on bitset about the packed instrinsified return value. +func ctrlGroupMatchH2(g ctrlGroup, h uintptr) bitset { // NB: This generic matching routine produces false positive matches when // h is 2^N and the control bytes have a seq of 2^N followed by 2^N+1. For // example: if ctrls==0x0302 and h=02, we'll compute v as 0x0100. When we @@ -110,6 +167,14 @@ func (g ctrlGroup) matchH2(h uintptr) bitset { // matchEmpty returns the set of slots in the group that are empty. func (g ctrlGroup) matchEmpty() bitset { + return ctrlGroupMatchEmpty(g) +} + +// Portable implementation of matchEmpty. +// +// Note: On AMD64, this is an intrinsic implemented with SIMD instructions. See +// note on bitset about the packed instrinsified return value. +func ctrlGroupMatchEmpty(g ctrlGroup) bitset { // An empty slot is 1000 0000 // A deleted slot is 1111 1110 // A full slot is 0??? ???? @@ -123,6 +188,14 @@ func (g ctrlGroup) matchEmpty() bitset { // matchEmptyOrDeleted returns the set of slots in the group that are empty or // deleted. func (g ctrlGroup) matchEmptyOrDeleted() bitset { + return ctrlGroupMatchEmptyOrDeleted(g) +} + +// Portable implementation of matchEmptyOrDeleted. +// +// Note: On AMD64, this is an intrinsic implemented with SIMD instructions. See +// note on bitset about the packed instrinsified return value. +func ctrlGroupMatchEmptyOrDeleted(g ctrlGroup) bitset { // An empty slot is 1000 0000 // A deleted slot is 1111 1110 // A full slot is 0??? ???? @@ -134,6 +207,14 @@ func (g ctrlGroup) matchEmptyOrDeleted() bitset { // matchFull returns the set of slots in the group that are full. func (g ctrlGroup) matchFull() bitset { + return ctrlGroupMatchFull(g) +} + +// Portable implementation of matchFull. +// +// Note: On AMD64, this is an intrinsic implemented with SIMD instructions. See +// note on bitset about the packed instrinsified return value. 
+func ctrlGroupMatchFull(g ctrlGroup) bitset { // An empty slot is 1000 0000 // A deleted slot is 1111 1110 // A full slot is 0??? ???? diff --git a/src/internal/runtime/maps/runtime_fast32_swiss.go b/src/internal/runtime/maps/runtime_fast32_swiss.go index 15facbfe8a..46023cc9b7 100644 --- a/src/internal/runtime/maps/runtime_fast32_swiss.go +++ b/src/internal/runtime/maps/runtime_fast32_swiss.go @@ -38,12 +38,12 @@ func runtime_mapaccess1_fast32(typ *abi.SwissMapType, m *Map, key uint32) unsafe slotKey := g.key(typ, 0) slotSize := typ.SlotSize for full != 0 { - if key == *(*uint32)(slotKey) && full&(1<<7) != 0 { + if key == *(*uint32)(slotKey) && full.lowestSet() { slotElem := unsafe.Pointer(uintptr(slotKey) + typ.ElemOff) return slotElem } slotKey = unsafe.Pointer(uintptr(slotKey) + slotSize) - full >>= 8 + full = full.shiftOutLowest() } return unsafe.Pointer(&zeroVal[0]) } @@ -107,12 +107,12 @@ func runtime_mapaccess2_fast32(typ *abi.SwissMapType, m *Map, key uint32) (unsaf slotKey := g.key(typ, 0) slotSize := typ.SlotSize for full != 0 { - if key == *(*uint32)(slotKey) && full&(1<<7) != 0 { + if key == *(*uint32)(slotKey) && full.lowestSet() { slotElem := unsafe.Pointer(uintptr(slotKey) + typ.ElemOff) return slotElem, true } slotKey = unsafe.Pointer(uintptr(slotKey) + slotSize) - full >>= 8 + full = full.shiftOutLowest() } return unsafe.Pointer(&zeroVal[0]), false } diff --git a/src/internal/runtime/maps/runtime_fast64_swiss.go b/src/internal/runtime/maps/runtime_fast64_swiss.go index f08e7ef869..6bc6b2f0b1 100644 --- a/src/internal/runtime/maps/runtime_fast64_swiss.go +++ b/src/internal/runtime/maps/runtime_fast64_swiss.go @@ -38,12 +38,12 @@ func runtime_mapaccess1_fast64(typ *abi.SwissMapType, m *Map, key uint64) unsafe slotKey := g.key(typ, 0) slotSize := typ.SlotSize for full != 0 { - if key == *(*uint64)(slotKey) && full&(1<<7) != 0 { + if key == *(*uint64)(slotKey) && full.lowestSet() { slotElem := unsafe.Pointer(uintptr(slotKey) + 8) return slotElem } 
slotKey = unsafe.Pointer(uintptr(slotKey) + slotSize) - full >>= 8 + full = full.shiftOutLowest() } return unsafe.Pointer(&zeroVal[0]) } @@ -107,12 +107,12 @@ func runtime_mapaccess2_fast64(typ *abi.SwissMapType, m *Map, key uint64) (unsaf slotKey := g.key(typ, 0) slotSize := typ.SlotSize for full != 0 { - if key == *(*uint64)(slotKey) && full&(1<<7) != 0 { + if key == *(*uint64)(slotKey) && full.lowestSet() { slotElem := unsafe.Pointer(uintptr(slotKey) + 8) return slotElem, true } slotKey = unsafe.Pointer(uintptr(slotKey) + slotSize) - full >>= 8 + full = full.shiftOutLowest() } return unsafe.Pointer(&zeroVal[0]), false } |
