diff options
| author | Alan Donovan <adonovan@google.com> | 2026-01-29 18:14:06 -0500 |
|---|---|---|
| committer | Alan Donovan <adonovan@google.com> | 2026-04-07 15:17:06 -0700 |
| commit | 230d1407b04da5d65505415b1a7b4f6d70c75086 (patch) | |
| tree | 34f6d97f886f75338b82a9ca21f8bd12073916b6 | |
| parent | e3b3216242691ab0e02164458bda98346ce36adf (diff) | |
| download | go-230d1407b04da5d65505415b1a7b4f6d70c75086.tar.xz | |
hash/maphash: add BloomFilter example
This CL adds an example of using the maphash package to construct
a Bloom filter, a compact probabilistic set of values whose
hash function and equivalence relation are specified by a Hasher.
Change-Id: I1545d75e23c1456178098d09487ba9fed03e5a27
Reviewed-on: https://go-review.googlesource.com/c/go/+/740440
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Robert Griesemer <gri@google.com>
| -rw-r--r-- | src/hash/maphash/example_bloom_test.go | 157 |
1 files changed, 157 insertions, 0 deletions
diff --git a/src/hash/maphash/example_bloom_test.go b/src/hash/maphash/example_bloom_test.go new file mode 100644 index 0000000000..2a2a27e1e4 --- /dev/null +++ b/src/hash/maphash/example_bloom_test.go @@ -0,0 +1,157 @@ +// Copyright 2026 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package maphash_test + +// This file demonstrates a Bloom filter. + +import ( + "fmt" + "hash/maphash" + "math" + "math/bits" +) + +// BloomFilter is a Bloom filter, a probabilistic space-efficient +// representation of a set of values of type V based on hashing. +// +// It provides two operations: Insert inserts an element and Contains +// queries whether a value is a member of the set. +// +// However, unlike a typical set, the size is independent of the +// number of elements. The catch: Contains is permitted to report +// true even for some elements that are not present. The trade-off +// between space and accuracy is determined by parameters at +// construction. +type BloomFilter[V any] struct { + hasher maphash.Hasher[V] + seeds []maphash.Seed // each seed determines a hash function + bytes []byte // bit vector +} + +// NewBloomFilterComparable returns a new BloomFilter for the +// specified elements using their natural comparison. +func NewComparableBloomFilter[V comparable](n int, fpProb float64) *BloomFilter[V] { + return NewBloomFilter(maphash.ComparableHasher[V]{}, n, fpProb) +} + +// NewBloomFilter constructs a new BloomFilter capable of holding n +// elements with the specified probability of false positive results, +// assuming a well dispersed hash function. +func NewBloomFilter[V any](hasher maphash.Hasher[V], n int, fpProb float64) *BloomFilter[V] { + nbytes, nseeds := calibrate(n, fpProb) + + seeds := make([]maphash.Seed, nseeds) + for i := range nseeds { + seeds[i] = maphash.MakeSeed() + } + + return &BloomFilter[V]{ + hasher: hasher, + bytes: make([]byte, nbytes), + seeds: seeds, + } +} + +func (f *BloomFilter[V]) Contains(v V) bool { + for _, seed := range f.seeds { + index, bit := f.locate(seed, v) + if f.bytes[index]&bit == 0 { + return false + } + } + return true +} + +func (f *BloomFilter[V]) Insert(v V) { + for _, seed := range f.seeds { + index, bit := f.locate(seed, v) + f.bytes[index] |= bit + } +} + +func (f *BloomFilter[V]) locate(seed maphash.Seed, v V) (uint64, byte) { + // Optimization note: the dynamic call to hasher.Hash causes h + // to escape. You can use a sync.Pool can help mitigate the + // allocation cost. + // + // Alternatively, you could copy and specialize the filter logic + // for a specific implementation of maphash.Hasher, allowing + // the compiler's escape analysis to determine that the + // hasher.Hash call does not in fact cause h to escape. + var h maphash.Hash + h.SetSeed(seed) + f.hasher.Hash(&h, v) + hash := h.Sum64() + + index := reduce(hash, uint64(len(f.bytes))) + mask := byte(1 << (hash % 8)) + return index, mask +} + +// reduce maps hash to a value in the interval [0, n). +// See https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction +func reduce(hash, n uint64) uint64 { + if n > 1<<32-1 { + hi, _ := bits.Mul64(hash, n) + return hi + } + return hash >> 32 * n >> 32 +} + +func calibrate(n int, fpProb float64) (bytes, seeds int) { + // Following https://en.wikipedia.org/wiki/Bloom_filter: + // - k is the number of hash functions, + // - m is the size of the bit field; + // - n is the number of set bits. + + if n == 0 { + return 1, 1 + } + + logFpProb := math.Log(fpProb) + m := -(float64(n) * logFpProb) / (math.Ln2 * math.Ln2) + + // Round up to a byte. + // TODO(adonovan): opt: round up to the next allocation size + // class (see bytes.growSlice) and then compute the possibly + // smaller number of needed seeds based on this higher number. + bytes = int(m) / 8 + if float64(bytes*8) < m { + bytes++ + } + + k := -logFpProb / math.Ln2 + seeds = max(int(math.Round(k)), 1) + return bytes, seeds +} + +func Example_bloomFilter() { + // Create a Bloom filter optimized for 2 elements with + // a one-in-a-billion false-positive rate. + // + // (This low rate demands a lot of space: 88 bits and + // 30 hash functions. More typical rates are 1-5%; + // at 5%, we need only 16 bits and 4 hash functions.) + f := NewComparableBloomFilter[string](2, 1e-9) + + // Insert two elements. + f.Insert("apple") + f.Insert("banana") + + // Check whether elements are present. + // + // "cherry" was not inserted, but Contains is probabilistic, so + // this test will spuriously report Contains("cherry") = true + // about once every billion runs. + for _, fruit := range []string{"apple", "banana", "cherry"} { + fmt.Printf("Contains(%q) = %v\n", fruit, f.Contains(fruit)) + } + + // Output: + // + // Contains("apple") = true + // Contains("banana") = true + // Contains("cherry") = false +} |
