aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlan Donovan <adonovan@google.com>2026-01-29 18:14:06 -0500
committerAlan Donovan <adonovan@google.com>2026-04-07 15:17:06 -0700
commit230d1407b04da5d65505415b1a7b4f6d70c75086 (patch)
tree34f6d97f886f75338b82a9ca21f8bd12073916b6
parente3b3216242691ab0e02164458bda98346ce36adf (diff)
downloadgo-230d1407b04da5d65505415b1a7b4f6d70c75086.tar.xz
hash/maphash: add BloomFilter example
This CL adds an example of using the maphash package to construct a Bloom filter, a compact probabilistic set of values whose hash function and equivalence relation are specified by a Hasher. Change-Id: I1545d75e23c1456178098d09487ba9fed03e5a27 Reviewed-on: https://go-review.googlesource.com/c/go/+/740440 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Robert Griesemer <gri@google.com>
-rw-r--r--src/hash/maphash/example_bloom_test.go157
1 files changed, 157 insertions, 0 deletions
diff --git a/src/hash/maphash/example_bloom_test.go b/src/hash/maphash/example_bloom_test.go
new file mode 100644
index 0000000000..2a2a27e1e4
--- /dev/null
+++ b/src/hash/maphash/example_bloom_test.go
@@ -0,0 +1,157 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package maphash_test
+
+// This file demonstrates a Bloom filter.
+
+import (
+ "fmt"
+ "hash/maphash"
+ "math"
+ "math/bits"
+)
+
+// BloomFilter is a Bloom filter, a probabilistic space-efficient
+// representation of a set of values of type V based on hashing.
+//
+// It provides two operations: Insert inserts an element and Contains
+// queries whether a value is a member of the set.
+//
+// However, unlike a typical set, the size is independent of the
+// number of elements. The catch: Contains is permitted to report
+// true even for some elements that are not present. The trade-off
+// between space and accuracy is determined by parameters at
+// construction.
+type BloomFilter[V any] struct {
+ hasher maphash.Hasher[V]
+ seeds []maphash.Seed // each seed determines a hash function
+ bytes []byte // bit vector
+}
+
+// NewBloomFilterComparable returns a new BloomFilter for the
+// specified elements using their natural comparison.
+func NewComparableBloomFilter[V comparable](n int, fpProb float64) *BloomFilter[V] {
+ return NewBloomFilter(maphash.ComparableHasher[V]{}, n, fpProb)
+}
+
+// NewBloomFilter constructs a new BloomFilter capable of holding n
+// elements with the specified probability of false positive results,
+// assuming a well dispersed hash function.
+func NewBloomFilter[V any](hasher maphash.Hasher[V], n int, fpProb float64) *BloomFilter[V] {
+ nbytes, nseeds := calibrate(n, fpProb)
+
+ seeds := make([]maphash.Seed, nseeds)
+ for i := range nseeds {
+ seeds[i] = maphash.MakeSeed()
+ }
+
+ return &BloomFilter[V]{
+ hasher: hasher,
+ bytes: make([]byte, nbytes),
+ seeds: seeds,
+ }
+}
+
+func (f *BloomFilter[V]) Contains(v V) bool {
+ for _, seed := range f.seeds {
+ index, bit := f.locate(seed, v)
+ if f.bytes[index]&bit == 0 {
+ return false
+ }
+ }
+ return true
+}
+
+func (f *BloomFilter[V]) Insert(v V) {
+ for _, seed := range f.seeds {
+ index, bit := f.locate(seed, v)
+ f.bytes[index] |= bit
+ }
+}
+
+func (f *BloomFilter[V]) locate(seed maphash.Seed, v V) (uint64, byte) {
+ // Optimization note: the dynamic call to hasher.Hash causes h
+ // to escape. You can use a sync.Pool can help mitigate the
+ // allocation cost.
+ //
+ // Alternatively, you could copy and specialize the filter logic
+ // for a specific implementation of maphash.Hasher, allowing
+ // the compiler's escape analysis to determine that the
+ // hasher.Hash call does not in fact cause h to escape.
+ var h maphash.Hash
+ h.SetSeed(seed)
+ f.hasher.Hash(&h, v)
+ hash := h.Sum64()
+
+ index := reduce(hash, uint64(len(f.bytes)))
+ mask := byte(1 << (hash % 8))
+ return index, mask
+}
+
+// reduce maps hash to a value in the interval [0, n).
+// See https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction
+func reduce(hash, n uint64) uint64 {
+ if n > 1<<32-1 {
+ hi, _ := bits.Mul64(hash, n)
+ return hi
+ }
+ return hash >> 32 * n >> 32
+}
+
+func calibrate(n int, fpProb float64) (bytes, seeds int) {
+ // Following https://en.wikipedia.org/wiki/Bloom_filter:
+ // - k is the number of hash functions,
+ // - m is the size of the bit field;
+ // - n is the number of set bits.
+
+ if n == 0 {
+ return 1, 1
+ }
+
+ logFpProb := math.Log(fpProb)
+ m := -(float64(n) * logFpProb) / (math.Ln2 * math.Ln2)
+
+ // Round up to a byte.
+ // TODO(adonovan): opt: round up to the next allocation size
+ // class (see bytes.growSlice) and then compute the possibly
+ // smaller number of needed seeds based on this higher number.
+ bytes = int(m) / 8
+ if float64(bytes*8) < m {
+ bytes++
+ }
+
+ k := -logFpProb / math.Ln2
+ seeds = max(int(math.Round(k)), 1)
+ return bytes, seeds
+}
+
+func Example_bloomFilter() {
+ // Create a Bloom filter optimized for 2 elements with
+ // a one-in-a-billion false-positive rate.
+ //
+ // (This low rate demands a lot of space: 88 bits and
+ // 30 hash functions. More typical rates are 1-5%;
+ // at 5%, we need only 16 bits and 4 hash functions.)
+ f := NewComparableBloomFilter[string](2, 1e-9)
+
+ // Insert two elements.
+ f.Insert("apple")
+ f.Insert("banana")
+
+ // Check whether elements are present.
+ //
+ // "cherry" was not inserted, but Contains is probabilistic, so
+ // this test will spuriously report Contains("cherry") = true
+ // about once every billion runs.
+ for _, fruit := range []string{"apple", "banana", "cherry"} {
+ fmt.Printf("Contains(%q) = %v\n", fruit, f.Contains(fruit))
+ }
+
+ // Output:
+ //
+ // Contains("apple") = true
+ // Contains("banana") = true
+ // Contains("cherry") = false
+}