feat: add bloom for memory read/write

irootyou · Aug 20, 2023 · a74bd64 · a74bd64
1 parent 64fdea5
commit a74bd64
Show file tree

Hide file tree

Showing 5 changed files with 84 additions and 283 deletions.
diff --git a/db/memory/memory.go b/db/memory/memory.go
@@ -2,19 +2,23 @@ package memory
 
 import (
 	"errors"
+	"github.com/ByteStorage/FlyDB/lib/bloom"
 	"sync"
 )
 
 // MemTable is an in-memory table
 type MemTable struct {
 	table map[string][]byte // key -> value
 	mutex sync.RWMutex      // protect table
+	bloom *bloom.Filter     // bloom filter
 }
 
 // NewMemTable create a new MemTable
 func NewMemTable() *MemTable {
 	return &MemTable{
 		table: make(map[string][]byte),
+		// Size the bloom filter for 1000 expected keys at a 1% false-positive rate
+		bloom: bloom.NewBloomFilter(1000, 0.01),
 	}
 }
 
@@ -23,13 +27,20 @@ func (m *MemTable) Put(key string, value []byte) {
 	m.mutex.Lock()
 	defer m.mutex.Unlock()
 	m.table[key] = value
+	// Add the key to the bloom filter
+	m.bloom.Add([]byte(key))
 }
 
 // Get a value from the table
 func (m *MemTable) Get(key string) ([]byte, error) {
 	m.mutex.RLock()
 	defer m.mutex.RUnlock()
 
+	// Immediate return if the key is not in the bloom filter
+	if !m.bloom.MayContainItem([]byte(key)) {
+		return nil, errors.New("key not found")
+	}
+
 	value, ok := m.table[key]
 	if !ok {
 		return nil, errors.New("key not found")
@@ -39,6 +50,8 @@ func (m *MemTable) Get(key string) ([]byte, error) {
 }
 
 // Delete a key from the table
+// Note: the key is deliberately left in the bloom filter, because clearing its
+// bits could produce false negatives for other keys that share them.
 func (m *MemTable) Delete(key string) {
 	m.mutex.Lock()
 	defer m.mutex.Unlock()

diff --git a/go.mod b/go.mod
@@ -18,9 +18,9 @@ require (
 	github.com/klauspost/reedsolomon v1.11.7
 	github.com/pkg/errors v0.9.1
 	github.com/plar/go-adaptive-radix-tree v1.0.5
+	github.com/spaolacci/murmur3 v1.1.0
 	github.com/stretchr/testify v1.8.2
 	github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c
-	github.com/tidwall/wal v1.1.7
 	go.etcd.io/bbolt v1.3.7
 	go.uber.org/zap v1.24.0
 	golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4
@@ -49,10 +49,6 @@ require (
 	github.com/mattn/go-isatty v0.0.16 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/rogpeppe/go-internal v1.9.0 // indirect
-	github.com/tidwall/gjson v1.14.4 // indirect
-	github.com/tidwall/match v1.1.1 // indirect
-	github.com/tidwall/pretty v1.2.1 // indirect
-	github.com/tidwall/tinylru v1.2.1 // indirect
 	go.uber.org/atomic v1.7.0 // indirect
 	go.uber.org/multierr v1.6.0 // indirect
 	golang.org/x/net v0.8.0 // indirect

diff --git a/go.sum b/go.sum
@@ -169,6 +169,8 @@ github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZV
 github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
 github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
 github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
+github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
+github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
@@ -186,19 +188,6 @@ github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ
 github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c h1:g+WoO5jjkqGAzHWCjJB1zZfXPIAaDpzXIEJ0eS6B5Ok=
 github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c/go.mod h1:ahpPrc7HpcfEWDQRZEmnXMzHY03mLDYMCxeDzy46i+8=
-github.com/tidwall/gjson v1.10.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
-github.com/tidwall/gjson v1.14.4 h1:uo0p8EbA09J7RQaflQ1aBRffTR7xedD2bcIVSYxLnkM=
-github.com/tidwall/gjson v1.14.4/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
-github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
-github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
-github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
-github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
-github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
-github.com/tidwall/tinylru v1.1.0/go.mod h1:3+bX+TJ2baOLMWTnlyNWHh4QMnFyARg2TLTQ6OFbzw8=
-github.com/tidwall/tinylru v1.2.1 h1:VgBr72c2IEr+V+pCdkPZUwiQ0KJknnWIYbhxAVkYfQk=
-github.com/tidwall/tinylru v1.2.1/go.mod h1:9bQnEduwB6inr2Y7AkBP7JPgCkyrhTV/ZpX0oOOpBI4=
-github.com/tidwall/wal v1.1.7 h1:emc1TRjIVsdKKSnpwGBAcsAGg0767SvUk8+ygx7Bb+4=
-github.com/tidwall/wal v1.1.7/go.mod h1:r6lR1j27W9EPalgHiB7zLJDYu3mzW5BQP5KrzBpYY/E=
 github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM=
 go.etcd.io/bbolt v1.3.7 h1:j+zJOnnEjF/kyHlDDgGnVL/AIqIJPq8UoB2GSNfkUfQ=
 go.etcd.io/bbolt v1.3.7/go.mod h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw=

diff --git a/lib/bloom/bloom.go b/lib/bloom/bloom.go
@@ -1,151 +1,65 @@
 package bloom
 
-import "math"
-
-const (
-	seed = 0xbc9f1d34
-	m    = 0xc6a4a793
+import (
+	"github.com/spaolacci/murmur3"
+	"math"
 )
 
-// Filter is an encoded set of []byte keys.
-type Filter []byte
-
-// MayContainKey returns whether the filter may contain given key. False positives
-func (f Filter) MayContainKey(k []byte) bool {
-	return f.mayContain(Hash(k))
+// Filter is a Bloom filter over []byte items, backed by a fixed-size bit set.
+type Filter struct {
+	bitSet    []bool // One flag per position; true once an added item has hashed to it
+	size      uint32 // Size of the bit array
+	numHashes uint8  // Number of hash functions to use
 }
 
-// MayContain returns whether the filter may contain given key. False positives
-// are possible, where it returns true for keys not in the original set.
-func (f Filter) mayContain(h uint32) bool {
-	// check if the filter is empty
-	if len(f) < 2 {
-		return false
-	}
-	// obtain the number of hash functions
-	k := f[len(f)-1]
-	// if k > 30, this is reserved for potentially new encodings for short Bloom filters.
-	if k > 30 {
-		// This is reserved for potentially new encodings for short Bloom filters.
-		// Consider it a match.
-		return true
-	}
-	// calculate the total number of bits in the filter.
-	nBits := uint32(8 * (len(f) - 1))
-	// change the hash value by right shift and left shift to generate different bit positions for subsequent iterations.
-	delta := h>>17 | h<<15
-	for j := uint8(0); j < k; j++ {
-		// For each hash function, calculate the bit position bitPos
-		bitPos := h % nBits
-		// Check if the corresponding bit has been set.
-		// If the bit has not been set, the key is definitely not in the set, and false is returned.
-		if f[bitPos/8]&(1<<(bitPos%8)) == 0 {
-			return false
-		}
-		h += delta
-	}
-	return true
-}
-
-// NewFilter returns a new Bloom filter that encodes a set of []byte keys with
-// the given number of bits per key, approximately.
-//
-// A good bitsPerKey value is 10, which yields a filter with ~ 1% false
-// positive rate.
-func NewFilter(keys []uint32, bitsPerKey int) Filter {
-	return Filter(appendFilter(nil, keys, bitsPerKey))
-}
-
-// BloomBitsPerKey returns the bits per key required by bloomfilter based on
-// the false positive rate.
-func BloomBitsPerKey(numEntries int, fp float64) int {
-	size := -1 * float64(numEntries) * math.Log(fp) / math.Pow(float64(0.69314718056), 2)
-	locs := math.Ceil(float64(0.69314718056) * size / float64(numEntries))
-	return int(locs)
-}
+// NewBloomFilter initializes a new Bloom filter based on the expected number of items and desired false positive rate.
+func NewBloomFilter(expectedItems uint32, fpRate float64) *Filter {
+	// Calculate the size of the bit array: m = -n*ln(p) / (ln 2)^2, truncated to an integer
+	size := uint32(-float64(expectedItems) * math.Log(fpRate) / (math.Ln2 * math.Ln2))
+	// Calculate the number of hash functions: k = (m/n) * ln 2, truncated to an integer
+	numHashes := uint8(float64(size) / float64(expectedItems) * math.Ln2)
 
-func appendFilter(buf []byte, keys []uint32, bitsPerKey int) []byte {
-	if bitsPerKey < 0 {
-		bitsPerKey = 0
+	return &Filter{
+		bitSet:    make([]bool, size),
+		size:      size,
+		numHashes: numHashes,
 	}
-	// 0.69 is approximately ln(2).
-	k := uint32(float64(bitsPerKey) * 0.69)
-	if k < 1 {
-		k = 1
-	}
-	if k > 30 {
-		k = 30
-	}
-
-	nBits := len(keys) * bitsPerKey
-	// For small len(keys), we can see a very high false positive rate. Fix it
-	// by enforcing a minimum bloom filter length.
-	if nBits < 64 {
-		nBits = 64
-	}
-	nBytes := (nBits + 7) / 8
-	nBits = nBytes * 8
-	buf, filter := extend(buf, nBytes+1)
+}
 
-	for _, h := range keys {
-		delta := h>>17 | h<<15
-		for j := uint32(0); j < k; j++ {
-			bitPos := h % uint32(nBits)
-			filter[bitPos/8] |= 1 << (bitPos % 8)
-			h += delta
-		}
+// Add inserts an item into the Bloom filter.
+func (f *Filter) Add(item []byte) {
+	hashes := f.hash(item)
+	// For each hash value, find the position and set the bit to true
+	for i := uint8(0); i < f.numHashes; i++ {
+		position := hashes[i] % f.size
+		f.bitSet[position] = true
 	}
-	filter[nBytes] = uint8(k)
-
-	return buf
 }
 
-// extend appends n zero bytes to b. It returns the overall slice (of length
-// n+len(originalB)) and the slice of n trailing zeroes.
-func extend(b []byte, n int) (overall, trailer []byte) {
-	want := n + len(b)
-	if want <= cap(b) {
-		overall = b[:want]
-		trailer = overall[len(b):]
-		for i := range trailer {
-			trailer[i] = 0
-		}
-	} else {
-		// Grow the capacity exponentially, with a 1KiB minimum.
-		c := 1024
-		for c < want {
-			c += c / 4
+// MayContainItem checks if an item is possibly in the set.
+// If it returns false, the item is definitely not in the set.
+// If it returns true, the item might be in the set, but it can also be a false positive.
+func (f *Filter) MayContainItem(item []byte) bool {
+	hashes := f.hash(item)
+	for i := uint8(0); i < f.numHashes; i++ {
+		position := hashes[i] % f.size
+		if !f.bitSet[position] {
+			return false
 		}
-		overall = make([]byte, want, c)
-		trailer = overall[len(b):]
-		copy(overall, b)
 	}
-	return overall, trailer
+	return true
 }
 
-// Hash implements a hashing algorithm similar to the Murmur hash.
-func Hash(b []byte) uint32 {
-	// The original algorithm uses a seed of 0x9747b28c.
-	h := uint32(seed) ^ uint32(len(b))*m
-	// Pick up four bytes at a time.
-	for ; len(b) >= 4; b = b[4:] {
-		// The original algorithm uses the following commented out code to load
-		h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
-		h *= m
-		h ^= h >> 16
-	}
-	// Pick up remaining bytes.
-	switch len(b) {
-	case 3:
-		h += uint32(b[2]) << 16
-		fallthrough
-	case 2:
-		h += uint32(b[1]) << 8
-		fallthrough
-	case 1:
-		h += uint32(b[0])
-		h *= m
-		h ^= h >> 24
+// hash produces multiple hash values for an item.
+// It takes the two 64-bit halves of a murmur3 128-bit hash and derives the i-th value as h1 + i*h2, truncated to 32 bits.
+func (f *Filter) hash(item []byte) []uint32 {
+	h1, h2 := murmur3.Sum128(item) // Get two 64-bit hash values
+	var result []uint32
+
+	// Combine the two hash values to produce numHashes hash values, one per loop iteration.
+	for i := uint8(0); i < f.numHashes; i++ {
+		h := h1 + uint64(i)*h2
+		result = append(result, uint32(h))
 	}
-	return h
+	return result
 }