Skip to content

Commit

Permalink
feat: add bloom for memory read/write
Browse files Browse the repository at this point in the history
  • Loading branch information
sjcsjc123 committed Aug 20, 2023
1 parent 64fdea5 commit a74bd64
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 283 deletions.
13 changes: 13 additions & 0 deletions db/memory/memory.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,23 @@ package memory

import (
"errors"
"github.com/ByteStorage/FlyDB/lib/bloom"
"sync"
)

// MemTable is an in-memory key/value table paired with a bloom filter over
// the keys that have been written to it.
type MemTable struct {
	// mutex guards both table and bloom: Put takes the write lock before
	// touching either, and Get takes the read lock.
	mutex sync.RWMutex
	// table maps each key to its stored value.
	table map[string][]byte
	// bloom receives every key passed to Put and is consulted by Get before
	// the map lookup.
	bloom *bloom.Filter
}

// Sizing parameters for the bloom filter attached to every new MemTable.
const (
	// memTableBloomExpectedKeys is the number of keys the filter is sized for.
	memTableBloomExpectedKeys = 1000
	// memTableBloomFPRate is the target false-positive rate (1%) at that
	// key count.
	memTableBloomFPRate = 0.01
)

// NewMemTable creates an empty MemTable.
//
// The table starts with no entries and a bloom filter sized for
// memTableBloomExpectedKeys keys at a memTableBloomFPRate false-positive
// rate. The filter is sized once, here; Put only adds keys to it. Storing
// many more keys than it was sized for should therefore only make the
// filter less selective: Get still falls through to the map lookup whenever
// the filter reports a possible match.
func NewMemTable() *MemTable {
	return &MemTable{
		table: make(map[string][]byte),
		bloom: bloom.NewBloomFilter(memTableBloomExpectedKeys, memTableBloomFPRate),
	}
}

Expand All @@ -23,13 +27,20 @@ func (m *MemTable) Put(key string, value []byte) {
m.mutex.Lock()
defer m.mutex.Unlock()
m.table[key] = value
// Add the key to the bloom filter
m.bloom.Add([]byte(key))
}

// Get a value from the table
func (m *MemTable) Get(key string) ([]byte, error) {
m.mutex.RLock()
defer m.mutex.RUnlock()

// Immediate return if the key is not in the bloom filter
if !m.bloom.MayContainItem([]byte(key)) {
return nil, errors.New("key not found")
}

value, ok := m.table[key]
if !ok {
return nil, errors.New("key not found")
Expand All @@ -39,6 +50,8 @@ func (m *MemTable) Get(key string) ([]byte, error) {
}

// Delete a key from the table
// Note: Bloom filters don't support deletion without affecting accuracy
// so we don't remove the key from the bloom filter.
func (m *MemTable) Delete(key string) {
m.mutex.Lock()
defer m.mutex.Unlock()
Expand Down
6 changes: 1 addition & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ require (
github.com/klauspost/reedsolomon v1.11.7
github.com/pkg/errors v0.9.1
github.com/plar/go-adaptive-radix-tree v1.0.5
github.com/spaolacci/murmur3 v1.1.0
github.com/stretchr/testify v1.8.2
github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c
github.com/tidwall/wal v1.1.7
go.etcd.io/bbolt v1.3.7
go.uber.org/zap v1.24.0
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4
Expand Down Expand Up @@ -49,10 +49,6 @@ require (
github.com/mattn/go-isatty v0.0.16 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rogpeppe/go-internal v1.9.0 // indirect
github.com/tidwall/gjson v1.14.4 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.1 // indirect
github.com/tidwall/tinylru v1.2.1 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
golang.org/x/net v0.8.0 // indirect
Expand Down
15 changes: 2 additions & 13 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZV
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
Expand All @@ -186,19 +188,6 @@ github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c h1:g+WoO5jjkqGAzHWCjJB1zZfXPIAaDpzXIEJ0eS6B5Ok=
github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c/go.mod h1:ahpPrc7HpcfEWDQRZEmnXMzHY03mLDYMCxeDzy46i+8=
github.com/tidwall/gjson v1.10.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/gjson v1.14.4 h1:uo0p8EbA09J7RQaflQ1aBRffTR7xedD2bcIVSYxLnkM=
github.com/tidwall/gjson v1.14.4/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/tinylru v1.1.0/go.mod h1:3+bX+TJ2baOLMWTnlyNWHh4QMnFyARg2TLTQ6OFbzw8=
github.com/tidwall/tinylru v1.2.1 h1:VgBr72c2IEr+V+pCdkPZUwiQ0KJknnWIYbhxAVkYfQk=
github.com/tidwall/tinylru v1.2.1/go.mod h1:9bQnEduwB6inr2Y7AkBP7JPgCkyrhTV/ZpX0oOOpBI4=
github.com/tidwall/wal v1.1.7 h1:emc1TRjIVsdKKSnpwGBAcsAGg0767SvUk8+ygx7Bb+4=
github.com/tidwall/wal v1.1.7/go.mod h1:r6lR1j27W9EPalgHiB7zLJDYu3mzW5BQP5KrzBpYY/E=
github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM=
go.etcd.io/bbolt v1.3.7 h1:j+zJOnnEjF/kyHlDDgGnVL/AIqIJPq8UoB2GSNfkUfQ=
go.etcd.io/bbolt v1.3.7/go.mod h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw=
Expand Down
180 changes: 47 additions & 133 deletions lib/bloom/bloom.go
Original file line number Diff line number Diff line change
@@ -1,151 +1,65 @@
package bloom

import "math"

const (
seed = 0xbc9f1d34
m = 0xc6a4a793
import (
"github.com/spaolacci/murmur3"
"math"
)

// Filter is an encoded set of []byte keys.
type Filter []byte

// MayContainKey returns whether the filter may contain given key. False positives
func (f Filter) MayContainKey(k []byte) bool {
return f.mayContain(Hash(k))
// Filter is a Bloom filter backed by a slice of booleans, one element per
// bit position.
type Filter struct {
	// bitSet holds the filter state; an element is true once some added item
	// has hashed to that position.
	bitSet []bool
	// size is the number of positions in bitSet and the modulus applied to
	// every hash value.
	size uint32
	// numHashes is how many positions are set (on Add) or tested (on lookup)
	// for each item.
	numHashes uint8
}

// mayContain reports whether the filter may hold a key whose hash is h.
// A false result means the key is definitely absent; a true result can be a
// false positive for a key that was never added.
//
// Layout read here: every byte of f except the last is bit-array storage,
// and the last byte is the number of probes performed per key.
func (f Filter) mayContain(h uint32) bool {
	// Fewer than two bytes cannot hold both storage and the probe count.
	if len(f) < 2 {
		return false
	}

	last := len(f) - 1
	probes := f[last]
	// Probe counts above 30 are reserved for potential new encodings of
	// short Bloom filters; treat them as a match.
	if probes > 30 {
		return true
	}

	// Total number of addressable bits in the storage bytes.
	bitCount := uint32(8 * last)
	// Rotating the hash gives the step added between successive probes, so
	// each probe lands on a different bit position.
	step := h>>17 | h<<15
	for probe := uint8(0); probe < probes; probe++ {
		pos := h % bitCount
		// One unset bit is enough to prove the key was never added.
		if f[pos/8]&(1<<(pos%8)) == 0 {
			return false
		}
		h += step
	}
	return true
}

// NewFilter builds a Bloom filter from keys, using approximately bitsPerKey
// bits per entry. It encodes into a fresh buffer via appendFilter and returns
// the result as a Filter.
//
// keys is a slice of uint32 values (presumably the Hash of each original
// key; confirm against callers).
//
// A bitsPerKey of 10 is a good choice and yields a filter with a ~1% false
// positive rate.
func NewFilter(keys []uint32, bitsPerKey int) Filter {
	encoded := appendFilter(nil, keys, bitsPerKey)
	return Filter(encoded)
}

// BloomBitsPerKey returns the number of bits per key a Bloom filter needs to
// reach the false-positive rate fp, rounded up to a whole bit.
//
// The optimal total size for n entries is m = -n*ln(fp)/ln(2)^2, so the bits
// per key m/n equal -ln(fp)/ln(2)^2 and do not depend on n. For fp = 0.01
// this gives 10.
//
// numEntries is kept for API compatibility; it cancels out of the formula
// and is not used, so a zero entry count no longer produces a NaN.
//
// It returns 0 when fp is NaN or lies outside the open interval (0, 1):
// for fp >= 1 no bits are needed, and for fp <= 0 the formula has no finite
// result.
func BloomBitsPerKey(numEntries int, fp float64) int {
	if math.IsNaN(fp) || fp <= 0 || fp >= 1 {
		return 0
	}
	// Do not scale by ln(2) here: that would yield the optimal number of
	// hash functions rather than the bits per key.
	bitsPerKey := -math.Log(fp) / (math.Ln2 * math.Ln2)
	return int(math.Ceil(bitsPerKey))
}
// NewBloomFilter initializes a new Bloom filter based on the expected number of items and desired false positive rate.
func NewBloomFilter(expectedItems uint32, fpRate float64) *Filter {
// Calculate the size of bit array using the expected number of items and desired false positive rate
size := uint32(-float64(expectedItems) * math.Log(fpRate) / (math.Ln2 * math.Ln2))
// Calculate the optimal number of hash functions based on the size of bit array and expected number of items
numHashes := uint8(float64(size) / float64(expectedItems) * math.Ln2)

func appendFilter(buf []byte, keys []uint32, bitsPerKey int) []byte {
if bitsPerKey < 0 {
bitsPerKey = 0
return &Filter{
bitSet: make([]bool, size),
size: size,
numHashes: numHashes,
}
// 0.69 is approximately ln(2).
k := uint32(float64(bitsPerKey) * 0.69)
if k < 1 {
k = 1
}
if k > 30 {
k = 30
}

nBits := len(keys) * bitsPerKey
// For small len(keys), we can see a very high false positive rate. Fix it
// by enforcing a minimum bloom filter length.
if nBits < 64 {
nBits = 64
}
nBytes := (nBits + 7) / 8
nBits = nBytes * 8
buf, filter := extend(buf, nBytes+1)
}

for _, h := range keys {
delta := h>>17 | h<<15
for j := uint32(0); j < k; j++ {
bitPos := h % uint32(nBits)
filter[bitPos/8] |= 1 << (bitPos % 8)
h += delta
}
// Add inserts an item into the Bloom filter.
func (f *Filter) Add(item []byte) {
hashes := f.hash(item)
// For each hash value, find the position and set the bit to true
for i := uint8(0); i < f.numHashes; i++ {
position := hashes[i] % f.size
f.bitSet[position] = true
}
filter[nBytes] = uint8(k)

return buf
}

// extend appends n zero bytes to b. It returns the overall slice (of length
// n+len(originalB)) and the slice of n trailing zeroes.
func extend(b []byte, n int) (overall, trailer []byte) {
want := n + len(b)
if want <= cap(b) {
overall = b[:want]
trailer = overall[len(b):]
for i := range trailer {
trailer[i] = 0
}
} else {
// Grow the capacity exponentially, with a 1KiB minimum.
c := 1024
for c < want {
c += c / 4
// MayContainItem checks if an item is possibly in the set.
// If it returns false, the item is definitely not in the set.
// If it returns true, the item might be in the set, but it can also be a false positive.
func (f *Filter) MayContainItem(item []byte) bool {
hashes := f.hash(item)
for i := uint8(0); i < f.numHashes; i++ {
position := hashes[i] % f.size
if !f.bitSet[position] {
return false
}
overall = make([]byte, want, c)
trailer = overall[len(b):]
copy(overall, b)
}
return overall, trailer
return true
}

// Hash implements a hashing algorithm similar to the Murmur hash.
func Hash(b []byte) uint32 {
// The original algorithm uses a seed of 0x9747b28c.
h := uint32(seed) ^ uint32(len(b))*m
// Pick up four bytes at a time.
for ; len(b) >= 4; b = b[4:] {
// The original algorithm uses the following commented out code to load
h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
h *= m
h ^= h >> 16
}
// Pick up remaining bytes.
switch len(b) {
case 3:
h += uint32(b[2]) << 16
fallthrough
case 2:
h += uint32(b[1]) << 8
fallthrough
case 1:
h += uint32(b[0])
h *= m
h ^= h >> 24
// hash produces multiple hash values for an item.
// It leverages two hash values from murmur3 and generates as many as needed through a linear combination.
func (f *Filter) hash(item []byte) []uint32 {
h1, h2 := murmur3.Sum128(item) // Get two 64-bit hash values
var result []uint32

// Use the two hash values to generate the required number of hash functions.
for i := uint8(0); i < f.numHashes; i++ {
h := h1 + uint64(i)*h2
result = append(result, uint32(h))
}
return h
return result
}
Loading

0 comments on commit a74bd64

Please sign in to comment.