-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6afc1f2
commit c5dfd59
Showing
3 changed files
with
222 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,40 @@ | ||
bloomfilter | ||
=========== | ||
|
||
a sha1-based bloom filter for go | ||
## About | ||
|
||
This package attempts to implement a bloom filter in Go. | ||
|
||
http://en.wikipedia.org/wiki/Bloom_filter | ||
|
||
provides an explanation of what a bloom filter is. Essentially it is a probabilistic membership | ||
function with good size characteristics. For example, we may wish to read in the words from | ||
the dictionary file and then test words that users enter to see if they are valid. The bloom filter | ||
can test this with over 99% accuracy using only 100k in a data structure. | ||
|
||
The approach in this package for hashing items into the filter is to obtain the 160 bit SHA1 | ||
hash of the original input item, which should give a good distribution. Then, this 160 bit | ||
value is decomposed into five 32-bit integers which are then used as modulo (wrapping) offsets | ||
into a BitSet (the storage mechanism used by this package). The bits at those offsets are set to | ||
true (1). | ||
|
||
Should an input token ever hash to five locations that are already set in the BitSet, it will | ||
be considered a collision (false positive). Experiment with size settings that minimize collisions. | ||
|
||
The size argument provided to the constructor is a desired size in bits for the BitSet used by the | ||
bloom filter as a storage mechanism. This value is rounded up to a byte boundary. | ||
|
||
## Installing | ||
|
||
$ go get github.com/bradclawsie/bloomfilter | ||
|
||
## Docs | ||
|
||
$ go doc github.com/bradclawsie/bloomfilter | ||
|
||
## Examples | ||
|
||
The included unit test file contains an example use case of reading in a dict file from | ||
a local path. To run it, edit the test and set `dict_file` to the full path of a dictionary file on your machine. | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
package bloomfilter | ||
|
||
import ( | ||
"io" | ||
"bytes" | ||
"crypto/sha1" | ||
"encoding/binary" | ||
"github.com/bradclawsie/bitset" | ||
) | ||
|
||
// BloomFilter is implemented using the bitset package. | ||
type BloomFilter struct { | ||
bitset *bitset.BitSet | ||
} | ||
|
||
// Construct a new BloomFilter intended to model n bits. | ||
// The BitSet constructor will round that number up to | ||
// the next byte boundary. The BitSet should be adequately compact. | ||
// Values written into the bloom filter will use modulo to determine | ||
// the index to set...meaning, overflow indexes will wrap. | ||
// The BitSet is already concurrent safe through the use of RWMutex. | ||
// Note: each entry into the filter sets five values, so having | ||
// n below be less than five is nonsensical | ||
func NewBloomFilter(n uint32) (*BloomFilter) { | ||
b := new(BloomFilter) | ||
b.bitset = bitset.NewBitSet(n) | ||
return b | ||
} | ||
|
||
// Alias for the constructor | ||
func New(n uint32) (*BloomFilter) { | ||
return NewBloomFilter(n) | ||
} | ||
|
||
type (
	// SHA1_ints is a 160-bit SHA1 digest decomposed into five 32-bit ints.
	SHA1_ints [5]uint32

	// FilterVals holds the filter bits found at the offsets derived from
	// a SHA1_ints value, one per int.
	FilterVals [5]bool
)

// GetSHA1_ints computes the SHA1 digest of s and splits the 20 digest bytes
// into five consecutive 4-byte groups, decoding each group as a
// little-endian uint32.
//
// If decoding a group fails, a zero-valued SHA1_ints is returned together
// with the decoding error.
func GetSHA1_ints(s string) (SHA1_ints, error) {
	hasher := sha1.New()
	io.WriteString(hasher, s)
	digest := hasher.Sum(nil)

	var words SHA1_ints
	for i := range words {
		// Each word is built from the next four bytes of the digest.
		chunk := bytes.NewReader(digest[4*i : 4*i+4])
		if err := binary.Read(chunk, binary.LittleEndian, &words[i]); err != nil {
			return SHA1_ints{}, err
		}
	}
	return words, nil
}
|
||
// Return the size of the underlying BitSet. May be greater than | ||
// the arg provided to the constructor...the BitSet package rounds | ||
// up to a byte boundary. | ||
func (b *BloomFilter) Size() int { | ||
return b.bitset.Size() | ||
} | ||
|
||
// For a set of SHA1_ints, write a truth value (1) into the bloom filter | ||
// at the modulo offset correlated to its value. | ||
// Returns a boolean indicating if there was a collision in the filter | ||
// (meaning all indexes to be set were already set to true) | ||
func (b *BloomFilter) Write(sha1_ints SHA1_ints) (bool,error) { | ||
l := uint32(b.bitset.Size()) | ||
// warn if the filter positions have already been written | ||
collision := true | ||
for _,v := range sha1_ints { | ||
j := v % l | ||
existing_at_j,get_err := b.bitset.GetBitN(int(j)) | ||
if get_err != nil { | ||
return false,get_err | ||
} | ||
collision = collision && existing_at_j | ||
set_err := b.bitset.SetBitN(int(j)) | ||
if set_err != nil { | ||
return false,set_err | ||
} | ||
} | ||
return collision,nil | ||
} | ||
|
||
// Read the filter values for the modulo offsets for the SHA1_ints, and also | ||
// send back a convenience bool to indicate if they were all true or not | ||
func (b *BloomFilter) Read(sha1_ints SHA1_ints) (FilterVals,bool,error) { | ||
l := uint32(b.bitset.Size()) | ||
var fv FilterVals | ||
all := true | ||
var get_err error | ||
for i,v := range sha1_ints { | ||
fv[i],get_err = b.bitset.GetBitN(int(v % l)) | ||
if get_err != nil { | ||
return fv,false,get_err | ||
} | ||
all = all && fv[i] | ||
} | ||
return fv,all,nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
package bloomfilter | ||
|
||
import ( | ||
"testing" | ||
"fmt" | ||
"io/ioutil" | ||
"bytes" | ||
) | ||
|
||
func TestFilter(t *testing.T) { | ||
dict_file := "" | ||
// SET THIS LINE TO *YOUR* DICT FILE | ||
// dict_file = "/usr/share/dict/american-english" // ubuntu | ||
if dict_file == "" { | ||
fmt.Printf("\n\n****\nset dict_file in TestFilter to be a full path to a dictionary file, and rerun\n****\n\n\n") | ||
return | ||
} | ||
var size uint32 = 800000 | ||
bf := NewBloomFilter(size) | ||
dict_bytes,dict_err := ioutil.ReadFile(dict_file) | ||
if dict_err != nil { | ||
e := fmt.Sprintf("%s\n",dict_err.Error()) | ||
t.Errorf(e) | ||
} | ||
sep := []byte("\n") | ||
collisions := 0 | ||
writes := 0 | ||
word_bytes := bytes.Split(dict_bytes,sep) | ||
for _,v := range word_bytes { | ||
sha1_ints,sha1_err := GetSHA1_ints((string(v))) | ||
if sha1_err != nil { | ||
e := fmt.Sprintf("%s\n",sha1_err.Error()) | ||
t.Errorf(e) | ||
} | ||
collision,write_err := bf.Write(sha1_ints) | ||
if write_err != nil { | ||
e := fmt.Sprintf("%s\n",write_err.Error()) | ||
t.Errorf(e) | ||
} | ||
_,in_filter,read_err := bf.Read(sha1_ints) | ||
if read_err != nil { | ||
e := fmt.Sprintf("%s\n",read_err.Error()) | ||
t.Errorf(e) | ||
} | ||
if (!in_filter) { | ||
e := fmt.Sprintf("%v sha1_ints do not all read as true after a write",sha1_ints) | ||
t.Errorf(e) | ||
} | ||
writes++ | ||
if collision { | ||
collisions++ | ||
} | ||
} | ||
// make sure gibberish is not found in the filter | ||
sha1_ints,sha1_err := GetSHA1_ints("azzxxxdddhhhu") | ||
if sha1_err != nil { | ||
e := fmt.Sprintf("%s\n",sha1_err.Error()) | ||
t.Errorf(e) | ||
} | ||
_,in_filter,read_err := bf.Read(sha1_ints) | ||
if read_err != nil { | ||
e := fmt.Sprintf("%s\n",read_err.Error()) | ||
t.Errorf(e) | ||
} | ||
if in_filter { | ||
t.Errorf("non dict word was found in filter?") | ||
} | ||
|
||
fmt.Printf("writes: %d, collisions (false positives): %d\n",writes,collisions) | ||
rate := 100.0 - ((float64(collisions)/float64(writes)) * 100.0) | ||
fmt.Printf("filter of size %d will be correct %f of the time\n",size,rate) | ||
} |