Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
bradclawsie committed Jun 29, 2013
1 parent 6afc1f2 commit c5dfd59
Show file tree
Hide file tree
Showing 3 changed files with 222 additions and 1 deletion.
38 changes: 37 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,40 @@
bloomfilter
===========

a sha1-based bloom filter for go
## About

This package attempts to implement a bloom filter in Go.

http://en.wikipedia.org/wiki/Bloom_filter

provides an explanation of what a bloom filter is. Essentially it is a probabilistic membership
function with good size characteristics. For example, we may wish to read in the words from
the dictionary file and then test words that users enter to see if they are valid. The bloom filter
can test this with over 99% accuracy using only 100k in a data structure.

The approach in this package for hashing items into the filter is to obtain the 160 bit SHA1
hash of the original input item, which should give a good distribution. Then, this 160 bit
value is decomposed into five 32-bit integers which are then used as modulo (wrapping) offsets
into a BitSet (the storage mechanism used by this package). The bits at those offsets are set to
true (1).

Should an input token ever hash to five locations that are already set in the BitSet, it will
be considered a collision (false positive). Experiment with size settings that minimize collisions.

The size argument provided to the constructor is a desired size in bits for the BitSet used by the
bloom filter as a storage mechanism. This value is rounded up to a byte boundary.

## Installing

$ go get github.com/bradclawsie/bloomfilter

## Docs

$ go doc github.com/bradclawsie/bloomfilter

## Examples

The included unit test file contains an example use case of reading in a dict file from
a local path. You will need to edit the test and set that path before the test will run.


113 changes: 113 additions & 0 deletions bloomfilter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
package bloomfilter

import (
	"bytes"
	"crypto/sha1"
	"encoding/binary"
	"errors"
	"io"

	"github.com/bradclawsie/bitset"
)

// BloomFilter is a bloom filter whose bits are stored in a BitSet from the
// github.com/bradclawsie/bitset package. Values are obtained from
// NewBloomFilter (or its alias New); the storage field is unexported.
type BloomFilter struct {
// bitset is the bit storage: Write sets bits in it, Read tests bits in it,
// and Size reports its size.
bitset *bitset.BitSet
}

// NewBloomFilter returns a BloomFilter backed by a BitSet created for n bits.
//
// According to the bitset package, the requested size is rounded up to the
// next byte boundary, so the filter may end up slightly larger than n, and
// the BitSet is already safe for concurrent use via an RWMutex (not verified
// here; see that package).
//
// Values are placed in the filter by taking each hash word modulo the filter
// size, so offsets beyond the end wrap around. Every entry sets five bits,
// which makes an n smaller than five meaningless.
func NewBloomFilter(n uint32) *BloomFilter {
	return &BloomFilter{bitset: bitset.NewBitSet(n)}
}

// New is an alias for NewBloomFilter: it passes n through unchanged and
// returns whatever NewBloomFilter returns.
func New(n uint32) (*BloomFilter) {
return NewBloomFilter(n)
}

// SHA1_ints holds the 160 bits of a SHA1 digest as five 32-bit unsigned
// integers.
type SHA1_ints [5]uint32

// FilterVals holds the filter bits found at the five offsets derived from a
// SHA1_ints value.
type FilterVals [5]bool

// GetSHA1_ints computes the SHA1 digest of s and splits its 20 bytes into
// five consecutive 4-byte words, each decoded as a little-endian uint32.
// Word i of the result comes from digest bytes [4*i, 4*i+4).
//
// If decoding fails, a zeroed SHA1_ints is returned together with the error.
func GetSHA1_ints(s string) (SHA1_ints, error) {
	hasher := sha1.New()
	// The hash.Hash contract states that writes never return an error.
	io.WriteString(hasher, s)
	digest := hasher.Sum(nil)

	// A single binary.Read fills all five array elements, in order, from the
	// 20 digest bytes.
	var words SHA1_ints
	if err := binary.Read(bytes.NewReader(digest), binary.LittleEndian, &words); err != nil {
		return SHA1_ints{}, err
	}
	return words, nil
}

// Size returns the size reported by the underlying BitSet (presumably in
// bits, since Write and Read use it as the modulus for bit offsets).
// It may be greater than the argument given to the constructor: the BitSet
// package is documented as rounding the requested size up to a byte boundary.
func (b *BloomFilter) Size() int {
return b.bitset.Size()
}

// Write sets the filter bit for each of the five values in sha1_ints. The bit
// offset for a value is that value modulo the filter size, so large values
// wrap around.
//
// The returned bool reports a collision: it is true only when every one of
// the five bits was already set before this call touched it (which includes
// the case of two hash words mapping to the same offset).
//
// An error is returned if the filter has no bits, or if the underlying BitSet
// rejects a get or set. Bits set before such a failure remain set.
func (b *BloomFilter) Write(sha1_ints SHA1_ints) (bool, error) {
	size := b.bitset.Size()
	if size <= 0 {
		// A modulus of zero would panic with an integer divide by zero.
		return false, errors.New("bloomfilter: filter has no bits to write to")
	}
	// Reduce in 64 bits. Converting Size() to uint32 would truncate a size of
	// 2^32 (reachable when n near the uint32 maximum is rounded up to a byte
	// boundary) to zero.
	mod := uint64(size)

	collision := true
	for _, v := range sha1_ints {
		// The remainder is below both size and 2^32, so it always fits an int.
		j := int(uint64(v) % mod)
		wasSet, err := b.bitset.GetBitN(j)
		if err != nil {
			return false, err
		}
		collision = collision && wasSet
		if err := b.bitset.SetBitN(j); err != nil {
			return false, err
		}
	}
	return collision, nil
}

// Read looks up the filter bit for each of the five values in sha1_ints. The
// bit offset for a value is that value modulo the filter size.
//
// It returns the five individual bits as FilterVals, plus a convenience bool
// that is true only when all five bits are set (the item is possibly in the
// filter; false means it is definitely absent).
//
// An error is returned if the filter has no bits, or if the underlying BitSet
// rejects a get. On error the bool is false and FilterVals holds only the
// bits read before the failure.
func (b *BloomFilter) Read(sha1_ints SHA1_ints) (FilterVals, bool, error) {
	var fv FilterVals
	size := b.bitset.Size()
	if size <= 0 {
		// A modulus of zero would panic with an integer divide by zero.
		return fv, false, errors.New("bloomfilter: filter has no bits to read from")
	}
	// Reduce in 64 bits. Converting Size() to uint32 would truncate a size of
	// 2^32 (reachable when n near the uint32 maximum is rounded up to a byte
	// boundary) to zero.
	mod := uint64(size)

	all := true
	for i, v := range sha1_ints {
		// The remainder is below both size and 2^32, so it always fits an int.
		bit, err := b.bitset.GetBitN(int(uint64(v) % mod))
		if err != nil {
			return fv, false, err
		}
		fv[i] = bit
		all = all && bit
	}
	return fv, all, nil
}
72 changes: 72 additions & 0 deletions bloomfilter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package bloomfilter

import (
"testing"
"fmt"
"io/ioutil"
"bytes"
)

// TestFilter loads every non-empty line of a dictionary file into a bloom
// filter, checks that each word reads back as present immediately after it is
// written, and checks that one gibberish token is reported as absent. It
// finishes by printing how many writes were collisions (all five bits already
// set) and the resulting accuracy percentage.
//
// The test is skipped until dictFile below is set to a real path.
func TestFilter(t *testing.T) {
	dictFile := ""
	// SET THIS LINE TO *YOUR* DICT FILE
	// dictFile = "/usr/share/dict/american-english" // ubuntu
	if dictFile == "" {
		// Skip rather than return, so the run is not reported as a pass.
		t.Skip("set dictFile in TestFilter to be a full path to a dictionary file, and rerun")
	}

	var size uint32 = 800000
	bf := NewBloomFilter(size)

	dictBytes, err := ioutil.ReadFile(dictFile)
	if err != nil {
		// Nothing below is meaningful without the dictionary contents.
		t.Fatalf("reading %q: %v", dictFile, err)
	}

	collisions := 0
	writes := 0
	for _, word := range bytes.Split(dictBytes, []byte("\n")) {
		if len(word) == 0 {
			// Splitting on newlines yields an empty token after a trailing
			// newline; it is not a dictionary word.
			continue
		}
		ints, err := GetSHA1_ints(string(word))
		if err != nil {
			t.Fatalf("hashing %q: %v", word, err)
		}
		collision, err := bf.Write(ints)
		if err != nil {
			t.Fatalf("writing %q: %v", word, err)
		}
		_, inFilter, err := bf.Read(ints)
		if err != nil {
			t.Fatalf("reading back %q: %v", word, err)
		}
		if !inFilter {
			t.Errorf("%v sha1_ints do not all read as true after a write", ints)
		}
		writes++
		if collision {
			collisions++
		}
	}
	if writes == 0 {
		// Avoids a NaN rate below and flags an empty dictionary file.
		t.Fatalf("no words found in %q", dictFile)
	}

	// make sure gibberish is not found in the filter
	ints, err := GetSHA1_ints("azzxxxdddhhhu")
	if err != nil {
		t.Fatalf("hashing gibberish token: %v", err)
	}
	_, inFilter, err := bf.Read(ints)
	if err != nil {
		t.Fatalf("reading gibberish token: %v", err)
	}
	if inFilter {
		t.Errorf("non dict word was found in filter?")
	}

	fmt.Printf("writes: %d, collisions (false positives): %d\n", writes, collisions)
	rate := 100.0 - ((float64(collisions) / float64(writes)) * 100.0)
	fmt.Printf("filter of size %d will be correct %f of the time\n", size, rate)
}

0 comments on commit c5dfd59

Please sign in to comment.