initial commit

bradclawsie · Jun 29, 2013 · c5dfd59 · c5dfd59
1 parent 6afc1f2
commit c5dfd59
Show file tree

Hide file tree

Showing 3 changed files with 222 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -1,4 +1,40 @@
 bloomfilter
 ===========
 
-a sha1-based bloom filter for go
+## About
+
+This package attempts to implement a bloom filter in Go.
+
+http://en.wikipedia.org/wiki/Bloom_filter
+
+provides an explanation of what a bloom filter is. Essentially it is a probabilistic membership
+test with good size characteristics. For example, we may wish to read in the words from
+the dictionary file and then test words that users enter to see if they are valid. The bloom filter
+can test this with over 99% accuracy using only about 100 KB in a data structure.
+
+The approach in this package for hashing items into the filter is to obtain the 160 bit SHA1
+hash of the original input item, which should give a good distribution. Then, this 160 bit
+value is decomposed into five 32-bit integers which are then used as modulo (wrapping) offsets
+into a BitSet (the storage mechanism used by this package). The bits at those offsets are set to
+true (1).
+
+Should an input token ever hash to five locations that are already set in the BitSet, it will
+be considered a collision (false positive). Experiment with size settings that minimize collisions.
+
+The size argument provided to the constructor is a desired size in bits for the BitSet used by the
+bloom filter as a storage mechanism. This value is rounded up to a byte boundary.
+
+## Installing
+
+   $ go get github.com/bradclawsie/bloomfilter
+
+## Docs
+
+   $ go doc github.com/bradclawsie/bloomfilter
+
+## Examples
+
+The included unit test file contains an example use case of reading in a dict file from
+a local path. To run it, first edit the test and set dict_file to the path of a dictionary file on your machine.
+
+
diff --git a/bloomfilter.go b/bloomfilter.go
@@ -0,0 +1,113 @@
+package bloomfilter
+
+import (
+	"io"
+	"bytes"
+	"crypto/sha1"
+	"encoding/binary"
+	"github.com/bradclawsie/bitset"
+)
+
+// BloomFilter is a bloom filter whose bits are stored in a BitSet from the bitset package.
+type BloomFilter struct {
+	bitset *bitset.BitSet // backing bit storage, allocated by NewBloomFilter
+}
+
+// NewBloomFilter builds a BloomFilter backed by a BitSet requested at n bits.
+// According to the bitset package (not visible in this file; confirm there),
+// the BitSet constructor rounds n up to the next byte boundary and the BitSet
+// is safe for concurrent use through an RWMutex.
+// Values written into the filter are reduced modulo the BitSet size, so
+// indexes past the end wrap around.
+// Note: every entry sets five bits, so an n below five is nonsensical.
+func NewBloomFilter(n uint32) *BloomFilter {
+	return &BloomFilter{bitset: bitset.NewBitSet(n)}
+}
+
+// New is an alias for NewBloomFilter; it forwards n unchanged.
+func New(n uint32) (*BloomFilter) {
+	return NewBloomFilter(n)
+}
+
+// SHA1_ints holds the five 32-bit integers that a 160-bit SHA1 digest decomposes into.
+type SHA1_ints [5]uint32
+
+// FilterVals holds the filter bits found at the offsets derived from a SHA1_ints.
+type FilterVals [5]bool
+
+// GetSHA1_ints hashes s with SHA1 and splits the 20-byte digest into five
+// consecutive 4-byte words, each decoded as a little-endian uint32, in digest order.
+//
+// It returns the zero SHA1_ints and a non-nil error if hashing or decoding fails.
+func GetSHA1_ints(s string) (SHA1_ints, error) {
+	h := sha1.New()
+	if _, err := io.WriteString(h, s); err != nil {
+		return SHA1_ints{}, err
+	}
+	// One Read into the fixed-size array decodes all five words at once,
+	// replacing five separate buffer allocations and reads.
+	var ints SHA1_ints
+	if err := binary.Read(bytes.NewReader(h.Sum(nil)), binary.LittleEndian, &ints); err != nil {
+		// Do not hand back a partially decoded value.
+		return SHA1_ints{}, err
+	}
+	return ints, nil
+}
+
+// Size returns the size reported by the underlying BitSet. This may be greater than
+// the arg provided to the constructor...the BitSet package rounds
+// up to a byte boundary. Write and Read use this value as the modulus for bit offsets.
+func (b *BloomFilter) Size() int {
+	return b.bitset.Size()
+}
+
+// Write sets, for each of the five SHA1_ints, the bit at the offset given by
+// that value modulo the BitSet size.
+// The returned bool reports a collision: it is true only when every one of
+// those bits was already set before this call.
+// If the BitSet reports an error, Write returns false and that error; bits set
+// earlier in the same call stay set.
+// NOTE(review): a BitSet size of zero would make the modulo panic - confirm the
+// bitset package never yields one.
+func (b *BloomFilter) Write(sha1_ints SHA1_ints) (bool, error) {
+	size := uint32(b.bitset.Size())
+	allAlreadySet := true
+	for _, word := range sha1_ints {
+		idx := int(word % size)
+		wasSet, err := b.bitset.GetBitN(idx)
+		if err != nil {
+			return false, err
+		}
+		if !wasSet {
+			allAlreadySet = false
+		}
+		if err := b.bitset.SetBitN(idx); err != nil {
+			return false, err
+		}
+	}
+	return allAlreadySet, nil
+}
+
+// Read looks up, for each of the five SHA1_ints, the bit at the offset given
+// by that value modulo the BitSet size.
+// It returns the five bit values plus a convenience bool that is true only
+// when all five are set.
+// If the BitSet reports an error, Read returns the values gathered so far,
+// false, and that error.
+func (b *BloomFilter) Read(sha1_ints SHA1_ints) (FilterVals, bool, error) {
+	var vals FilterVals
+	size := uint32(b.bitset.Size())
+	allSet := true
+	for i := range sha1_ints {
+		bit, err := b.bitset.GetBitN(int(sha1_ints[i] % size))
+		vals[i] = bit
+		if err != nil {
+			return vals, false, err
+		}
+		allSet = allSet && bit
+	}
+	return vals, allSet, nil
+}
diff --git a/bloomfilter_test.go b/bloomfilter_test.go
@@ -0,0 +1,72 @@
+package bloomfilter
+
+import (
+	"testing"
+	"fmt"
+	"io/ioutil"
+	"bytes"
+)
+
+// TestFilter writes every word of a local dictionary file into a filter of
+// 800000 bits, checks that each word reads back as present, checks that a
+// gibberish token does not, and prints the observed collision rate.
+// The test is skipped until dict_file is set to a real path.
+func TestFilter(t *testing.T) {
+	dict_file := ""
+	// SET THIS LINE TO *YOUR* DICT FILE
+	// dict_file = "/usr/share/dict/american-english" // ubuntu
+	if dict_file == "" {
+		// Report the test as skipped rather than silently passing.
+		t.Skip("set dict_file in TestFilter to be a full path to a dictionary file, and rerun")
+	}
+	var size uint32 = 800000
+	bf := NewBloomFilter(size)
+	dictBytes, err := ioutil.ReadFile(dict_file)
+	if err != nil {
+		// Nothing below is meaningful without the dictionary contents.
+		t.Fatalf("reading %q: %v", dict_file, err)
+	}
+	collisions := 0
+	writes := 0
+	for _, word := range bytes.Split(dictBytes, []byte("\n")) {
+		if len(word) == 0 {
+			// Blank lines, including the empty element after a final newline, are not words.
+			continue
+		}
+		ints, err := GetSHA1_ints(string(word))
+		if err != nil {
+			t.Errorf("hashing %q: %v", word, err)
+			continue
+		}
+		collision, err := bf.Write(ints)
+		if err != nil {
+			t.Errorf("writing %q: %v", word, err)
+			continue
+		}
+		_, inFilter, err := bf.Read(ints)
+		if err != nil {
+			t.Errorf("reading back %q: %v", word, err)
+			continue
+		}
+		if !inFilter {
+			t.Errorf("%v sha1_ints do not all read as true after a write", ints)
+		}
+		writes++
+		if collision {
+			collisions++
+		}
+	}
+	if writes == 0 {
+		// Avoids a NaN rate below and flags an empty or unreadable dictionary.
+		t.Fatalf("no words were written from %q", dict_file)
+	}
+	// make sure gibberish is not found in the filter
+	ints, err := GetSHA1_ints("azzxxxdddhhhu")
+	if err != nil {
+		t.Fatalf("hashing gibberish token: %v", err)
+	}
+	_, inFilter, err := bf.Read(ints)
+	if err != nil {
+		t.Fatalf("reading gibberish token: %v", err)
+	}
+	if inFilter {
+		t.Errorf("non dict word was found in filter?")
+	}
+
+	fmt.Printf("writes: %d, collisions (false positives): %d\n", writes, collisions)
+	rate := 100.0 - ((float64(collisions) / float64(writes)) * 100.0)
+	fmt.Printf("filter of size %d will be correct %f of the time\n", size, rate)
+}