The size, in bits, of the filter
The capacity, in entries, of the filter
The probability of a false positive that is tolerable
-The number of bits readily available to use for addresing filter bits
+The number of bits readily available to use for addressing filter bits
There is one major tunable that is not directly related to the above:
k: the number of bits set in the filter per entry
Based on these parameters, a combination of k=4 and k=5 provides the behavior
that bup needs. As such, I've implemented bloom addressing, adding and
checking functions in C for these two values. Because k=5 requires less space
-and gives better overall pfalse_positive perofrmance, it is preferred if a
+and gives better overall pfalse_positive performance, it is preferred if a
table with k=5 can represent the repository.
None of this tells us what max_pfalse_positive to choose.
Brandon Low <lostlogic@lostlogicx.com> 2011-02-04
"""
-import sys, os, math, mmap
+
+from __future__ import absolute_import
+import sys, os, math, mmap, struct
+
from bup import _helpers
-from bup.helpers import *
+from bup.helpers import (debug1, debug2, log, mmap_read, mmap_readwrite,
+ mmap_readwrite_private, unlink)
+
BLOOM_VERSION = 2
MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
bloom_contains = _helpers.bloom_contains
bloom_add = _helpers.bloom_add
+# FIXME: check bloom create() and ShaBloom handling/ownership of "f".
+# The ownership semantics should be clarified since the caller needs
+# to know who is responsible for closing it.
class ShaBloom:
- """Wrapper which contains data from multiple index files.
- """
+ """Wrapper which contains data from multiple index files. """
def __init__(self, filename, f=None, readwrite=False, expected=-1):
self.name = filename
self.rwfile = None
k = self.k
return 100*(1-math.exp(-k*float(n)/m))**k
+ def add(self, ids):
+ """Add the hashes in ids (packed binary 20-bytes) to the filter."""
+ if not self.map:
+ raise Exception("Cannot add to closed bloom")
+ self.entries += bloom_add(self.map, ids, self.bits, self.k)
+
def add_idx(self, ix):
- """Add the object to the filter, return current pfalse_positive."""
- if not self.map: raise Exception, "Cannot add to closed bloom"
- self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
+ """Add the object to the filter."""
+ self.add(ix.shatable)
self.idxnames.append(os.path.basename(ix.name))
def exists(self, sha):
- """Return nonempty if the object probably exists in the bloom filter."""
+ """Return nonempty if the object probably exists in the bloom filter.
+
+ If this function returns false, the object definitely does not exist.
+ If it returns true, there is a small probability that it exists
+ anyway, so you'll have to check it some other way.
+ """
global _total_searches, _total_steps
_total_searches += 1
- if not self.map: return None
+ if not self.map:
+ return None
found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
_total_steps += steps
return found
- @classmethod
- def create(cls, name, expected, delaywrite=None, f=None, k=None):
- """Create and return a bloom filter for `expected` entries."""
- bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
- k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
- if bits > MAX_BLOOM_BITS[k]:
- log('bloom: warning, max bits exceeded, non-optimal\n')
- bits = MAX_BLOOM_BITS[k]
- debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
- f = f or open(name, 'w+b')
- f.write('BLOM')
- f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
- assert(f.tell() == 16)
- # NOTE: On some systems this will not extend+zerofill, but it does on
- # darwin, linux, bsd and solaris.
- f.truncate(16+2**bits)
- f.seek(0)
- if delaywrite != None and not delaywrite:
- # tell it to expect very few objects, forcing a direct mmap
- expected = 1
- return cls(name, f=f, readwrite=True, expected=expected)
-
def __len__(self):
return int(self.entries)
+def create(name, expected, delaywrite=None, f=None, k=None):
+ """Create and return a bloom filter for `expected` entries."""
+ bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
+ k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
+ if bits > MAX_BLOOM_BITS[k]:
+ log('bloom: warning, max bits exceeded, non-optimal\n')
+ bits = MAX_BLOOM_BITS[k]
+ debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
+ f = f or open(name, 'w+b')
+ f.write('BLOM')
+ f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
+ assert(f.tell() == 16)
+ # NOTE: On some systems this will not extend+zerofill, but it does on
+ # darwin, linux, bsd and solaris.
+ f.truncate(16+2**bits)
+ f.seek(0)
+ if delaywrite != None and not delaywrite:
+ # tell it to expect very few objects, forcing a direct mmap
+ expected = 1
+ return ShaBloom(name, f=f, readwrite=True, expected=expected)
+
+
+def clear_bloom(dir):
+ unlink(os.path.join(dir, 'bup.bloom'))