from bup.helpers import *
from bup import _helpers, path
-MIDX_VERSION = 2
+MIDX_VERSION = 4
-"""Bloom constants:
-These bloom constants were chosen as a combination of convenience and quality.
+"""Discussion of bloom constants for bup:
There are four basic things to consider when building a bloom filter:
The size, in bits, of the filter
The probability of a false positive that is tolerable
The number of bits readily available to use for addressing filter bits
-Based on those four considerations, there are two basic filter tunables:
+There is one major tunable that is not directly related to the above:
k: the number of bits set in the filter per entry
-pfmax: the maximum pfalse_positive before growing the filter.
-Here's a wall of numbers showing the relationship between these two and the
-ratio between the size of the filter in bits and the entries in the filter:
+Here's a wall of numbers showing the relationship between k; the ratio between
+the filter size in bits and the entries in the filter; and pfalse_positive:
mn|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
This eliminates pretty neatly any k>6 as long as we use the raw SHA for
addressing.
-filter size scales linearly with reposize for a given k and pfalse.
+filter size scales linearly with repository size for a given k and pfalse.
Here's a table of filter sizes for a 1 TiB repository:
* We want to be able to have a single bloom address entire repositories of
reasonable size.
-Based on those parameters, k=4 or k=5 seem to be the most reasonable options.
-k=5 is a bit limited on repository size, but not terrible. k=4 gives "plenty"
-of repository space, but has 3 times the pfalse positive when the filter is
-relatively empty. k=5 is trivial to code, so I did that. It should be pretty
-easy to make the bloom filter adapt when the repository requires more address
-bits than k=5 allows and switch down to k=4.
+Based on these parameters, a combination of k=4 and k=5 provides the behavior
+that bup needs. As such, I've implemented bloom addressing, adding and
+checking functions in C for these two values. Because k=5 requires less space
+and gives better overall pfalse_positive performance, it is preferred if a
+table with k=5 can represent the repository.
+
+None of this tells us what max_pfalse_positive to choose.
+
Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
"""
-BLOOM_VERSION = 1
-MAX_BITS_EACH = 32
-BLOOM_HASHES = 5
-MAX_BLOOM_BITS = 29
-MAX_PFALSE_POSITIVE = 1.
+BLOOM_VERSION = 2
+MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
+MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
+MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
verbose = 0
ignore_midx = 0
return self._ofs_from_idx(idx)
return None
- def exists(self, hash):
+ def exists(self, hash, want_source=False):
"""Return nonempty if the object exists in this index."""
- return hash and (self._idx_from_hash(hash) != None) and True or None
+ if hash and (self._idx_from_hash(hash) != None):
+ return want_source and self.name or True
+ return None
def __len__(self):
return int(self.fanout[255])
str(buffer(self.map, 0, 256*4))))
self.fanout.append(0) # entry "-1"
nsha = self.fanout[255]
- self.shatable = buffer(self.map, 256*4, nsha*24)
+ self.sha_ofs = 256*4
+ self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
def _ofs_from_idx(self, idx):
return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
str(buffer(self.map, 8, 256*4))))
self.fanout.append(0) # entry "-1"
nsha = self.fanout[255]
- self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
+ self.sha_ofs = 8 + 256*4
+ self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
self.ofstable = buffer(self.map,
- 8 + 256*4 + nsha*20 + nsha*4,
+ self.sha_ofs + nsha*20 + nsha*4,
nsha*4)
self.ofs64table = buffer(self.map,
8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
class ShaBloom:
"""Wrapper which contains data from multiple index files.
- Multiple index (.midx) files constitute a wrapper around index (.idx) files
- and make it possible for bup to expand Git's indexing capabilities to vast
- amounts of files.
"""
- def __init__(self, filename, readwrite=False):
+ def __init__(self, filename, f=None, readwrite=False, expected=-1):
self.name = filename
+ self.rwfile = None
+ self.map = None
assert(filename.endswith('.bloom'))
if readwrite:
- self.rwfile = open(filename, 'r+b')
- self.map = mmap_readwrite(self.rwfile, close=False)
+ assert(expected > 0)
+ self.rwfile = f = f or open(filename, 'r+b')
+ f.seek(0)
+
+ # Decide if we want to mmap() the pages as writable ('immediate'
+ # write) or else map them privately for later writing back to
+ # the file ('delayed' write). A bloom table's write access
+ # pattern is such that we dirty almost all the pages after adding
+ # very few entries. But the table is so big that dirtying
+ # *all* the pages often exceeds Linux's default
+ # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
+ # thus causing it to start flushing the table before we're
+ # finished... even though there's more than enough space to
+ # store the bloom table in RAM.
+ #
+ # To work around that behaviour, if we calculate that we'll
+ # probably end up touching the whole table anyway (at least
+ # one bit flipped per memory page), let's use a "private" mmap,
+ # which defeats Linux's ability to flush it to disk. Then we'll
+ # flush it as one big lump during close().
+ pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
+ self.delaywrite = expected > pages
+ debug1('bloom: delaywrite=%r\n' % self.delaywrite)
+ if self.delaywrite:
+ self.map = mmap_readwrite_private(self.rwfile, close=False)
+ else:
+ self.map = mmap_readwrite(self.rwfile, close=False)
else:
self.rwfile = None
- self.map = mmap_read(open(filename, 'rb'))
- if str(self.map[0:4]) != 'BLOM':
- log('Warning: skipping: invalid BLOM header in %r\n' % filename)
+ f = f or open(filename, 'rb')
+ self.map = mmap_read(f)
+ got = str(self.map[0:4])
+ if got != 'BLOM':
+ log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
return self._init_failed()
ver = struct.unpack('!I', self.map[4:8])[0]
if ver < BLOOM_VERSION:
% (ver, filename))
return self._init_failed()
- self.bits, self.entries = struct.unpack('!II', self.map[8:16])
+ self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
idxnamestr = str(self.map[16 + 2**self.bits:])
if idxnamestr:
self.idxnames = idxnamestr.split('\0')
def _init_failed(self):
if self.map:
- self.map.close()
self.map = None
if self.rwfile:
self.rwfile.close()
self.close()
def close(self):
- if self.map:
- if self.rwfile:
- debug2("bloom: closing with %d entries\n" % self.entries)
- self.map[12:16] = struct.pack('!I', self.entries)
+ if self.map and self.rwfile:
+ debug2("bloom: closing with %d entries\n" % self.entries)
+ self.map[12:16] = struct.pack('!I', self.entries)
+ if self.delaywrite:
+ self.rwfile.seek(0)
+ self.rwfile.write(self.map)
+ else:
self.map.flush()
- if self.rwfile:
self.rwfile.seek(16 + 2**self.bits)
if self.idxnames:
self.rwfile.write('\0'.join(self.idxnames))
def pfalse_positive(self, additional=0):
n = self.entries + additional
m = 8*2**self.bits
- k = BLOOM_HASHES
+ k = self.k
return 100*(1-math.exp(-k*float(n)/m))**k
def add_idx(self, ix):
"""Add the object to the filter, return current pfalse_positive."""
if not self.map: raise Exception, "Cannot add to closed bloom"
- self.entries += bloom_add(self.map, 16, ix.shatable, self.bits)
+ self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
self.idxnames.append(os.path.basename(ix.name))
def exists(self, sha):
global _total_searches, _total_steps
_total_searches += 1
if not self.map: return None
- found, steps = bloom_contains(self.map, 16, str(sha), self.bits)
+ found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
_total_steps += steps
return found
@classmethod
- def create(cls, name, readwrite=False, expected=100000):
+ def create(cls, name, expected, delaywrite=None, f=None, k=None):
"""Create and return a bloom filter for `expected` entries."""
bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
- if bits > MAX_BLOOM_BITS:
+ k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
+ if bits > MAX_BLOOM_BITS[k]:
log('bloom: warning, max bits exceeded, non-optimal\n')
- bits = MAX_BLOOM_BITS
- debug1('bloom: using 2^%d bytes for bloom filter\n' % bits)
- f = open(name, 'wb')
+ bits = MAX_BLOOM_BITS[k]
+ debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
+ f = f or open(name, 'w+b')
f.write('BLOM')
- f.write(struct.pack('!III', BLOOM_VERSION, bits, 0))
+ f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
assert(f.tell() == 16)
- f.write('\0'*2**bits)
- f.close()
- return cls(name, readwrite=readwrite)
+ # NOTE: On some systems this will not extend+zerofill, but it does on
+ # darwin, linux, bsd and solaris.
+ f.truncate(16+2**bits)
+ f.seek(0)
+ if delaywrite != None and not delaywrite:
+ # tell it to expect very few objects, forcing a direct mmap
+ expected = 1
+ return cls(name, f=f, readwrite=True, expected=expected)
def __len__(self):
return self.entries
self.bits = _helpers.firstword(self.map[8:12])
self.entries = 2**self.bits
self.fanout = buffer(self.map, 12, self.entries*4)
- shaofs = 12 + self.entries*4
- nsha = self._fanget(self.entries-1)
- self.shatable = buffer(self.map, shaofs, nsha*20)
- self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
+ self.sha_ofs = 12 + self.entries*4
+ self.nsha = nsha = self._fanget(self.entries-1)
+ self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
+ self.whichlist = buffer(self.map, self.sha_ofs + nsha*20, nsha*4)
+ self.idxname_ofs = self.sha_ofs + 24*nsha
+ self.idxnames = str(self.map[self.idxname_ofs:]).split('\0')
def _init_failed(self):
self.bits = 0
def _get(self, i):
return str(self.shatable[i*20:(i+1)*20])
- def exists(self, hash):
+ def _get_idx_i(self, i):
+ return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
+
+ def _get_idxname(self, i):
+ return self.idxnames[self._get_idx_i(i)]
+
+ def exists(self, hash, want_source=False):
"""Return nonempty if the object exists in the index files."""
global _total_searches, _total_steps
_total_searches += 1
end = mid
endv = _helpers.firstword(v)
else: # got it!
- return True
+ return want_source and self._get_idxname(mid) or True
return None
def __iter__(self):
def __len__(self):
return sum(len(pack) for pack in self.packs)
- def exists(self, hash):
+ def exists(self, hash, want_source=False):
"""Return nonempty if the object exists in the index files."""
global _total_searches
_total_searches += 1
for i in xrange(len(self.packs)):
p = self.packs[i]
_total_searches -= 1 # will be incremented by sub-pack
- if p.exists(hash):
+ ix = p.exists(hash, want_source=want_source)
+ if ix:
# reorder so most recently used packs are searched first
self.packs = [p] + self.packs[:i] + self.packs[i+1:]
- return p.name
+ return ix
self.do_bloom = True
return None
debug1('PackIdxList: using %d index%s.\n'
% (len(self.packs), len(self.packs)!=1 and 'es' or ''))
- def packname_containing(self, hash):
- # figure out which pack contains a given hash.
- # FIXME: if the midx file format would just *store* this information,
- # we could calculate it a lot more efficiently. But it's not needed
- # often, so let's do it like this.
- for f in glob.glob(os.path.join(self.dir,'*.idx')):
- full = os.path.join(self.dir, f)
- try:
- ix = open_idx(full)
- except GitError, e:
- add_error(e)
- continue
- if ix.exists(hash):
- return full
-
def add(self, hash):
"""Insert an additional object in the list."""
self.also.add(hash)
raise GitError(
"PackWriter not opened or can't check exists w/o objcache")
- def exists(self, id):
+ def exists(self, id, want_source=False):
"""Return non-empty if an object is found in the object cache."""
self._require_objcache()
- return self.objcache.exists(id)
+ return self.objcache.exists(id, want_source=want_source)
def maybe_write(self, type, content):
"""Write an object to the pack file if not present and return its id."""