# Module-level aliases for the C-accelerated helpers in _helpers, so the
# hot bloom/midx lookup paths use a single global-name lookup instead of
# an attribute lookup per call.
extract_bits = _helpers.extract_bits

bloom_contains = _helpers.bloom_contains
bloom_add = _helpers.bloom_add
-
class ShaBloom:
    """A bloom filter over the shas of multiple index files.

    Backed by an on-disk '.bloom' file with this layout: 4-byte 'BLOM'
    magic, a big-endian version word, then (bits, k, entries) as '!HHI',
    then the 2**bits-byte bit table, then a NUL-joined list of the names
    of the .idx files the filter covers.
    """
    def __init__(self, filename, f=None, readwrite=False, expected=-1):
        # filename: path to the table; must end with '.bloom'.
        # f: optional already-open file object for filename.
        # readwrite: map the table writable; requires expected > 0.
        # expected: number of entries we expect to add (readwrite only);
        #     used below to choose between the two mmap strategies.
        self.name = filename
        self.rwfile = None
        self.map = None
        assert(filename.endswith('.bloom'))
        if readwrite:
            assert(expected > 0)
            self.rwfile = f = f or open(filename, 'r+b')
            f.seek(0)

            # Decide if we want to mmap() the pages as writable ('immediate'
            # write) or else map them privately for later writing back to
            # the file ('delayed' write). A bloom table's write access
            # pattern is such that we dirty almost all the pages after adding
            # very few entries. But the table is so big that dirtying
            # *all* the pages often exceeds Linux's default
            # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
            # thus causing it to start flushing the table before we're
            # finished... even though there's more than enough space to
            # store the bloom table in RAM.
            #
            # To work around that behaviour, if we calculate that we'll
            # probably end up touching the whole table anyway (at least
            # one bit flipped per memory page), let's use a "private" mmap,
            # which defeats Linux's ability to flush it to disk. Then we'll
            # flush it as one big lump during close().
            pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
            self.delaywrite = expected > pages
            debug1('bloom: delaywrite=%r\n' % self.delaywrite)
            if self.delaywrite:
                self.map = mmap_readwrite_private(self.rwfile, close=False)
            else:
                self.map = mmap_readwrite(self.rwfile, close=False)
        else:
            self.rwfile = None
            f = f or open(filename, 'rb')
            self.map = mmap_read(f)
        # Validate the magic and version; any mismatch drops the object
        # into the harmless "failed" state via _init_failed().
        got = str(self.map[0:4])
        if got != 'BLOM':
            log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
            return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < BLOOM_VERSION:
            log('Warning: ignoring old-style (v%d) bloom %r\n'
                % (ver, filename))
            return self._init_failed()
        if ver > BLOOM_VERSION:
            log('Warning: ignoring too-new (v%d) bloom %r\n'
                % (ver, filename))
            return self._init_failed()

        # Header bytes 8:16: table size exponent, hash count, entry count.
        self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
        # The NUL-joined idx-name list trails the 2**bits-byte table.
        idxnamestr = str(self.map[16 + 2**self.bits:])
        if idxnamestr:
            self.idxnames = idxnamestr.split('\0')
        else:
            self.idxnames = []

    def _init_failed(self):
        # Reset to a state where valid() is false and close() is a no-op.
        if self.map:
            self.map = None
        if self.rwfile:
            self.rwfile.close()
            self.rwfile = None
        self.idxnames = []
        self.bits = self.entries = 0

    def valid(self):
        # Nonempty/nonzero only if the header parsed successfully.
        return self.map and self.bits

    def __del__(self):
        # Best-effort flush if the caller forgot to close() explicitly.
        self.close()

    def close(self):
        # Write back a still-open writable table, then release everything.
        if self.map and self.rwfile:
            debug2("bloom: closing with %d entries\n" % self.entries)
            # Refresh the on-disk entry count (header bytes 12:16).
            self.map[12:16] = struct.pack('!I', self.entries)
            if self.delaywrite:
                # Private map: dump the whole table over the file in one lump.
                self.rwfile.seek(0)
                self.rwfile.write(self.map)
            else:
                # Shared map: just sync the dirty pages.
                self.map.flush()
            # Rewrite the idx-name list that trails the bit table.
            self.rwfile.seek(16 + 2**self.bits)
            if self.idxnames:
                self.rwfile.write('\0'.join(self.idxnames))
        self._init_failed()

    def pfalse_positive(self, additional=0):
        # Expected false-positive rate, as a percentage, once `additional`
        # more entries have been added: 100 * (1 - e**(-k*n/m)) ** k,
        # where m is the table size in bits.
        n = self.entries + additional
        m = 8*2**self.bits
        k = self.k
        return 100*(1-math.exp(-k*float(n)/m))**k

    def add_idx(self, ix):
        """Add all of ix's shas to the filter and record ix's name."""
        if not self.map: raise Exception, "Cannot add to closed bloom"
        self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
        self.idxnames.append(os.path.basename(ix.name))

    def exists(self, sha):
        """Return nonempty if the object probably exists in the bloom filter."""
        global _total_searches, _total_steps
        _total_searches += 1
        if not self.map: return None
        found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
        _total_steps += steps
        return found

    @classmethod
    def create(cls, name, expected, delaywrite=None, f=None, k=None):
        """Create and return a bloom filter for `expected` entries."""
        # Size the table for ~MAX_BITS_EACH bits per expected entry, capped
        # at the largest table the chosen k supports.
        bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
        k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
        if bits > MAX_BLOOM_BITS[k]:
            log('bloom: warning, max bits exceeded, non-optimal\n')
            bits = MAX_BLOOM_BITS[k]
        debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
        f = f or open(name, 'w+b')
        f.write('BLOM')
        f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
        assert(f.tell() == 16)
        # NOTE: On some systems this will not extend+zerofill, but it does on
        # darwin, linux, bsd and solaris.
        f.truncate(16+2**bits)
        f.seek(0)
        if delaywrite != None and not delaywrite:
            # tell it to expect very few objects, forcing a direct mmap
            expected = 1
        return cls(name, f=f, readwrite=True, expected=expected)

    def __len__(self):
        return int(self.entries)
-
-
class PackMidx:
    """Wrapper which contains data from multiple index files.
    Multiple index (.midx) files constitute a wrapper around index (.idx) files
    and make it possible for bup to expand Git's indexing capabilities to vast
    amounts of files.
    """
    def __init__(self, filename):
        # filename: path to the table; must end with '.midx'.
        self.name = filename
        # force_keep tells callers whether the on-disk file is worth
        # keeping even though we refuse to use it.
        self.force_keep = False
        assert(filename.endswith('.midx'))
        self.map = mmap_read(open(filename))
        # Validate the 4-byte magic and the version word; any mismatch
        # drops the object into the "empty" state via _init_failed().
        if str(self.map[0:4]) != 'MIDX':
            log('Warning: skipping: invalid MIDX header in %r\n' % filename)
            self.force_keep = True
            return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < MIDX_VERSION:
            log('Warning: ignoring old-style (v%d) midx %r\n'
                % (ver, filename))
            self.force_keep = False # old stuff is boring
            return self._init_failed()
        if ver > MIDX_VERSION:
            log('Warning: ignoring too-new (v%d) midx %r\n'
                % (ver, filename))
            self.force_keep = True # new stuff is exciting
            return self._init_failed()

        # Layout after the version word: a 4-byte bit count, a fanout
        # table of 2**bits 4-byte entries, nsha sorted 20-byte shas, one
        # 4-byte per-sha index into the idx-name list, then the
        # NUL-joined idx names.  buffer() gives zero-copy views into the
        # mmap'd file.
        self.bits = _helpers.firstword(self.map[8:12])
        self.entries = 2**self.bits
        self.fanout = buffer(self.map, 12, self.entries*4)
        self.sha_ofs = 12 + self.entries*4
        self.nsha = nsha = self._fanget(self.entries-1)
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.whichlist = buffer(self.map, self.sha_ofs + nsha*20, nsha*4)
        self.idxname_ofs = self.sha_ofs + 24*nsha
        self.idxnames = str(self.map[self.idxname_ofs:]).split('\0')

    def _init_failed(self):
        # Minimal state so exists()/__iter__/__len__ act as "empty".
        self.bits = 0
        self.entries = 1
        self.fanout = buffer('\0\0\0\0')
        self.shatable = buffer('\0'*20)
        self.idxnames = []

    def _fanget(self, i):
        # Decode fanout entry i; the counts are cumulative, so
        # _fanget(self.entries-1) is the total number of shas.
        start = i*4
        s = self.fanout[start:start+4]
        return _helpers.firstword(s)

    def _get(self, i):
        # The i'th 20-byte binary sha, as a str.
        return str(self.shatable[i*20:(i+1)*20])

    def _get_idx_i(self, i):
        # Index (into self.idxnames) of the idx owning the i'th sha.
        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]

    def _get_idxname(self, i):
        # Name of the .idx file that contains the i'th sha.
        return self.idxnames[self._get_idx_i(i)]

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches, _total_steps
        _total_searches += 1
        want = str(hash)
        # Use the fanout table to narrow the search to the run of shas
        # sharing `want`'s top self.bits bits, then interpolation-search
        # within that run.  startv/endv track the 32-bit value bounds of
        # the current window for the interpolation step.
        el = extract_bits(want, self.bits)
        if el:
            start = self._fanget(el-1)
            startv = el << (32-self.bits)
        else:
            start = 0
            startv = 0
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1 # lookup table is a step
        hashv = _helpers.firstword(hash)
        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
        while start < end:
            _total_steps += 1
            #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
            # Interpolate where `want` should sit between startv and endv
            # (Python 2 integer division keeps mid in [start, end-1]).
            mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
            #print '  %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
            v = self._get(mid)
            #print '    %08x' % self._num(v)
            if v < want:
                start = mid+1
                startv = _helpers.firstword(v)
            elif v > want:
                end = mid
                endv = _helpers.firstword(v)
            else: # got it!
                return want_source and self._get_idxname(mid) or True
        return None

    def __iter__(self):
        # Yield each sha as a zero-copy 20-byte buffer, in sorted order.
        for i in xrange(self._fanget(self.entries-1)):
            yield buffer(self.shatable, i*20, 20)

    def __len__(self):
        # Total sha count, from the last (cumulative) fanout entry.
        return int(self._fanget(self.entries-1))
-
-