bloom: avoid kernel disk flushes when we dirty a lot of pages.

[bup.git] / lib / bup / git.py
diff --git a/lib/bup/git.py b/lib/bup/git.py

index e2808ec3b395562c0525018785e9e70e3a3c3fce..b75e80234b600aff7f0beeaa4247041b941e2c4f 100644 (file)
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -6,10 +6,9 @@ import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
  from bup.helpers import *
  from bup import _helpers, path
  
-MIDX_VERSION = 2
+MIDX_VERSION = 4
  
-"""Bloom constants:
-These bloom constants were chosen as a combination of convenience and quality.
+"""Discussion of bloom constants for bup:
  
  There are four basic things to consider when building a bloom filter:
  The size, in bits, of the filter
@@ -17,12 +16,11 @@ The capacity, in entries, of the filter
  The probability of a false positive that is tolerable
  The number of bits readily available to use for addresing filter bits
  
-Based on those four considerations, there are two basic filter tunables:
+There is one major tunable that is not directly related to the above:
  k: the number of bits set in the filter per entry
-pfmax: the maximum pfalse_positive before growing the filter.
  
-Here's a wall of numbers showing the relationship between these two and the
-ratio between the size of the filter in bits and the entries in the filter:
+Here's a wall of numbers showing the relationship between k; the ratio between
+the filter size in bits and the entries in the filter; and pfalse_positive:
  
  mn|k=3    |k=4    |k=5    |k=6    |k=7    |k=8    |k=9    |k=10   |k=11
   8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
@@ -63,7 +61,7 @@ pfalse|obj k=4     |cap k=4    |obj k=5  |cap k=5    |obj k=6 |cap k=6
  This eliminates pretty neatly any k>6 as long as we use the raw SHA for
  addressing.
  
-filter size scales linearly with reposize for a given k and pfalse.
+filter size scales linearly with repository size for a given k and pfalse.
  
  Here's a table of filter sizes for a 1 TiB repository:
  
@@ -81,19 +79,20 @@ faulting on the midx doesn't overcome the benefit of the bloom filter.
  * We want to be able to have a single bloom address entire repositories of
  reasonable size.
  
-Based on those parameters, k=4 or k=5 seem to be the most reasonable options.
-k=5 is a bit limited on repository size, but not terrible.  k=4 gives "plenty"
-of repository space, but has 3 times the pfalse positive when the filter is
-relatively empty.  k=5 is trivial to code, so I did that.  It should be pretty
-easy to make the bloom filter adapt when the repository requires more address
-bits than k=5 allows and switch down to k=4.
+Based on these parameters, a combination of k=4 and k=5 provides the behavior
+that bup needs.  As such, I've implemented bloom addressing, adding and
+checking functions in C for these two values.  Because k=5 requires less space
+and gives better overall pfalse_positive perofrmance, it is preferred if a
+table with k=5 can represent the repository.
+
+None of this tells us what max_pfalse_positive to choose.
+
  Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
  """
-BLOOM_VERSION = 1
-MAX_BITS_EACH = 32
-BLOOM_HASHES = 5
-MAX_BLOOM_BITS = 29
-MAX_PFALSE_POSITIVE = 1.
+BLOOM_VERSION = 2
+MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
+MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
+MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
  
  verbose = 0
  ignore_midx = 0
@@ -248,9 +247,11 @@ class PackIdx:
              return self._ofs_from_idx(idx)
          return None
  
-    def exists(self, hash):
+    def exists(self, hash, want_source=False):
          """Return nonempty if the object exists in this index."""
-        return hash and (self._idx_from_hash(hash) != None) and True or None
+        if hash and (self._idx_from_hash(hash) != None):
+            return want_source and self.name or True
+        return None
  
      def __len__(self):
          return int(self.fanout[255])
@@ -287,7 +288,8 @@ class PackIdxV1(PackIdx):
                                           str(buffer(self.map, 0, 256*4))))
          self.fanout.append(0)  # entry "-1"
          nsha = self.fanout[255]
-        self.shatable = buffer(self.map, 256*4, nsha*24)
+        self.sha_ofs = 256*4
+        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
  
      def _ofs_from_idx(self, idx):
          return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
@@ -311,9 +313,10 @@ class PackIdxV2(PackIdx):
                                           str(buffer(self.map, 8, 256*4))))
          self.fanout.append(0)  # entry "-1"
          nsha = self.fanout[255]
-        self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
+        self.sha_ofs = 8 + 256*4
+        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
          self.ofstable = buffer(self.map,
-                               8 + 256*4 + nsha*20 + nsha*4,
+                               self.sha_ofs + nsha*20 + nsha*4,
                                 nsha*4)
          self.ofs64table = buffer(self.map,
                                   8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
@@ -342,21 +345,47 @@ bloom_add = _helpers.bloom_add
  
  class ShaBloom:
      """Wrapper which contains data from multiple index files.
-    Multiple index (.midx) files constitute a wrapper around index (.idx) files
-    and make it possible for bup to expand Git's indexing capabilities to vast
-    amounts of files.
      """
-    def __init__(self, filename, readwrite=False):
+    def __init__(self, filename, f=None, readwrite=False, expected=-1):
          self.name = filename
+        self.rwfile = None
+        self.map = None
          assert(filename.endswith('.bloom'))
          if readwrite:
-            self.rwfile = open(filename, 'r+b')
-            self.map = mmap_readwrite(self.rwfile, close=False)
+            assert(expected > 0)
+            self.rwfile = f = f or open(filename, 'r+b')
+            f.seek(0)
+
+            # Decide if we want to mmap() the pages as writable ('immediate'
+            # write) or else map them privately for later writing back to
+            # the file ('delayed' write).  A bloom table's write access
+            # pattern is such that we dirty almost all the pages after adding
+            # very few entries.  But the table is so big that dirtying
+            # *all* the pages often exceeds Linux's default
+            # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
+            # thus causing it to start flushing the table before we're
+            # finished... even though there's more than enough space to
+            # store the bloom table in RAM.
+            #
+            # To work around that behaviour, if we calculate that we'll
+            # probably end up touching the whole table anyway (at least
+            # one bit flipped per memory page), let's use a "private" mmap,
+            # which defeats Linux's ability to flush it to disk.  Then we'll
+            # flush it as one big lump during close().
+            pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
+            self.delaywrite = expected > pages
+            debug1('bloom: delaywrite=%r\n' % self.delaywrite)
+            if self.delaywrite:
+                self.map = mmap_readwrite_private(self.rwfile, close=False)
+            else:
+                self.map = mmap_readwrite(self.rwfile, close=False)
          else:
              self.rwfile = None
-            self.map = mmap_read(open(filename, 'rb'))
-        if str(self.map[0:4]) != 'BLOM':
-            log('Warning: skipping: invalid BLOM header in %r\n' % filename)
+            f = f or open(filename, 'rb')
+            self.map = mmap_read(f)
+        got = str(self.map[0:4])
+        if got != 'BLOM':
+            log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
              return self._init_failed()
          ver = struct.unpack('!I', self.map[4:8])[0]
          if ver < BLOOM_VERSION:
@@ -368,7 +397,7 @@ class ShaBloom:
                  % (ver, filename))
              return self._init_failed()
  
-        self.bits, self.entries = struct.unpack('!II', self.map[8:16])
+        self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
          idxnamestr = str(self.map[16 + 2**self.bits:])
          if idxnamestr:
              self.idxnames = idxnamestr.split('\0')
@@ -377,7 +406,6 @@ class ShaBloom:
  
      def _init_failed(self):
          if self.map:
-            self.map.close()
              self.map = None
          if self.rwfile:
              self.rwfile.close()
@@ -392,12 +420,14 @@ class ShaBloom:
          self.close()
  
      def close(self):
-        if self.map:
-            if self.rwfile:
-                debug2("bloom: closing with %d entries\n" % self.entries)
-                self.map[12:16] = struct.pack('!I', self.entries)
+        if self.map and self.rwfile:
+            debug2("bloom: closing with %d entries\n" % self.entries)
+            self.map[12:16] = struct.pack('!I', self.entries)
+            if self.delaywrite:
+                self.rwfile.seek(0)
+                self.rwfile.write(self.map)
+            else:
                  self.map.flush()
-        if self.rwfile:
              self.rwfile.seek(16 + 2**self.bits)
              if self.idxnames:
                  self.rwfile.write('\0'.join(self.idxnames))
@@ -406,13 +436,13 @@ class ShaBloom:
      def pfalse_positive(self, additional=0):
          n = self.entries + additional
          m = 8*2**self.bits
-        k = BLOOM_HASHES
+        k = self.k
          return 100*(1-math.exp(-k*float(n)/m))**k
  
      def add_idx(self, ix):
          """Add the object to the filter, return current pfalse_positive."""
          if not self.map: raise Exception, "Cannot add to closed bloom"
-        self.entries += bloom_add(self.map, 16, ix.shatable, self.bits)
+        self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
          self.idxnames.append(os.path.basename(ix.name))
  
      def exists(self, sha):
@@ -420,25 +450,31 @@ class ShaBloom:
          global _total_searches, _total_steps
          _total_searches += 1
          if not self.map: return None
-        found, steps = bloom_contains(self.map, 16, str(sha), self.bits)
+        found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
          _total_steps += steps
          return found
  
      @classmethod
-    def create(cls, name, readwrite=False, expected=100000):
+    def create(cls, name, expected, delaywrite=None, f=None, k=None):
          """Create and return a bloom filter for `expected` entries."""
          bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
-        if bits > MAX_BLOOM_BITS:
+        k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
+        if bits > MAX_BLOOM_BITS[k]:
              log('bloom: warning, max bits exceeded, non-optimal\n')
-            bits = MAX_BLOOM_BITS
-        debug1('bloom: using 2^%d bytes for bloom filter\n' % bits)
-        f = open(name, 'wb')
+            bits = MAX_BLOOM_BITS[k]
+        debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
+        f = f or open(name, 'w+b')
          f.write('BLOM')
-        f.write(struct.pack('!III', BLOOM_VERSION, bits, 0))
+        f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
          assert(f.tell() == 16)
-        f.write('\0'*2**bits)
-        f.close()
-        return cls(name, readwrite=readwrite)
+        # NOTE: On some systems this will not extend+zerofill, but it does on
+        # darwin, linux, bsd and solaris.
+        f.truncate(16+2**bits)
+        f.seek(0)
+        if delaywrite != None and not delaywrite:
+            # tell it to expect very few objects, forcing a direct mmap
+            expected = 1
+        return cls(name, f=f, readwrite=True, expected=expected)
  
      def __len__(self):
          return self.entries
@@ -474,10 +510,12 @@ class PackMidx:
          self.bits = _helpers.firstword(self.map[8:12])
          self.entries = 2**self.bits
          self.fanout = buffer(self.map, 12, self.entries*4)
-        shaofs = 12 + self.entries*4
-        nsha = self._fanget(self.entries-1)
-        self.shatable = buffer(self.map, shaofs, nsha*20)
-        self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
+        self.sha_ofs = 12 + self.entries*4
+        self.nsha = nsha = self._fanget(self.entries-1)
+        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
+        self.whichlist = buffer(self.map, self.sha_ofs + nsha*20, nsha*4)
+        self.idxname_ofs = self.sha_ofs + 24*nsha
+        self.idxnames = str(self.map[self.idxname_ofs:]).split('\0')
  
      def _init_failed(self):
          self.bits = 0
@@ -494,7 +532,13 @@ class PackMidx:
      def _get(self, i):
          return str(self.shatable[i*20:(i+1)*20])
  
-    def exists(self, hash):
+    def _get_idx_i(self, i):
+        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
+
+    def _get_idxname(self, i):
+        return self.idxnames[self._get_idx_i(i)]
+
+    def exists(self, hash, want_source=False):
          """Return nonempty if the object exists in the index files."""
          global _total_searches, _total_steps
          _total_searches += 1
@@ -525,7 +569,7 @@ class PackMidx:
                  end = mid
                  endv = _helpers.firstword(v)
              else: # got it!
-                return True
+                return want_source and self._get_idxname(mid) or True
          return None
  
      def __iter__(self):
@@ -560,7 +604,7 @@ class PackIdxList:
      def __len__(self):
          return sum(len(pack) for pack in self.packs)
  
-    def exists(self, hash):
+    def exists(self, hash, want_source=False):
          """Return nonempty if the object exists in the index files."""
          global _total_searches
          _total_searches += 1
@@ -575,10 +619,11 @@ class PackIdxList:
          for i in xrange(len(self.packs)):
              p = self.packs[i]
              _total_searches -= 1  # will be incremented by sub-pack
-            if p.exists(hash):
+            ix = p.exists(hash, want_source=want_source)
+            if ix:
                  # reorder so most recently used packs are searched first
                  self.packs = [p] + self.packs[:i] + self.packs[i+1:]
-                return p.name
+                return ix
          self.do_bloom = True
          return None
  
@@ -658,21 +703,6 @@ class PackIdxList:
          debug1('PackIdxList: using %d index%s.\n'
              % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
  
-    def packname_containing(self, hash):
-        # figure out which pack contains a given hash.
-        # FIXME: if the midx file format would just *store* this information,
-        # we could calculate it a lot more efficiently.  But it's not needed
-        # often, so let's do it like this.
-        for f in glob.glob(os.path.join(self.dir,'*.idx')):
-            full = os.path.join(self.dir, f)
-            try:
-                ix = open_idx(full)
-            except GitError, e:
-                add_error(e)
-                continue
-            if ix.exists(hash):
-                return full
-
      def add(self, hash):
          """Insert an additional object in the list."""
          self.also.add(hash)
@@ -800,10 +830,10 @@ class PackWriter:
              raise GitError(
                      "PackWriter not opened or can't check exists w/o objcache")
  
-    def exists(self, id):
+    def exists(self, id, want_source=False):
          """Return non-empty if an object is found in the object cache."""
          self._require_objcache()
-        return self.objcache.exists(id)
+        return self.objcache.exists(id, want_source=want_source)
  
      def maybe_write(self, type, content):
          """Write an object to the pack file if not present and return its id."""