ShaBloom: Add k=4 support for large repositories

author Brandon Low <lostlogic@lostlogicx.com>

Mon, 7 Feb 2011 06:06:07 +0000 (22:06 -0800)

committer Avery Pennarun <apenwarr@gmail.com>

Mon, 7 Feb 2011 09:31:49 +0000 (01:31 -0800)
author Brandon Low <lostlogic@lostlogicx.com>
Mon, 7 Feb 2011 06:06:07 +0000 (22:06 -0800)
committer Avery Pennarun <apenwarr@gmail.com>
Mon, 7 Feb 2011 09:31:49 +0000 (01:31 -0800)
diff --git a/Documentation/bup-bloom.md b/Documentation/bup-bloom.md

index 4b947776f5d46978423e9d7591dffccac409528d..01373bff13288a3102e3784825f52aa2a2cd3f2f 100644 (file)
--- a/Documentation/bup-bloom.md
+++ b/Documentation/bup-bloom.md
@@ -8,7 +8,7 @@ bup-bloom - generates, regenerates, updates bloom filters
  
  # SYNOPSIS
  
-bup daemon [-d dir] [-o outfile]
+bup bloom [-d dir] [-o outfile] [-k hashes]
  
  # DESCRIPTION
  
@@ -23,8 +23,13 @@ it if needed.
      defaults to $BUP_DIR/objects/pack
  
  -o, --outfile=*outfile*
-:   the file to write the bloom filter to.
-    defaults to $dir/bup.bloom
+:   the file to write the bloom filter to.  defaults to
+    $dir/bup.bloom
+
+-k, --hashes=*hashes*
+:   number of hash functions to use; only 4 and 5 are valid.
+    defaults to 5 for repositories < 2TiB and 4 otherwise.
+    see comments in git.py for more on this value.
  
  # BUP
  
diff --git a/cmd/bloom-cmd.py b/cmd/bloom-cmd.py

index 768d5fe5fdd3243c0e21f6f85682a15c18596a86..44579ac462fed276ce7b939246e63083a90d9393 100755 (executable)
--- a/cmd/bloom-cmd.py
+++ b/cmd/bloom-cmd.py
@@ -8,6 +8,7 @@ bup bloom [options...]
  --
  o,output=  output bloom filename (default: auto-generated)
  d,dir=     input directory to look for idx files (default: auto-generated)
+k,hashes=  number of hash functions to use (4 or 5) (default: auto-generated)
  """
  
  def do_bloom(path, outfilename):
@@ -63,9 +64,8 @@ def do_bloom(path, outfilename):
      if b is None:
          tf = tempfile.NamedTemporaryFile(
                  dir=path, suffix='bup.bloom', delete=False)
-        tempname = tf.name
-        tf.close()
-        b = git.ShaBloom.create(tempname, readwrite=True, expected=add_count)
+        b = git.ShaBloom.create(
+                tf.name, f=tf, readwrite=True, expected=add_count, k=opt.k)
      count = 0
      for ix in add:
          progress('Writing bloom: %d/%d\r' % (count, len(add)))
@@ -85,6 +85,9 @@ o = options.Options(optspec)
  if extra:
      o.fatal('no positional parameters expected')
  
+if opt.k and opt.k not in (4,5):
+    o.fatal('only k values of 4 and 5 are supported')
+
  git.check_repo_or_die()
  
  do_bloom(opt.dir or git.repo('objects/pack'), opt.output)
diff --git a/lib/bup/_helpers.c b/lib/bup/_helpers.c

index ff4c86a7961e808f568569ed6539f3454317141e..e4f072da810af5fcd2e56c335e641a5be0977a37 100644 (file)
--- a/lib/bup/_helpers.c
+++ b/lib/bup/_helpers.c
@@ -77,73 +77,131 @@ static PyObject *firstword(PyObject *self, PyObject *args)
  }
  
  
-static void to_bloom_address_bitmask(unsigned const char *buf, const int nbits,
-                                    uint32_t *v, unsigned char *bitmask)
+typedef struct {
+    uint32_t high;
+    unsigned char low;
+} bits40_t;
+
+
+static void to_bloom_address_bitmask4(const bits40_t *buf,
+       const int nbits, uint64_t *v, unsigned char *bitmask)
+{
+    int bit;
+    uint64_t raw, mask;
+
+    mask = (1<<nbits) - 1;
+    raw = (((uint64_t)ntohl(buf->high)) << 8) | buf->low;
+    bit = (raw >> (37-nbits)) & 0x7;
+    *v = (raw >> (40-nbits)) & mask;
+    *bitmask = 1 << bit;
+}
+
+static void to_bloom_address_bitmask5(const uint32_t *buf,
+       const int nbits, uint32_t *v, unsigned char *bitmask)
  {
      int bit;
      uint32_t raw, mask;
  
      mask = (1<<nbits) - 1;
-    raw = ntohl(*(uint32_t *)buf);
+    raw = ntohl(*buf);
      bit = (raw >> (29-nbits)) & 0x7;
      *v = (raw >> (32-nbits)) & mask;
      *bitmask = 1 << bit;
  }
  
-static void bloom_add_entry(
-       unsigned char *bloom, int ofs, unsigned char *sha, int nbits)
-{
-    unsigned char bitmask, *end;
-    uint32_t v;
  
-    for (end = sha + 20; sha < end; sha += 4)
-    {
-       to_bloom_address_bitmask(sha, nbits, &v, &bitmask);
-       bloom[ofs+v] |= bitmask;
-    }
+#define BLOOM_SET_BIT(name, address, itype, otype) \
+static void name(unsigned char *bloom, const void *buf, const int nbits)\
+{\
+    unsigned char bitmask;\
+    otype v;\
+    address((itype *)buf, nbits, &v, &bitmask);\
+    bloom[16+v] |= bitmask;\
+}
+BLOOM_SET_BIT(bloom_set_bit4, to_bloom_address_bitmask4, bits40_t, uint64_t)
+BLOOM_SET_BIT(bloom_set_bit5, to_bloom_address_bitmask5, uint32_t, uint32_t)
+
+
+#define BLOOM_GET_BIT(name, address, itype, otype) \
+static int name(const unsigned char *bloom, const void *buf, const int nbits)\
+{\
+    unsigned char bitmask;\
+    otype v;\
+    address((itype *)buf, nbits, &v, &bitmask);\
+    return bloom[16+v] & bitmask;\
  }
+BLOOM_GET_BIT(bloom_get_bit4, to_bloom_address_bitmask4, bits40_t, uint64_t)
+BLOOM_GET_BIT(bloom_get_bit5, to_bloom_address_bitmask5, uint32_t, uint32_t)
+
  
  static PyObject *bloom_add(PyObject *self, PyObject *args)
  {
+    void (*bloom_set_bit)(unsigned char *, const void *, const int);
      unsigned char *sha = NULL, *bloom = NULL;
-    int ofs = 0, len = 0, blen = 0, nbits = 0;
-    int i;
+    unsigned char *end;
+    int len = 0, blen = 0, nbits = 0, k = 0;
+
+    if (!PyArg_ParseTuple(args, "w#s#ii", &bloom, &blen, &sha, &len, &nbits, &k))
+       return NULL;
  
-    if (!PyArg_ParseTuple(args, "w#is#i",
-                          &bloom, &blen, &ofs, &sha, &len, &nbits))
+    if (k == 5)
+    {
+       if (nbits > 29)
+           return NULL;
+       bloom_set_bit = &bloom_set_bit5;
+    }
+    else if (k == 4)
+    {
+       if (nbits > 37)
+           return NULL;
+       bloom_set_bit = &bloom_set_bit4;
+    }
+    else
         return NULL;
  
-    if (blen < 16+(1<<nbits) || len % 20 != 0 || nbits > 29)
+    if (blen < 16+(1<<nbits) || len % 20 != 0)
         return NULL;
  
-    for (i = 0; i < len; i += 20)
-       bloom_add_entry(bloom, ofs, &sha[i], nbits);
+    for (end = sha + len; sha < end; sha += 20/k)
+       (*bloom_set_bit)(bloom, sha, nbits);
  
-    return Py_BuildValue("i", i/20);
+    return Py_BuildValue("i", len/20);
  }
  
  static PyObject *bloom_contains(PyObject *self, PyObject *args)
  {
+    int (*bloom_get_bit)(const unsigned char *, const void *, const int);
      unsigned char *sha = NULL, *bloom = NULL;
-    int ofs = 0, len = 0, blen = 0, nbits = 0;
-    unsigned char bitmask, *end;
-    uint32_t v;
+    int len = 0, blen = 0, nbits = 0, k = 0;
+    unsigned char *end;
      int steps;
  
-    if (!PyArg_ParseTuple(args, "t#is#i",
-                          &bloom, &blen, &ofs, &sha, &len, &nbits))
+    if (!PyArg_ParseTuple(args, "t#s#ii", &bloom, &blen, &sha, &len, &nbits, &k))
         return NULL;
  
-    if (len != 20 || nbits > 29)
+    if (len != 20)
         return NULL;
  
-    for (steps = 1, end = sha + 20; sha < end; sha += 4, steps++)
+    if (k == 5)
      {
-       to_bloom_address_bitmask(sha, nbits, &v, &bitmask);
-       if (!(bloom[ofs+v] & bitmask))
-           return Py_BuildValue("Oi", Py_None, steps);
+       if (nbits > 29)
+           return NULL;
+       bloom_get_bit = &bloom_get_bit5;
+    }
+    else if (k == 4)
+    {
+       if (nbits > 37)
+           return NULL;
+       bloom_get_bit = &bloom_get_bit4;
      }
-    return Py_BuildValue("Oi", Py_True, 5);
+    else
+       return NULL;
+
+    for (steps = 1, end = sha + 20; sha < end; sha += 20/k, steps++)
+       if (!bloom_get_bit(bloom, sha, nbits))
+           return Py_BuildValue("Oi", Py_None, steps);
+
+    return Py_BuildValue("Oi", Py_True, k);
  }
  
  
diff --git a/lib/bup/git.py b/lib/bup/git.py

index e2808ec3b395562c0525018785e9e70e3a3c3fce..19fe9bc6b8682f1ff49db03e6374b83cffe36245 100644 (file)
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -8,8 +8,7 @@ from bup import _helpers, path
  
  MIDX_VERSION = 2
  
-"""Bloom constants:
-These bloom constants were chosen as a combination of convenience and quality.
+"""Discussion of bloom constants for bup:
  
  There are four basic things to consider when building a bloom filter:
  The size, in bits, of the filter
@@ -17,12 +16,11 @@ The capacity, in entries, of the filter
  The probability of a false positive that is tolerable
  The number of bits readily available to use for addresing filter bits
  
-Based on those four considerations, there are two basic filter tunables:
+There is one major tunable that is not directly related to the above:
  k: the number of bits set in the filter per entry
-pfmax: the maximum pfalse_positive before growing the filter.
  
-Here's a wall of numbers showing the relationship between these two and the
-ratio between the size of the filter in bits and the entries in the filter:
+Here's a wall of numbers showing the relationship between k; the ratio between
+the filter size in bits and the entries in the filter; and pfalse_positive:
  
  mn|k=3    |k=4    |k=5    |k=6    |k=7    |k=8    |k=9    |k=10   |k=11
   8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
@@ -63,7 +61,7 @@ pfalse|obj k=4     |cap k=4    |obj k=5  |cap k=5    |obj k=6 |cap k=6
  This eliminates pretty neatly any k>6 as long as we use the raw SHA for
  addressing.
  
-filter size scales linearly with reposize for a given k and pfalse.
+filter size scales linearly with repository size for a given k and pfalse.
  
  Here's a table of filter sizes for a 1 TiB repository:
  
@@ -81,19 +79,20 @@ faulting on the midx doesn't overcome the benefit of the bloom filter.
  * We want to be able to have a single bloom address entire repositories of
  reasonable size.
  
-Based on those parameters, k=4 or k=5 seem to be the most reasonable options.
-k=5 is a bit limited on repository size, but not terrible.  k=4 gives "plenty"
-of repository space, but has 3 times the pfalse positive when the filter is
-relatively empty.  k=5 is trivial to code, so I did that.  It should be pretty
-easy to make the bloom filter adapt when the repository requires more address
-bits than k=5 allows and switch down to k=4.
+Based on these parameters, a combination of k=4 and k=5 provides the behavior
+that bup needs.  As such, I've implemented bloom addressing, adding and
+checking functions in C for these two values.  Because k=5 requires less space
+and gives better overall pfalse_positive performance, it is preferred if a
+table with k=5 can represent the repository.
+
+None of this tells us what max_pfalse_positive to choose.
+
  Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
  """
-BLOOM_VERSION = 1
-MAX_BITS_EACH = 32
-BLOOM_HASHES = 5
-MAX_BLOOM_BITS = 29
-MAX_PFALSE_POSITIVE = 1.
+BLOOM_VERSION = 2
+MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
+MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
+MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
  
  verbose = 0
  ignore_midx = 0
@@ -346,15 +345,15 @@ class ShaBloom:
      and make it possible for bup to expand Git's indexing capabilities to vast
      amounts of files.
      """
-    def __init__(self, filename, readwrite=False):
+    def __init__(self, filename, f=None, readwrite=False):
          self.name = filename
          assert(filename.endswith('.bloom'))
          if readwrite:
-            self.rwfile = open(filename, 'r+b')
+            self.rwfile = f or open(filename, 'r+b')
              self.map = mmap_readwrite(self.rwfile, close=False)
          else:
              self.rwfile = None
-            self.map = mmap_read(open(filename, 'rb'))
+            self.map = mmap_read(f or open(filename, 'rb'))
          if str(self.map[0:4]) != 'BLOM':
              log('Warning: skipping: invalid BLOM header in %r\n' % filename)
              return self._init_failed()
@@ -368,7 +367,7 @@ class ShaBloom:
                  % (ver, filename))
              return self._init_failed()
  
-        self.bits, self.entries = struct.unpack('!II', self.map[8:16])
+        self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
          idxnamestr = str(self.map[16 + 2**self.bits:])
          if idxnamestr:
              self.idxnames = idxnamestr.split('\0')
@@ -406,13 +405,13 @@ class ShaBloom:
      def pfalse_positive(self, additional=0):
          n = self.entries + additional
          m = 8*2**self.bits
-        k = BLOOM_HASHES
+        k = self.k
          return 100*(1-math.exp(-k*float(n)/m))**k
  
      def add_idx(self, ix):
          """Add the object to the filter, return current pfalse_positive."""
          if not self.map: raise Exception, "Cannot add to closed bloom"
-        self.entries += bloom_add(self.map, 16, ix.shatable, self.bits)
+        self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
          self.idxnames.append(os.path.basename(ix.name))
  
      def exists(self, sha):
@@ -420,25 +419,26 @@ class ShaBloom:
          global _total_searches, _total_steps
          _total_searches += 1
          if not self.map: return None
-        found, steps = bloom_contains(self.map, 16, str(sha), self.bits)
+        found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
          _total_steps += steps
          return found
  
      @classmethod
-    def create(cls, name, readwrite=False, expected=100000):
+    def create(cls, name, f=None, readwrite=False, expected=100000, k=None):
          """Create and return a bloom filter for `expected` entries."""
          bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
-        if bits > MAX_BLOOM_BITS:
+        k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
+        if bits > MAX_BLOOM_BITS[k]:
              log('bloom: warning, max bits exceeded, non-optimal\n')
-            bits = MAX_BLOOM_BITS
-        debug1('bloom: using 2^%d bytes for bloom filter\n' % bits)
-        f = open(name, 'wb')
+            bits = MAX_BLOOM_BITS[k]
+        debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
+        f = f or open(name, 'w+b')
          f.write('BLOM')
-        f.write(struct.pack('!III', BLOOM_VERSION, bits, 0))
+        f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
          assert(f.tell() == 16)
          f.write('\0'*2**bits)
-        f.close()
-        return cls(name, readwrite=readwrite)
+        f.seek(0)
+        return cls(name, f=f, readwrite=readwrite)
  
      def __len__(self):
          return self.entries
diff --git a/lib/bup/t/tgit.py b/lib/bup/t/tgit.py

index 9e5fb6000e4cfdee098305460a3f0d259bca5bc3..054ca08557ba8b8969d2b1646400a7553938f829 100644 (file)
--- a/lib/bup/t/tgit.py
+++ b/lib/bup/t/tgit.py
@@ -118,23 +118,33 @@ def test_long_index():
  @wvtest
  def test_bloom():
      hashes = [os.urandom(20) for i in range(100)]
-    b = git.ShaBloom.create('pybuptest.bloom', readwrite=True, expected=100)
      class Idx:
          pass
      ix = Idx()
      ix.name='dummy.idx'
      ix.shatable = ''.join(hashes)
-    b.add_idx(ix)
-    WVPASSLT(b.pfalse_positive(), .1)
-    b.close()
-    b = git.ShaBloom('pybuptest.bloom')
-    all_present = True
-    for h in hashes:
-        all_present &= b.exists(h)
-    WVPASS(all_present)
-    false_positives = 0
-    for h in [os.urandom(20) for i in range(1000)]:
-        if b.exists(h):
-            false_positives += 1
-    WVPASSLT(false_positives, 5)
-    os.unlink('pybuptest.bloom')
+    for k in (4, 5):
+        b = git.ShaBloom.create('pybuptest.bloom', readwrite=True, expected=100, k=k)
+        b.add_idx(ix)
+        WVPASSLT(b.pfalse_positive(), .1)
+        b.close()
+        b = git.ShaBloom('pybuptest.bloom')
+        all_present = True
+        for h in hashes:
+            all_present &= b.exists(h)
+        WVPASS(all_present)
+        false_positives = 0
+        for h in [os.urandom(20) for i in range(1000)]:
+            if b.exists(h):
+                false_positives += 1
+        WVPASSLT(false_positives, 5)
+        os.unlink('pybuptest.bloom')
+
+    tf = tempfile.TemporaryFile()
+    b = git.ShaBloom.create('bup.bloom', f=tf, readwrite=True, expected=100)
+    WVPASSEQ(b.rwfile, tf)
+    WVPASSEQ(b.k, 5)
+# FIXME: commented out because it writes a gigabyte of zeros to disk.
+#    tf = tempfile.TemporaryFile()
+#    b = git.ShaBloom.create('bup.bloom', f=tf, readwrite=True, expected=2**28)
+#    WVPASSEQ(b.k, 4)
author	Brandon Low <lostlogic@lostlogicx.com>
	Mon, 7 Feb 2011 06:06:07 +0000 (22:06 -0800)
committer	Avery Pennarun <apenwarr@gmail.com>
	Mon, 7 Feb 2011 09:31:49 +0000 (01:31 -0800)
Documentation/bup-bloom.md		patch \| blob \| history
cmd/bloom-cmd.py		patch \| blob \| history
lib/bup/_helpers.c		patch \| blob \| history
lib/bup/git.py		patch \| blob \| history
lib/bup/t/tgit.py		patch \| blob \| history