]> arthur.barton.de Git - bup.git/commitdiff
cmd-margin: a command to find out the max bits of overlap between hashes.
authorAvery Pennarun <apenwarr@gmail.com>
Tue, 2 Feb 2010 01:40:30 +0000 (20:40 -0500)
committerAvery Pennarun <apenwarr@gmail.com>
Tue, 2 Feb 2010 01:44:11 +0000 (20:44 -0500)
Run 'bup margin' to go through the list of all the objects in your bup
directory and count the number of overlapping prefix bits between each two
consecutive objects.  That is, fine the longest hash length (in bits) that
*would* have caused an overlap, if sha1 hashes had been that length.

On my system with 111 gigs of packs, I get 44 bits.  Out of a total of 160.
That means I'm still safe from collisions for about 2^116 times over.  Or is
it only the square root of that?  Anyway, it's such a large number that my
brain explodes just thinking about it.

Mark my words: 2^160 ought to be enough for anyone.

Makefile
_hashsplit.c
cmd-margin.py [new file with mode: 0755]
cmd-midx.py
git.py
helpers.py

index ccebfbea0583de735a3fd46229b5b8cde47bcb79..8b3be5520f3ac4d6628cc2697a1b9b5e3ab7cc19 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ endif
 default: all
 
 all: bup-split bup-join bup-save bup-init bup-server bup-index bup-tick \
-       bup-midx bup-fuse bup-ls bup-damage bup-fsck \
+       bup-midx bup-fuse bup-ls bup-damage bup-fsck bup-margin \
        bup memtest randomgen$(EXT) _hashsplit$(SOEXT)
 
 randomgen$(EXT): randomgen.o
index a213d00f9691c41a6f8a8262645dcf49feece7b6..933abe638fb9c358ef94b4750ef976e31a847d2e 100644 (file)
@@ -48,9 +48,37 @@ static PyObject *splitbuf(PyObject *self, PyObject *args)
 }
 
 
+static PyObject *bitmatch(PyObject *self, PyObject *args)
+{
+    unsigned char *buf1 = NULL, *buf2 = NULL;
+    int len1 = 0, len2 = 0;
+    int byte, bit;
+
+    if (!PyArg_ParseTuple(args, "t#t#", &buf1, &len1, &buf2, &len2))
+       return NULL;
+    
+    bit = 0;
+    for (byte = 0; byte < len1 && byte < len2; byte++)
+    {
+       int b1 = buf1[byte], b2 = buf2[byte];
+       if (b1 != b2)
+       {
+           for (bit = 0; bit < 8; bit++)
+               if ( (b1 & (0x80 >> bit)) != (b2 & (0x80 >> bit)) )
+                   break;
+           break;
+       }
+    }
+    
+    return Py_BuildValue("i", byte*8 + bit);
+}
+
+
 static PyMethodDef hashsplit_methods[] = {
     { "splitbuf", splitbuf, METH_VARARGS,
        "Split a list of strings based on a rolling checksum." },
+    { "bitmatch", bitmatch, METH_VARARGS,
+       "Count the number of matching prefix bits between two strings." },
     { NULL, NULL, 0, NULL },  // sentinel
 };
 
diff --git a/cmd-margin.py b/cmd-margin.py
new file mode 100755 (executable)
index 0000000..010a840
--- /dev/null
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+import sys
+import options, git, _hashsplit
+from helpers import *
+
+
+optspec = """
+bup margin
+"""
+o = options.Options('bup margin', optspec)
+(opt, flags, extra) = o.parse(sys.argv[1:])
+
+if extra:
+    log("bup margin: no arguments expected\n")
+    o.usage()
+
+git.check_repo_or_die()
+#git.ignore_midx = 1
+
+mi = git.MultiPackIndex(git.repo('objects/pack'))
+last = '\0'*20
+longmatch = 0
+for i in mi:
+    if i == last:
+        continue
+    pm = _hashsplit.bitmatch(last, i)
+    longmatch = max(longmatch, pm)
+    last = i
+print longmatch
index 68ff7dba3868f63f4246c1bf8a48f72b006c367a..97d01de887be84b4f089549237088c751d2174d9 100755 (executable)
@@ -7,38 +7,13 @@ PAGE_SIZE=4096
 SHA_PER_PAGE=PAGE_SIZE/200.
 
 
-def next(it):
-    try:
-        return it.next()
-    except StopIteration:
-        return None
-    
-    
-def merge(idxlist, total, bits, table):
-    iters = [iter(i) for i in idxlist]
-    iters = [[next(it), it] for it in iters]
+def merge(idxlist, bits, table):
     count = 0
-    iters.sort()
-    while iters:
-        if (count % 10000) == 0:
-            log('\rMerging: %.2f%% (%d/%d)'
-                % (count*100.0/total, count, total))
-        e = iters[0][0]
-        yield e
+    for e in git.idxmerge(idxlist):
         count += 1
         prefix = git.extract_bits(e, bits)
         table[prefix] = count
-        e = iters[0][0] = next(iters[0][1])
-        if not e:
-            iters = iters[1:]
-        else:
-            i = 1
-            while i < len(iters):
-                if iters[i][0] > e:
-                    break
-                i += 1
-            iters = iters[1:i] + [iters[0]] + iters[i:]
-    log('\rMerging: done.                                    \n')
+        yield e
 
 
 def do_midx(outdir, outfilename, infilenames):
@@ -76,7 +51,7 @@ def do_midx(outdir, outfilename, infilenames):
     assert(f.tell() == 12)
     f.write('\0'*8*entries)
     
-    for e in merge(inp, total, bits, table):
+    for e in merge(inp, bits, table):
         f.write(e)
         
     f.write('\0'.join([os.path.basename(p) for p in infilenames]))
diff --git a/git.py b/git.py
index 0f05248ce54d7c915ced76a2e65022c3187699ca..d7d74662bc6cc2486a6e61f420c9f031f5f4c47a 100644 (file)
--- a/git.py
+++ b/git.py
@@ -207,6 +207,9 @@ class MultiPackIndex:
         _mpi_count -= 1
         assert(_mpi_count == 0)
 
+    def __iter__(self):
+        return iter(idxmerge(self.packs))
+
     def exists(self, hash):
         if hash in self.also:
             return True
@@ -267,6 +270,32 @@ def _shalist_sort_key(ent):
         return name
 
 
+def idxmerge(idxlist):
+    total = sum([len(i) for i in idxlist])
+    iters = [iter(i) for i in idxlist]
+    iters = [[next(it), it] for it in iters]
+    count = 0
+    iters.sort()
+    while iters:
+        if (count % 10000) == 0:
+            log('Merging: %.2f%% (%d/%d)\r'
+                % (count*100.0/total, count, total))
+        e = iters[0][0]
+        yield e
+        count += 1
+        e = iters[0][0] = next(iters[0][1])
+        if not e:
+            iters = iters[1:]
+        else:
+            i = 1
+            while i < len(iters):
+                if iters[i][0] > e:
+                    break
+                i += 1
+            iters = iters[1:i] + [iters[0]] + iters[i:]
+    log('Merging: %.2f%% (%d/%d), done.\n' % (100, total, total))
+
+    
 class PackWriter:
     def __init__(self, objcache_maker=None):
         self.count = 0
index e6f0772f57749bfdca5553eb66597fd85e8a48d9..c7dfd5f1892f8326722159156b8fac413d98bb77 100644 (file)
@@ -15,6 +15,13 @@ def mkdirp(d):
             raise
 
 
+def next(it):
+    try:
+        return it.next()
+    except StopIteration:
+        return None
+    
+    
 def readpipe(argv):
     p = subprocess.Popen(argv, stdout=subprocess.PIPE)
     r = p.stdout.read()