hashsplit: totally change the way the fanout stuff works.

author Avery Pennarun <apenwarr@gmail.com>

Fri, 12 Feb 2010 04:50:39 +0000 (23:50 -0500)

committer Avery Pennarun <apenwarr@gmail.com>

Fri, 12 Feb 2010 05:08:58 +0000 (00:08 -0500)
author Avery Pennarun <apenwarr@gmail.com>
Fri, 12 Feb 2010 04:50:39 +0000 (23:50 -0500)
committer Avery Pennarun <apenwarr@gmail.com>
Fri, 12 Feb 2010 05:08:58 +0000 (00:08 -0500)
diff --git a/_hashsplit.c b/_hashsplit.c

index e93eff20cd20985e8dc7334447eeac4052316160..a731454d9665eb7f3d30c1fa25dbd6244e6213a8 100644 (file)
--- a/_hashsplit.c
+++ b/_hashsplit.c
@@ -16,7 +16,7 @@ static uint32_t stupidsum_add(uint32_t old, uint8_t drop, uint8_t add)
  }
  
  
-static int find_ofs(const unsigned char *buf, int len)
+static int find_ofs(const unsigned char *buf, int len, int *bits)
  {
      unsigned char window[WINDOWSIZE];
      uint32_t sum = 0;
@@ -29,22 +29,37 @@ static int find_ofs(const unsigned char *buf, int len)
         window[i] = buf[count];
         i = (i + 1) % WINDOWSIZE;
         if ((sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)))
+       {
+           if (bits)
+           {
+               *bits = BLOBBITS;
+               for (*bits = BLOBBITS; (sum >> *bits) & 1; (*bits)++)
+                   ;
+           }
             return count+1;
+       }
      }
      return 0;
  }
  
  
+static PyObject *blobbits(PyObject *self, PyObject *args)
+{
+    if (!PyArg_ParseTuple(args, ""))
+       return NULL;
+    return Py_BuildValue("i", BLOBBITS);
+}
+
+
  static PyObject *splitbuf(PyObject *self, PyObject *args)
  {
      unsigned char *buf = NULL;
-    int len = 0, out = 0;
+    int len = 0, out = 0, bits = -1;
  
      if (!PyArg_ParseTuple(args, "t#", &buf, &len))
         return NULL;
-    out = find_ofs(buf, len);
-    //return Py_BuildValue("i", len);//len>BLOBSIZE ? BLOBSIZE : len);
-    return Py_BuildValue("i", out);
+    out = find_ofs(buf, len, &bits);
+    return Py_BuildValue("ii", out, bits);
  }
  
  
@@ -112,6 +127,8 @@ static PyObject *write_random(PyObject *self, PyObject *args)
  
  
  static PyMethodDef hashsplit_methods[] = {
+    { "blobbits", blobbits, METH_VARARGS,
+       "Return the number of bits in the rolling checksum." },
      { "splitbuf", splitbuf, METH_VARARGS,
         "Split a list of strings based on a rolling checksum." },
      { "bitmatch", bitmatch, METH_VARARGS,
diff --git a/cmd-split.py b/cmd-split.py

index 21c0ed3b700af5bcd6b9585b8f969598c1c40253..8b763910c6f9372794e8d4d0ed91d6ac8284439a 100755 (executable)
--- a/cmd-split.py
+++ b/cmd-split.py
@@ -16,6 +16,7 @@ n,name=    name of backup set to update (if any)
  N,noop     don't actually save the data anywhere
  q,quiet    don't print progress messages
  v,verbose  increase log output (can be used more than once)
+copy       just copy input to output, hashsplitting along the way
  bench      print benchmark timings to stderr
  max-pack-size=  maximum bytes in a single pack
  max-pack-objects=  maximum number of objects in a single pack
@@ -25,11 +26,13 @@ o = options.Options('bup split', optspec)
  (opt, flags, extra) = o.parse(sys.argv[1:])
  
  git.check_repo_or_die()
-if not (opt.blobs or opt.tree or opt.commit or opt.name or opt.noop):
-    log("bup split: use one or more of -b, -t, -c, -n\n")
+if not (opt.blobs or opt.tree or opt.commit or opt.name or
+        opt.noop or opt.copy):
+    log("bup split: use one or more of -b, -t, -c, -n, -N, --copy\n")
      o.usage()
-if opt.noop and (opt.blobs or opt.tree or opt.commit or opt.name):
-    log('bup split: -N is incompabile with -b, -t, -c, -n\n')
+if (opt.noop or opt.copy) and (opt.blobs or opt.tree or 
+                               opt.commit or opt.name):
+    log('bup split: -N is incompatible with -b, -t, -c, -n\n')
      o.usage()
  
  if opt.verbose >= 2:
@@ -47,7 +50,7 @@ if opt.blobs:
  start_time = time.time()
  
  refname = opt.name and 'refs/heads/%s' % opt.name or None
-if opt.noop:
+if opt.noop or opt.copy:
      cli = w = oldref = None
  elif opt.remote:
      cli = client.Client(opt.remote)
@@ -64,8 +67,10 @@ if w:
      tree = w.new_tree(shalist)
  else:
      last = 0
-    for blob in hashsplit.hashsplit_iter(files):
+    for (blob, bits) in hashsplit.hashsplit_iter(files):
          hashsplit.total_split += len(blob)
+        if opt.copy:
+            sys.stdout.write(str(blob))
          megs = hashsplit.total_split/1024/1024
          if not opt.quiet and last != megs:
              progress('%d Mbytes read\r' % megs)
diff --git a/hashsplit.py b/hashsplit.py

index 8fe5771a68001c7d0e79182a8063132486753e11..ca5682f60dc95ecdbb409a3ba49218b1527268fa 100644 (file)
--- a/hashsplit.py
+++ b/hashsplit.py
@@ -1,14 +1,15 @@
-import sys
+import sys, math
  import git, _hashsplit
  from helpers import *
  
  BLOB_LWM = 8192*2
  BLOB_MAX = BLOB_LWM*2
  BLOB_HWM = 1024*1024
+MAX_PER_TREE = 256
  progress_callback = None
  max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
  max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
-fanout = 4096
+fanout = 16
  
  class Buf:
      def __init__(self):
@@ -16,7 +17,6 @@ class Buf:
          self.start = 0
  
      def put(self, s):
-        #log('oldsize=%d+%d adding=%d\n' % (len(self.data), self.start, len(s)))
          if s:
              self.data = buffer(self.data, self.start) + s
              self.start = 0
@@ -38,11 +38,11 @@ class Buf:
  
  def splitbuf(buf):
      b = buf.peek(buf.used())
-    ofs = _hashsplit.splitbuf(b)
+    (ofs, bits) = _hashsplit.splitbuf(b)
      if ofs:
          buf.eat(ofs)
-        return buffer(b, 0, ofs)
-    return None
+        return (buffer(b, 0, ofs), bits)
+    return (None, 0)
  
  
  def blobiter(files):
@@ -59,9 +59,9 @@ def hashsplit_iter(files):
      buf = Buf()
      fi = blobiter(files)
      while 1:
-        blob = splitbuf(buf)
+        (blob, bits) = splitbuf(buf)
          if blob:
-            yield blob
+            yield (blob, bits)
          else:
              if buf.used() >= BLOB_MAX:
                  # limit max blob size
@@ -71,40 +71,72 @@ def hashsplit_iter(files):
                  if not bnew:
                      # eof
                      if buf.used():
-                        yield buf.get(buf.used())
+                        yield (buf.get(buf.used()), 0)
                      return
                  buf.put(bnew)
  
  
  total_split = 0
-def _split_to_shalist(w, files):
+def _split_to_blobs(w, files):
      global total_split
-    ofs = 0
-    for blob in hashsplit_iter(files):
+    for (blob, bits) in hashsplit_iter(files):
          sha = w.new_blob(blob)
          total_split += len(blob)
          if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
              w.breakpoint()
          if progress_callback:
              progress_callback(len(blob))
-        yield ('100644', '%016x' % ofs, sha)
-        ofs += len(blob)
+        yield (sha, len(blob), bits)
+
+
+def _make_shalist(l):
+    ofs = 0
+    shalist = []
+    for (mode, sha, size) in l:
+        shalist.append((mode, '%016x' % ofs, sha))
+        ofs += size
+    total = ofs
+    return (shalist, total)
+
+
+def _squish(w, stacks, n):
+    i = 0
+    while i<n or len(stacks[i]) > MAX_PER_TREE:
+        while len(stacks) <= i+1:
+            stacks.append([])
+        if len(stacks[i]) == 1:
+            stacks[i+1] += stacks[i]
+        elif stacks[i]:
+            (shalist, size) = _make_shalist(stacks[i])
+            tree = w.new_tree(shalist)
+            stacks[i+1].append(('40000', tree, size))
+        stacks[i] = []
+        i += 1
  
  
  def split_to_shalist(w, files):
-    sl = _split_to_shalist(w, files)
+    sl = _split_to_blobs(w, files)
      if not fanout:
-        shalist = list(sl)
+        shal = []
+        for (sha,size,bits) in sl:
+            shal.append(('100644', sha, size))
+        return _make_shalist(shal)[0]
      else:
-        shalist = []
-        tmplist = []
-        for e in sl:
-            tmplist.append(e)
-            if len(tmplist) >= fanout and len(tmplist) >= 3:
-                shalist.append(('40000', tmplist[0][1], w.new_tree(tmplist)))
-                tmplist = []
-        shalist += tmplist
-    return shalist
+        base_bits = _hashsplit.blobbits()
+        fanout_bits = int(math.log(fanout, 2))
+        def bits_to_idx(n):
+            assert(n >= base_bits)
+            return (n - base_bits)/fanout_bits
+        stacks = [[]]
+        for (sha,size,bits) in sl:
+            assert(bits <= 32)
+            stacks[0].append(('100644', sha, size))
+            if bits > base_bits:
+                _squish(w, stacks, bits_to_idx(bits))
+        #log('stacks: %r\n' % [len(i) for i in stacks])
+        _squish(w, stacks, len(stacks)-1)
+        #log('stacks: %r\n' % [len(i) for i in stacks])
+        return _make_shalist(stacks[-1])[0]
  
  
  def split_to_blob_or_tree(w, files):
diff --git a/t/test.sh b/t/test.sh

index 40230a550b0b56a692dda9ac5c3698ec63684b1e..f1ba8ca41f0240d7536ebbf9011ec6c083648d09 100755 (executable)
--- a/t/test.sh
+++ b/t/test.sh
@@ -106,7 +106,6 @@ WVPASS bup margin
  WVPASS bup midx -f
  WVPASS bup margin
  WVPASS bup split -t t/testfile2 >tags2t.tmp
-WVPASS bup split -t t/testfile2 --fanout 3 >tags2tf.tmp
  WVPASS bup split -r "$BUP_DIR" -c t/testfile2 >tags2c.tmp
  WVPASS ls -lR \
     | WVPASS bup split -r "$BUP_DIR" -c --fanout 3 --max-pack-objects 3 -n lslr
@@ -116,8 +115,6 @@ WVPASS bup ls /lslr
  WVPASS bup ls /lslr/1971-01-01   # all dates always exist
  WVFAIL diff -u tags1.tmp tags2.tmp
  
-# fanout must be different from non-fanout
-WVFAIL diff -q tags2t.tmp tags2tf.tmp
  wc -c t/testfile1 t/testfile2
  wc -l tags1.tmp tags2.tmp
author	Avery Pennarun <apenwarr@gmail.com>
	Fri, 12 Feb 2010 04:50:39 +0000 (23:50 -0500)
committer	Avery Pennarun <apenwarr@gmail.com>
	Fri, 12 Feb 2010 05:08:58 +0000 (00:08 -0500)
_hashsplit.c		patch | blob | history
cmd-split.py		patch | blob | history
hashsplit.py		patch | blob | history
t/test.sh		patch | blob | history