Useless code churn or genius innovation? You decide.
The previous system for naming chunks of a split file was kind of lame. We
tried to name the files something that was "almost" their offset, so that
filenames wouldn't shuffle around too much if a few bytes were added/deleted
here and there. But that totally failed to work if a *lot* of bytes were
added, and it also lost the useful feature that you could seek to a specific
point in a file (like a VM image) without restoring the whole thing.
"Approximate" offsets aren't much good for seeking to.
The new system is even crazier than the original hashsplit: we now use
the "extra bits" of the rolling checksum to define progressively larger
chunks. For example, we might define a normal chunk if the checksum ends in
0xFFF (12 bits). Now we can group multiple chunks together when the
checksum ends in 0xFFFF (16 bits). Because of the way the checksum works,
this happens about every 2^4 = 16 chunks. Similarly, 0xFFFFF (20 bits) will
happen 16 times less often than that, and so on. We can use this effect to
define a tree.
Then, in each branch of the tree, we name files based on their (exact, not
approximate) offset *from the start of that tree*.
Essentially, inserting/deleting/changing bytes will affect more "levels" of
the rolling checksum, mangling bigger and bigger branches of the overall
tree and causing those branches to change. However, only the content of
that sub-branch (and the *names*, i.e. offsets, of the following branches at
that and further-up levels) end up getting changed, so the effect can be
mostly localized. The subtrees of those renamed trees are *not* affected,
because all their offsets are relative to the start of their own tree. This
means *most* of the sha1sums in the resulting hierarchy don't need to
change, no matter how much data you add/insert/delete.
Anyway, the net result is that "git diff -M" now actually does something
halfway sensible when comparing the trees corresponding to huge split files.
Only halfway (because the chunk boundaries can move around a bit, and such
large files are usually binary anyway) but it opens the way for much cooler
algorithms in the future.
Also, it'll now be possible to make 'bup fuse' open files without restoring
the entire thing to a temp file first. That means restoring (or even
*using*) snapshotted VMs ought to become possible.
}
-static int find_ofs(const unsigned char *buf, int len)
+/* Scan buf[0..len) with the rolling checksum and return the offset just
+ * past the first split boundary, or 0 if no boundary occurs.  If bits is
+ * non-NULL, *bits receives how many trailing one-bits the checksum had at
+ * the boundary (always >= BLOBBITS); callers use it to pick the fanout
+ * tree level for this chunk. */
+static int find_ofs(const unsigned char *buf, int len, int *bits)
{
unsigned char window[WINDOWSIZE];
uint32_t sum = 0;
window[i] = buf[count];
i = (i + 1) % WINDOWSIZE;
if ((sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)))
+ {
+ if (bits)
+ {
+ /* Count the run of 1-bits above the BLOBBITS that just matched.
+  * NOTE(review): the assignment below is redundant -- the for-loop
+  * initializer sets *bits to BLOBBITS again; one of the two could go. */
+ *bits = BLOBBITS;
+ for (*bits = BLOBBITS; (sum >> *bits) & 1; (*bits)++)
+ ;
+ }
return count+1;
+ }
}
return 0;
}
+static PyObject *blobbits(PyObject *self, PyObject *args)
+{
+ if (!PyArg_ParseTuple(args, ""))
+ return NULL;
+ return Py_BuildValue("i", BLOBBITS);
+}
+
+
/* splitbuf(str) -> (ofs, bits)
 *
 * Find the first split point in the given buffer.  ofs is the number of
 * bytes up to and including the boundary, or 0 if no boundary was found;
 * bits is the trailing one-bit count reported by find_ofs (-1 when no
 * boundary was seen).
 */
static PyObject *splitbuf(PyObject *self, PyObject *args)
{
    unsigned char *buf = NULL;
    int len = 0;
    int out = 0;
    int bits = -1;

    if (!PyArg_ParseTuple(args, "t#", &buf, &len))
	return NULL;
    out = find_ofs(buf, len, &bits);
    return Py_BuildValue("ii", out, bits);
}
static PyMethodDef hashsplit_methods[] = {
+ { "blobbits", blobbits, METH_VARARGS,
+ "Return the number of bits in the rolling checksum." },
{ "splitbuf", splitbuf, METH_VARARGS,
"Split a list of strings based on a rolling checksum." },
{ "bitmatch", bitmatch, METH_VARARGS,
N,noop don't actually save the data anywhere
q,quiet don't print progress messages
v,verbose increase log output (can be used more than once)
+copy just copy input to output, hashsplitting along the way
bench print benchmark timings to stderr
max-pack-size= maximum bytes in a single pack
max-pack-objects= maximum number of objects in a single pack
(opt, flags, extra) = o.parse(sys.argv[1:])
git.check_repo_or_die()
-if not (opt.blobs or opt.tree or opt.commit or opt.name or opt.noop):
- log("bup split: use one or more of -b, -t, -c, -n\n")
+if not (opt.blobs or opt.tree or opt.commit or opt.name or
+ opt.noop or opt.copy):
+ log("bup split: use one or more of -b, -t, -c, -n, -N, --copy\n")
o.usage()
-if opt.noop and (opt.blobs or opt.tree or opt.commit or opt.name):
- log('bup split: -N is incompabile with -b, -t, -c, -n\n')
+if (opt.noop or opt.copy) and (opt.blobs or opt.tree or
+ opt.commit or opt.name):
+ log('bup split: -N is incompatible with -b, -t, -c, -n\n')
o.usage()
if opt.verbose >= 2:
start_time = time.time()
refname = opt.name and 'refs/heads/%s' % opt.name or None
-if opt.noop:
+if opt.noop or opt.copy:
cli = w = oldref = None
elif opt.remote:
cli = client.Client(opt.remote)
tree = w.new_tree(shalist)
else:
last = 0
- for blob in hashsplit.hashsplit_iter(files):
+ for (blob, bits) in hashsplit.hashsplit_iter(files):
hashsplit.total_split += len(blob)
+ if opt.copy:
+ sys.stdout.write(str(blob))
megs = hashsplit.total_split/1024/1024
if not opt.quiet and last != megs:
progress('%d Mbytes read\r' % megs)
-import sys
+import sys, math
import git, _hashsplit
from helpers import *
BLOB_LWM = 8192*2
BLOB_MAX = BLOB_LWM*2
BLOB_HWM = 1024*1024
+MAX_PER_TREE = 256
progress_callback = None
max_pack_size = 1000*1000*1000 # larger packs will slow down pruning
max_pack_objects = 200*1000 # cache memory usage is about 83 bytes per object
-fanout = 4096
+fanout = 16
class Buf:
def __init__(self):
self.start = 0
def put(self, s):
- #log('oldsize=%d+%d adding=%d\n' % (len(self.data), self.start, len(s)))
if s:
self.data = buffer(self.data, self.start) + s
self.start = 0
def splitbuf(buf):
    """Find the first split point in buf.

    Returns (blob, bits): blob is a buffer holding the bytes up to and
    including the split boundary (and those bytes are consumed from buf),
    bits is the number of trailing checksum one-bits at the boundary.
    Returns (None, 0) when no boundary is present yet.
    """
    data = buf.peek(buf.used())
    (ofs, bits) = _hashsplit.splitbuf(data)
    if not ofs:
        return (None, 0)
    buf.eat(ofs)
    return (buffer(data, 0, ofs), bits)
def blobiter(files):
buf = Buf()
fi = blobiter(files)
while 1:
- blob = splitbuf(buf)
+ (blob, bits) = splitbuf(buf)
if blob:
- yield blob
+ yield (blob, bits)
else:
if buf.used() >= BLOB_MAX:
# limit max blob size
if not bnew:
# eof
if buf.used():
- yield buf.get(buf.used())
+ yield (buf.get(buf.used()), 0)
return
buf.put(bnew)
total_split = 0
def _split_to_blobs(w, files):
    """Hashsplit files into blobs, writing each blob via packwriter w.

    Generator yielding (sha, size, bits) per blob, where bits is the
    trailing checksum one-bit count reported by the splitter.
    """
    global total_split
    for (blob, bits) in hashsplit_iter(files):
        sha = w.new_blob(blob)
        nbytes = len(blob)
        total_split += nbytes
        # Don't let any single pack grow without bound.
        if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
            w.breakpoint()
        if progress_callback:
            progress_callback(nbytes)
        yield (sha, nbytes, bits)
+
+
+def _make_shalist(l):
+ ofs = 0
+ shalist = []
+ for (mode, sha, size) in l:
+ shalist.append((mode, '%016x' % ofs, sha))
+ ofs += size
+ total = ofs
+ return (shalist, total)
+
+
def _squish(w, stacks, n):
    """Collapse finished fanout levels into trees.

    Folds every stack level below n -- plus any level that has grown past
    MAX_PER_TREE -- up into its parent level: a single entry is hoisted
    as-is (no point wrapping it in its own tree), while multiple entries
    become one new git tree written via w.
    """
    i = 0
    while i < n or len(stacks[i]) > MAX_PER_TREE:
        # Make sure the parent level exists before pushing into it.
        while len(stacks) <= i+1:
            stacks.append([])
        level = stacks[i]
        if len(level) == 1:
            stacks[i+1] += level
        elif level:
            (shalist, size) = _make_shalist(level)
            stacks[i+1].append(('40000', w.new_tree(shalist), size))
        stacks[i] = []
        i += 1
def split_to_shalist(w, files):
    """Hashsplit files and return the shalist for the top-level git tree.

    With fanout disabled, every blob lands in one flat tree.  Otherwise
    blobs are arranged into a tree of subtrees: each extra fanout_bits of
    trailing one-bits in a blob's rolling checksum closes one more level
    (see _squish), and entries within each subtree are named by their
    exact offset from the start of that subtree (see _make_shalist).
    """
    blobs = _split_to_blobs(w, files)
    if not fanout:
        flat = [('100644', sha, size) for (sha, size, bits) in blobs]
        return _make_shalist(flat)[0]
    base_bits = _hashsplit.blobbits()
    fanout_bits = int(math.log(fanout, 2))
    def bits_to_idx(n):
        # Map a trailing-one-bit count onto a stack level.
        assert(n >= base_bits)
        return (n - base_bits)/fanout_bits
    stacks = [[]]
    for (sha, size, bits) in blobs:
        assert(bits <= 32)
        stacks[0].append(('100644', sha, size))
        if bits > base_bits:
            _squish(w, stacks, bits_to_idx(bits))
    # Flush everything remaining up into the single top level.
    _squish(w, stacks, len(stacks)-1)
    return _make_shalist(stacks[-1])[0]
def split_to_blob_or_tree(w, files):
WVPASS bup midx -f
WVPASS bup margin
WVPASS bup split -t t/testfile2 >tags2t.tmp
-WVPASS bup split -t t/testfile2 --fanout 3 >tags2tf.tmp
WVPASS bup split -r "$BUP_DIR" -c t/testfile2 >tags2c.tmp
WVPASS ls -lR \
| WVPASS bup split -r "$BUP_DIR" -c --fanout 3 --max-pack-objects 3 -n lslr
WVPASS bup ls /lslr/1971-01-01 # all dates always exist
WVFAIL diff -u tags1.tmp tags2.tmp
-# fanout must be different from non-fanout
-WVFAIL diff -q tags2t.tmp tags2tf.tmp
wc -c t/testfile1 t/testfile2
wc -l tags1.tmp tags2.tmp