From: Avery Pennarun <apenwarr@gmail.com>
Date: Wed, 22 Sep 2010 13:02:32 +0000 (-0700)
Subject: cmd/split: add a --git-ids option.
X-Git-Tag: bup-0.20~12
X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?p=bup.git;a=commitdiff_plain;h=5570b95f0519dfa7c2ce78413131d4433738cc8f

cmd/split: add a --git-ids option.

This lets you provide a list of git object ids on stdin instead of the raw
content.  bup-split then uses a CatPipe to retrieve the objects from git and
hashsplit them.  You could use this as a helper for converting a git repo
that contains a bunch of large files into one that uses bup-style hashsplit
files.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---

diff --git a/Documentation/bup-split.md b/Documentation/bup-split.md
index 0ab5d09..43fccc5 100644
--- a/Documentation/bup-split.md
+++ b/Documentation/bup-split.md
@@ -11,7 +11,7 @@ bup-split - save individual files to bup backup sets
 bup split [-r *host*:*path*] <-b|-t|-c|-n *name*> [-v] [-q]
   [--bench] [--max-pack-size=*bytes*]
   [--max-pack-objects=*n*] [--fanout=*count]
-  [--keep-boundaries] [filenames...]
+  [--git-ids] [--keep-boundaries] [filenames...]
 
 # DESCRIPTION
 
@@ -73,6 +73,15 @@ To get the data back, use `bup-join`(1).
 -v, --verbose
 :   increase verbosity (can be used more than once).
 
+--git-ids
+:   stdin is a list of git object ids instead of raw data.
+    `bup split` will read the contents of each named git
+    object (if it exists in the bup repository) and split
+    it.  This might be useful for converting a git
+    repository with large binary files to use bup-style
+    hashsplitting instead.  This option is probably most
+    useful when combined with `--keep-boundaries`.
+
 --keep-boundaries
 :   if multiple filenames are given on the command line,
     they are normally concatenated together as if the
diff --git a/cmd/split-cmd.py b/cmd/split-cmd.py
index 44c3cdc..2a72bd6 100755
--- a/cmd/split-cmd.py
+++ b/cmd/split-cmd.py
@@ -15,6 +15,7 @@ n,name=    name of backup set to update (if any)
 d,date=    date for the commit (seconds since the epoch)
 q,quiet    don't print progress messages
 v,verbose  increase log output (can be used more than once)
+git-ids    read a list of git object ids from stdin and split their contents
 keep-boundaries  don't let one chunk span two input files
 noop       don't actually save the data anywhere
 copy       just copy input to output, hashsplitting along the way
@@ -35,6 +36,8 @@ if not (opt.blobs or opt.tree or opt.commit or opt.name or
 if (opt.noop or opt.copy) and (opt.blobs or opt.tree or 
                                opt.commit or opt.name):
     o.fatal('-N and --copy are incompatible with -b, -t, -c, -n')
+if extra and opt.git_ids:
+    o.fatal("don't provide filenames when using --git-ids")
 
 if opt.verbose >= 2:
     git.verbose = opt.verbose - 1
@@ -77,7 +80,41 @@ else:
     oldref = refname and git.read_ref(refname) or None
     pack_writer = git.PackWriter()
 
-files = extra and (open(fn) for fn in extra) or [sys.stdin]
+if opt.git_ids:
+    # the input is actually a series of git object ids that we should retrieve
+    # and split.
+    #
+    # This is a bit messy, but basically it converts from a series of
+    # CatPipe.get() iterators into a series of file-type objects.
+    # It would be less ugly if either CatPipe.get() returned a file-like object
+    # (not very efficient), or split_to_shalist() expected an iterator instead
+    # of a file.
+    cp = git.CatPipe()
+    class IterToFile:  # minimal file-like wrapper exposing only read()
+        def __init__(self, it):
+            self.it = iter(it)
+        def read(self, size):  # size is ignored; one iterator item per call
+            v = next(self.it)
+            return v or ''  # any falsy item is reported to the reader as ''
+    def read_ids():
+        while 1:
+            line = sys.stdin.readline()
+            if not line:
+                break
+            if line:
+                line = line.strip()
+            try:
+                it = cp.get(line.strip())
+                next(it)  # skip the file type
+            except KeyError, e:
+                add_error('error: %s' % e)
+                continue
+            yield IterToFile(it)
+    files = read_ids()
+else:
+    # the input either comes from a series of files or from stdin.
+    files = extra and (open(fn) for fn in extra) or [sys.stdin]
+
 if pack_writer:
     shalist = hashsplit.split_to_shalist(pack_writer, files,
                                          keep_boundaries=opt.keep_boundaries)
@@ -126,3 +163,7 @@ size = hashsplit.total_split
 if opt.bench:
     log('\nbup: %.2fkbytes in %.2f secs = %.2f kbytes/sec\n'
         % (size/1024., secs, size/1024./secs))
+
+if saved_errors:
+    log('WARNING: %d errors encountered while saving.\n' % len(saved_errors))
+    sys.exit(1)
diff --git a/lib/bup/git.py b/lib/bup/git.py
index 66370ca..75d6443 100644
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -870,11 +870,12 @@ class CatPipe:
         assert(not self.inprogress)
         assert(id.find('\n') < 0)
         assert(id.find('\r') < 0)
-        assert(id[0] != '-')
+        assert(not id.startswith('-'))
         self.inprogress = id
         self.p.stdin.write('%s\n' % id)
         hdr = self.p.stdout.readline()
         if hdr.endswith(' missing\n'):
+            self.inprogress = None
             raise KeyError('blob %r is missing' % id)
         spl = hdr.split(' ')
         if len(spl) != 3 or len(spl[0]) != 40:
diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py
index b9896a0..d73e9f6 100644
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -189,5 +189,5 @@ def open_noatime(name):
 
 def fadvise_done(f, ofs):
     assert(ofs >= 0)
-    if ofs > 0:
+    if ofs > 0 and hasattr(f, 'fileno'):
         _helpers.fadvise_done(f.fileno(), ofs)
diff --git a/t/test.sh b/t/test.sh
index 1f2f916..02590aa 100755
--- a/t/test.sh
+++ b/t/test.sh
@@ -124,8 +124,19 @@ WVFAIL bup save -r :$BUP_DIR -n r-test $D/fake/path
 WVSTART "split"
 echo a >a.tmp
 echo b >b.tmp
-WVPASSEQ $(bup split -b a.tmp b.tmp | wc -l) 1
-WVPASSEQ $(bup split -b --keep-boundaries a.tmp b.tmp | wc -l) 2
+WVPASS bup split -b a.tmp >taga.tmp
+WVPASS bup split -b b.tmp >tagb.tmp
+cat a.tmp b.tmp | WVPASS bup split -b >tagab.tmp
+WVPASSEQ "$(cat taga.tmp | wc -l)" 1
+WVPASSEQ "$(cat tagb.tmp | wc -l)" 1
+WVPASSEQ "$(cat tagab.tmp | wc -l)" 1
+WVPASSEQ "$(cat tag[ab].tmp | wc -l)" 2
+WVPASSEQ "$(bup split -b a.tmp b.tmp)" "$(cat tagab.tmp)"
+WVPASSEQ "$(bup split -b --keep-boundaries a.tmp b.tmp)" "$(cat tag[ab].tmp)"
+WVPASSEQ "$(cat tag[ab].tmp | bup split -b --keep-boundaries --git-ids)" \
+         "$(cat tag[ab].tmp)"
+WVPASSEQ "$(cat tag[ab].tmp | bup split -b --git-ids)" \
+         "$(cat tagab.tmp)"
 WVPASS bup split --bench -b <t/testfile1 >tags1.tmp
 WVPASS bup split -vvvv -b t/testfile2 >tags2.tmp
 WVPASS bup margin