cmd/split: add a --git-ids option.

author Avery Pennarun <apenwarr@gmail.com>

Wed, 22 Sep 2010 13:02:32 +0000 (06:02 -0700)

committer Avery Pennarun <apenwarr@gmail.com>

Wed, 22 Sep 2010 13:02:32 +0000 (06:02 -0700)
author Avery Pennarun <apenwarr@gmail.com>
Wed, 22 Sep 2010 13:02:32 +0000 (06:02 -0700)
committer Avery Pennarun <apenwarr@gmail.com>
Wed, 22 Sep 2010 13:02:32 +0000 (06:02 -0700)
diff --git a/Documentation/bup-split.md b/Documentation/bup-split.md

index 0ab5d091ce7d76bd049ead974b7da10f3cbc48ba..43fccc5ee67de064bc1355a24ff90672dbf90bcf 100644 (file)
--- a/Documentation/bup-split.md
+++ b/Documentation/bup-split.md
@@ -11,7 +11,7 @@ bup-split - save individual files to bup backup sets
  bup split [-r *host*:*path*] <-b|-t|-c|-n *name*> [-v] [-q]
    [--bench] [--max-pack-size=*bytes*]
    [--max-pack-objects=*n*] [--fanout=*count]
-  [--keep-boundaries] [filenames...]
+  [--git-ids] [--keep-boundaries] [filenames...]
  
  # DESCRIPTION
  
@@ -73,6 +73,15 @@ To get the data back, use `bup-join`(1).
  -v, --verbose
  :   increase verbosity (can be used more than once).
  
+--git-ids
+:   stdin is a list of git object ids instead of raw data.
+    `bup split` will read the contents of each named git
+    object (if it exists in the bup repository) and split
+    it.  This might be useful for converting a git
+    repository with large binary files to use bup-style
+    hashsplitting instead.  This option is probably most
+    useful when combined with `--keep-boundaries`.
+
  --keep-boundaries
  :   if multiple filenames are given on the command line,
      they are normally concatenated together as if the
diff --git a/cmd/split-cmd.py b/cmd/split-cmd.py

index 44c3cdc0b960b8e7b6589c05f1ddb0d1bc72480c..2a72bd673fb912d47b89f8713578342c0d5849b2 100755 (executable)
--- a/cmd/split-cmd.py
+++ b/cmd/split-cmd.py
@@ -15,6 +15,7 @@ n,name=    name of backup set to update (if any)
  d,date=    date for the commit (seconds since the epoch)
  q,quiet    don't print progress messages
  v,verbose  increase log output (can be used more than once)
+git-ids    read a list of git object ids from stdin and split their contents
  keep-boundaries  don't let one chunk span two input files
  noop       don't actually save the data anywhere
  copy       just copy input to output, hashsplitting along the way
@@ -35,6 +36,8 @@ if not (opt.blobs or opt.tree or opt.commit or opt.name or
  if (opt.noop or opt.copy) and (opt.blobs or opt.tree or 
                                 opt.commit or opt.name):
      o.fatal('-N and --copy are incompatible with -b, -t, -c, -n')
+if extra and opt.git_ids:
+    o.fatal("don't provide filenames when using --git-ids")
  
  if opt.verbose >= 2:
      git.verbose = opt.verbose - 1
@@ -77,7 +80,41 @@ else:
      oldref = refname and git.read_ref(refname) or None
      pack_writer = git.PackWriter()
  
-files = extra and (open(fn) for fn in extra) or [sys.stdin]
+if opt.git_ids:
+    # the input is actually a series of git object ids that we should retrieve
+    # and split.
+    #
+    # This is a bit messy, but basically it converts from a series of
+    # CatPipe.get() iterators into a series of file-like objects.
+    # It would be less ugly if either CatPipe.get() returned a file-like object
+    # (not very efficient), or split_to_shalist() expected an iterator instead
+    # of a file.
+    cp = git.CatPipe()
+    class IterToFile:
+        def __init__(self, it):
+            self.it = iter(it)
+        def read(self, size):  # size is ignored: one iterator item per call
+            v = next(self.it)
+            return v or ''  # a falsy item is reported as '' (i.e. EOF)
+    def read_ids():
+        while 1:
+            line = sys.stdin.readline()
+            if not line:
+                break
+            # one object id per line; drop the newline and stray whitespace
+            objid = line.strip()
+            try:
+                it = cp.get(objid)
+                next(it)  # skip the file type
+            except KeyError, e:
+                add_error('error: %s' % e)
+                continue
+            yield IterToFile(it)
+    files = read_ids()
+else:
+    # the input either comes from a series of files or from stdin.
+    files = extra and (open(fn) for fn in extra) or [sys.stdin]
+
  if pack_writer:
      shalist = hashsplit.split_to_shalist(pack_writer, files,
                                           keep_boundaries=opt.keep_boundaries)
@@ -126,3 +163,7 @@ size = hashsplit.total_split
  if opt.bench:
      log('\nbup: %.2fkbytes in %.2f secs = %.2f kbytes/sec\n'
          % (size/1024., secs, size/1024./secs))
+
+if saved_errors:
+    log('WARNING: %d errors encountered while saving.\n' % len(saved_errors))
+    sys.exit(1)
diff --git a/lib/bup/git.py b/lib/bup/git.py

index 66370cacc64d6e6a967fa8220752397175ac7e82..75d6443d3af25cc3116b6a2900d01cafc1e37629 100644 (file)
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -870,11 +870,12 @@ class CatPipe:
          assert(not self.inprogress)
          assert(id.find('\n') < 0)
          assert(id.find('\r') < 0)
-        assert(id[0] != '-')
+        assert(not id.startswith('-'))
          self.inprogress = id
          self.p.stdin.write('%s\n' % id)
          hdr = self.p.stdout.readline()
          if hdr.endswith(' missing\n'):
+            self.inprogress = None
              raise KeyError('blob %r is missing' % id)
          spl = hdr.split(' ')
          if len(spl) != 3 or len(spl[0]) != 40:
diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py

index b9896a096ff84e8813842011d8cd170913cb20a3..d73e9f689acb27662241ff6af0e8d2b98b509566 100644 (file)
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -189,5 +189,5 @@ def open_noatime(name):
  
  def fadvise_done(f, ofs):
      assert(ofs >= 0)
-    if ofs > 0:
+    if ofs > 0 and hasattr(f, 'fileno'):
          _helpers.fadvise_done(f.fileno(), ofs)
diff --git a/t/test.sh b/t/test.sh

index 1f2f916b8ca6bcb6b4769b5083d52910aae7ab8a..02590aaf7be0f385101cb307205da9b19e21c7bd 100755 (executable)
--- a/t/test.sh
+++ b/t/test.sh
@@ -124,8 +124,19 @@ WVFAIL bup save -r :$BUP_DIR -n r-test $D/fake/path
  WVSTART "split"
  echo a >a.tmp
  echo b >b.tmp
-WVPASSEQ $(bup split -b a.tmp b.tmp | wc -l) 1
-WVPASSEQ $(bup split -b --keep-boundaries a.tmp b.tmp | wc -l) 2
+WVPASS bup split -b a.tmp >taga.tmp
+WVPASS bup split -b b.tmp >tagb.tmp
+cat a.tmp b.tmp | WVPASS bup split -b >tagab.tmp
+WVPASSEQ "$(cat taga.tmp | wc -l)" 1
+WVPASSEQ "$(cat tagb.tmp | wc -l)" 1
+WVPASSEQ "$(cat tagab.tmp | wc -l)" 1
+WVPASSEQ "$(cat tag[ab].tmp | wc -l)" 2
+WVPASSEQ "$(bup split -b a.tmp b.tmp)" "$(cat tagab.tmp)"
+WVPASSEQ "$(bup split -b --keep-boundaries a.tmp b.tmp)" "$(cat tag[ab].tmp)"
+WVPASSEQ "$(cat tag[ab].tmp | bup split -b --keep-boundaries --git-ids)" \
+         "$(cat tag[ab].tmp)"
+WVPASSEQ "$(cat tag[ab].tmp | bup split -b --git-ids)" \
+         "$(cat tagab.tmp)"
  WVPASS bup split --bench -b <t/testfile1 >tags1.tmp
  WVPASS bup split -vvvv -b t/testfile2 >tags2.tmp
  WVPASS bup margin
author	Avery Pennarun <apenwarr@gmail.com>
	Wed, 22 Sep 2010 13:02:32 +0000 (06:02 -0700)
committer	Avery Pennarun <apenwarr@gmail.com>
	Wed, 22 Sep 2010 13:02:32 +0000 (06:02 -0700)
Documentation/bup-split.md		patch \| blob \| history
cmd/split-cmd.py		patch \| blob \| history
lib/bup/git.py		patch \| blob \| history
lib/bup/hashsplit.py		patch \| blob \| history
t/test.sh		patch \| blob \| history