From: Avery Pennarun Date: Wed, 22 Sep 2010 13:02:32 +0000 (-0700) Subject: cmd/split: add a --git-ids option. X-Git-Tag: bup-0.20~12 X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?p=bup.git;a=commitdiff_plain;h=5570b95f0519dfa7c2ce78413131d4433738cc8f cmd/split: add a --git-ids option. This lets you provide a list of git object ids on stdin instead of the raw content. bup-split then uses a CatPipe to retrieve the objects from git and hashsplit them. You could use this as a helper for converting a git repo that contains a bunch of large files into one that uses bup-style hashsplit files. Signed-off-by: Avery Pennarun --- diff --git a/Documentation/bup-split.md b/Documentation/bup-split.md index 0ab5d09..43fccc5 100644 --- a/Documentation/bup-split.md +++ b/Documentation/bup-split.md @@ -11,7 +11,7 @@ bup-split - save individual files to bup backup sets bup split [-r *host*:*path*] <-b|-t|-c|-n *name*> [-v] [-q] [--bench] [--max-pack-size=*bytes*] [--max-pack-objects=*n*] [--fanout=*count] - [--keep-boundaries] [filenames...] + [--git-ids] [--keep-boundaries] [filenames...] # DESCRIPTION @@ -73,6 +73,15 @@ To get the data back, use `bup-join`(1). -v, --verbose : increase verbosity (can be used more than once). +--git-ids +: stdin is a list of git object ids instead of raw data. + `bup split` will read the contents of each named git + object (if it exists in the bup repository) and split + it. This might be useful for converting a git + repository with large binary files to use bup-style + hashsplitting instead. This option is probably most + useful when combined with `--keep-boundaries`. 
+ --keep-boundaries : if multiple filenames are given on the command line, they are normally concatenated together as if the diff --git a/cmd/split-cmd.py b/cmd/split-cmd.py index 44c3cdc..2a72bd6 100755 --- a/cmd/split-cmd.py +++ b/cmd/split-cmd.py @@ -15,6 +15,7 @@ n,name= name of backup set to update (if any) d,date= date for the commit (seconds since the epoch) q,quiet don't print progress messages v,verbose increase log output (can be used more than once) +git-ids read a list of git object ids from stdin and split their contents keep-boundaries don't let one chunk span two input files noop don't actually save the data anywhere copy just copy input to output, hashsplitting along the way @@ -35,6 +36,8 @@ if not (opt.blobs or opt.tree or opt.commit or opt.name or if (opt.noop or opt.copy) and (opt.blobs or opt.tree or opt.commit or opt.name): o.fatal('-N and --copy are incompatible with -b, -t, -c, -n') +if extra and opt.git_ids: + o.fatal("don't provide filenames when using --git-ids") if opt.verbose >= 2: git.verbose = opt.verbose - 1 @@ -77,7 +80,41 @@ else: oldref = refname and git.read_ref(refname) or None pack_writer = git.PackWriter() -files = extra and (open(fn) for fn in extra) or [sys.stdin] +if opt.git_ids: + # the input is actually a series of git object ids that we should retrieve + # and split. + # + # This is a bit messy, but basically it converts from a series of + # CatPipe.get() iterators into a series of file-type objects. + # It would be less ugly if either CatPipe.get() returned a file-like object + # (not very efficient), or split_to_shalist() expected an iterator instead + # of a file. 
+ cp = git.CatPipe() + class IterToFile: + def __init__(self, it): + self.it = iter(it) + def read(self, size): + v = next(self.it) + return v or '' + def read_ids(): + while 1: + line = sys.stdin.readline() + if not line: + break + if line: + line = line.strip() + try: + it = cp.get(line.strip()) + next(it) # skip the file type + except KeyError, e: + add_error('error: %s' % e) + continue + yield IterToFile(it) + files = read_ids() +else: + # the input either comes from a series of files or from stdin. + files = extra and (open(fn) for fn in extra) or [sys.stdin] + if pack_writer: shalist = hashsplit.split_to_shalist(pack_writer, files, keep_boundaries=opt.keep_boundaries) @@ -126,3 +163,7 @@ size = hashsplit.total_split if opt.bench: log('\nbup: %.2fkbytes in %.2f secs = %.2f kbytes/sec\n' % (size/1024., secs, size/1024./secs)) + +if saved_errors: + log('WARNING: %d errors encountered while saving.\n' % len(saved_errors)) + sys.exit(1) diff --git a/lib/bup/git.py b/lib/bup/git.py index 66370ca..75d6443 100644 --- a/lib/bup/git.py +++ b/lib/bup/git.py @@ -870,11 +870,12 @@ class CatPipe: assert(not self.inprogress) assert(id.find('\n') < 0) assert(id.find('\r') < 0) - assert(id[0] != '-') + assert(not id.startswith('-')) self.inprogress = id self.p.stdin.write('%s\n' % id) hdr = self.p.stdout.readline() if hdr.endswith(' missing\n'): + self.inprogress = None raise KeyError('blob %r is missing' % id) spl = hdr.split(' ') if len(spl) != 3 or len(spl[0]) != 40: diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py index b9896a0..d73e9f6 100644 --- a/lib/bup/hashsplit.py +++ b/lib/bup/hashsplit.py @@ -189,5 +189,5 @@ def open_noatime(name): def fadvise_done(f, ofs): assert(ofs >= 0) - if ofs > 0: + if ofs > 0 and hasattr(f, 'fileno'): _helpers.fadvise_done(f.fileno(), ofs) diff --git a/t/test.sh b/t/test.sh index 1f2f916..02590aa 100755 --- a/t/test.sh +++ b/t/test.sh @@ -124,8 +124,19 @@ WVFAIL bup save -r :$BUP_DIR -n r-test $D/fake/path WVSTART "split" echo a 
>a.tmp echo b >b.tmp -WVPASSEQ $(bup split -b a.tmp b.tmp | wc -l) 1 -WVPASSEQ $(bup split -b --keep-boundaries a.tmp b.tmp | wc -l) 2 +WVPASS bup split -b a.tmp >taga.tmp +WVPASS bup split -b b.tmp >tagb.tmp +cat a.tmp b.tmp | WVPASS bup split -b >tagab.tmp +WVPASSEQ "$(cat taga.tmp | wc -l)" 1 +WVPASSEQ "$(cat tagb.tmp | wc -l)" 1 +WVPASSEQ "$(cat tagab.tmp | wc -l)" 1 +WVPASSEQ "$(cat tag[ab].tmp | wc -l)" 2 +WVPASSEQ "$(bup split -b a.tmp b.tmp)" "$(cat tagab.tmp)" +WVPASSEQ "$(bup split -b --keep-boundaries a.tmp b.tmp)" "$(cat tag[ab].tmp)" +WVPASSEQ "$(cat tag[ab].tmp | bup split -b --keep-boundaries --git-ids)" \ + "$(cat tag[ab].tmp)" +WVPASSEQ "$(cat tag[ab].tmp | bup split -b --git-ids)" \ + "$(cat tagab.tmp)" WVPASS bup split --bench -b <t/testfile1 >tags1.tmp WVPASS bup split -vvvv -b t/testfile2 >tags2.tmp WVPASS bup margin