From b26f361cd6210d746e9764140630c87dc23f3da5 Mon Sep 17 00:00:00 2001 From: Avery Pennarun Date: Wed, 22 Sep 2010 05:07:33 -0700 Subject: [PATCH] cmd/split: add a new --keep-boundaries option. If you provide multiple input files on the command line, sometimes you want to merge them together into a single file before re-chunking them (the default). But sometimes you want all the files to be treated separately for chunking purposes, i.e. when you know that some of the files will never change so there's never any point in merging them with previous/subsequent files. Signed-off-by: Avery Pennarun --- Documentation/bup-split.md | 15 ++++++++++++++- cmd/save-cmd.py | 2 +- cmd/split-cmd.py | 7 +++++-- lib/bup/hashsplit.py | 27 ++++++++++++++++++++------- t/test.sh | 4 ++++ 5 files changed, 44 insertions(+), 11 deletions(-) diff --git a/Documentation/bup-split.md b/Documentation/bup-split.md index bf219bc..0ab5d09 100644 --- a/Documentation/bup-split.md +++ b/Documentation/bup-split.md @@ -10,7 +10,8 @@ bup-split - save individual files to bup backup sets bup split [-r *host*:*path*] <-b|-t|-c|-n *name*> [-v] [-q] [--bench] [--max-pack-size=*bytes*] - [--max-pack-objects=*n*] [--fanout=*count] [filenames...] + [--max-pack-objects=*n*] [--fanout=*count] + [--keep-boundaries] [filenames...] # DESCRIPTION @@ -72,6 +73,18 @@ To get the data back, use `bup-join`(1). -v, --verbose : increase verbosity (can be used more than once). +--keep-boundaries +: if multiple filenames are given on the command line, + they are normally concatenated together as if the + content all came from a single file. That is, the + set of blobs/trees produced is identical to what it + would have been if there had been a single input file. + However, if you use `--keep-boundaries`, each file is + split separately. You still only get a single tree or + commit or series of blobs, but each blob comes from + only one of the files; the end of one of the input + files always ends a blob. 
+ --noop : read the data and split it into blocks based on the "bupsplit" rolling checksum algorithm, but don't do anything with diff --git a/cmd/save-cmd.py b/cmd/save-cmd.py index 5b48afd..e9b6e9e 100755 --- a/cmd/save-cmd.py +++ b/cmd/save-cmd.py @@ -239,7 +239,7 @@ for (transname,ent) in r.filter(extra, wantrecurse=wantrecurse_during): add_error(e) lastskip_name = ent.name else: - (mode, id) = hashsplit.split_to_blob_or_tree(w, [f]) + (mode, id) = hashsplit.split_to_blob_or_tree(w, [f], False) else: if stat.S_ISDIR(ent.mode): assert(0) # handled above diff --git a/cmd/split-cmd.py b/cmd/split-cmd.py index 94fba53..44c3cdc 100755 --- a/cmd/split-cmd.py +++ b/cmd/split-cmd.py @@ -15,6 +15,7 @@ n,name= name of backup set to update (if any) d,date= date for the commit (seconds since the epoch) q,quiet don't print progress messages v,verbose increase log output (can be used more than once) +keep-boundaries don't let one chunk span two input files noop don't actually save the data anywhere copy just copy input to output, hashsplitting along the way bench print benchmark timings to stderr @@ -78,11 +79,13 @@ else: files = extra and (open(fn) for fn in extra) or [sys.stdin] if pack_writer: - shalist = hashsplit.split_to_shalist(pack_writer, files) + shalist = hashsplit.split_to_shalist(pack_writer, files, + keep_boundaries=opt.keep_boundaries) tree = pack_writer.new_tree(shalist) else: last = 0 - for (blob, bits) in hashsplit.hashsplit_iter(files): + for (blob, bits) in hashsplit.hashsplit_iter(files, + keep_boundaries=opt.keep_boundaries): hashsplit.total_split += len(blob) if opt.copy: sys.stdout.write(str(blob)) diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py index 61840ad..b9896a0 100644 --- a/lib/bup/hashsplit.py +++ b/lib/bup/hashsplit.py @@ -72,7 +72,7 @@ def drainbuf(buf, finalize): yield (buf.get(buf.used()), 0) -def hashsplit_iter(files): +def _hashsplit_iter(files): assert(BLOB_HWM > BLOB_MAX) buf = Buf() fi = blobiter(files) @@ -89,10 +89,23 @@ def 
hashsplit_iter(files): buf.put(bnew) +def _hashsplit_iter_keep_boundaries(files): + for f in files: + for i in _hashsplit_iter([f]): + yield i + + +def hashsplit_iter(files, keep_boundaries): + if keep_boundaries: + return _hashsplit_iter_keep_boundaries(files) + else: + return _hashsplit_iter(files) + + total_split = 0 -def _split_to_blobs(w, files): +def _split_to_blobs(w, files, keep_boundaries): global total_split - for (blob, bits) in hashsplit_iter(files): + for (blob, bits) in hashsplit_iter(files, keep_boundaries): sha = w.new_blob(blob) total_split += len(blob) if w.outbytes >= max_pack_size or w.count >= max_pack_objects: @@ -127,8 +140,8 @@ def _squish(w, stacks, n): i += 1 -def split_to_shalist(w, files): - sl = _split_to_blobs(w, files) +def split_to_shalist(w, files, keep_boundaries): + sl = _split_to_blobs(w, files, keep_boundaries) if not fanout: shal = [] for (sha,size,bits) in sl: @@ -152,8 +165,8 @@ def split_to_shalist(w, files): return _make_shalist(stacks[-1])[0] -def split_to_blob_or_tree(w, files): - shalist = list(split_to_shalist(w, files)) +def split_to_blob_or_tree(w, files, keep_boundaries): + shalist = list(split_to_shalist(w, files, keep_boundaries)) if len(shalist) == 1: return (shalist[0][0], shalist[0][2]) elif len(shalist) == 0: diff --git a/t/test.sh b/t/test.sh index acfdda5..1f2f916 100755 --- a/t/test.sh +++ b/t/test.sh @@ -122,6 +122,10 @@ WVFAIL bup save -r :$BUP_DIR/fake/path -n r-test $D WVFAIL bup save -r :$BUP_DIR -n r-test $D/fake/path WVSTART "split" +echo a >a.tmp +echo b >b.tmp +WVPASSEQ $(bup split -b a.tmp b.tmp | wc -l) 1 +WVPASSEQ $(bup split -b --keep-boundaries a.tmp b.tmp | wc -l) 2 WVPASS bup split --bench -b tags1.tmp WVPASS bup split -vvvv -b t/testfile2 >tags2.tmp WVPASS bup margin -- 2.39.2