]> arthur.barton.de Git - bup.git/commitdiff
cmd/split: add a new --keep-boundaries option.
authorAvery Pennarun <apenwarr@gmail.com>
Wed, 22 Sep 2010 12:07:33 +0000 (05:07 -0700)
committerAvery Pennarun <apenwarr@gmail.com>
Wed, 22 Sep 2010 12:43:59 +0000 (05:43 -0700)
If you provide multiple input files on the command line, sometimes you want
to merge them together into a single file before re-chunking them (the
default).  But sometimes you want all the files to be treated separately for
chunking purposes, i.e. when you know that some of the files will never
change, so there's never any point in merging them with previous/subsequent
files.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
Documentation/bup-split.md
cmd/save-cmd.py
cmd/split-cmd.py
lib/bup/hashsplit.py
t/test.sh

index bf219bc4757af4877d8d0ff388d6477246448544..0ab5d091ce7d76bd049ead974b7da10f3cbc48ba 100644 (file)
@@ -10,7 +10,8 @@ bup-split - save individual files to bup backup sets
 
 bup split [-r *host*:*path*] <-b|-t|-c|-n *name*> [-v] [-q]
   [--bench] [--max-pack-size=*bytes*]
-  [--max-pack-objects=*n*] [--fanout=*count] [filenames...]
+  [--max-pack-objects=*n*] [--fanout=*count]
+  [--keep-boundaries] [filenames...]
 
 # DESCRIPTION
 
@@ -72,6 +73,18 @@ To get the data back, use `bup-join`(1).
 -v, --verbose
 :   increase verbosity (can be used more than once).
 
+--keep-boundaries
+:   if multiple filenames are given on the command line,
+    they are normally concatenated together as if the
+    content all came from a single file.  That is, the
+    set of blobs/trees produced is identical to what it
+    would have been if there had been a single input file. 
+    However, if you use `--keep-boundaries`, each file is
+    split separately.  You still only get a single tree or
+    commit or series of blobs, but each blob comes from
+    only one of the files; the end of one of the input
+    files always ends a blob.
+
 --noop
 :   read the data and split it into blocks based on the "bupsplit"
     rolling checksum algorithm, but don't do anything with
index 5b48afdc9079bfa0119f341cd0950c8a7cf506a0..e9b6e9e9ec8e7738e21712918e2067fd6eebe000 100755 (executable)
@@ -239,7 +239,7 @@ for (transname,ent) in r.filter(extra, wantrecurse=wantrecurse_during):
                 add_error(e)
                 lastskip_name = ent.name
             else:
-                (mode, id) = hashsplit.split_to_blob_or_tree(w, [f])
+                (mode, id) = hashsplit.split_to_blob_or_tree(w, [f], False)
         else:
             if stat.S_ISDIR(ent.mode):
                 assert(0)  # handled above
index 94fba53bda7793f76ef1cdec467c5fb96059a762..44c3cdc0b960b8e7b6589c05f1ddb0d1bc72480c 100755 (executable)
@@ -15,6 +15,7 @@ n,name=    name of backup set to update (if any)
 d,date=    date for the commit (seconds since the epoch)
 q,quiet    don't print progress messages
 v,verbose  increase log output (can be used more than once)
+keep-boundaries  don't let one chunk span two input files
 noop       don't actually save the data anywhere
 copy       just copy input to output, hashsplitting along the way
 bench      print benchmark timings to stderr
@@ -78,11 +79,13 @@ else:
 
 files = extra and (open(fn) for fn in extra) or [sys.stdin]
 if pack_writer:
-    shalist = hashsplit.split_to_shalist(pack_writer, files)
+    shalist = hashsplit.split_to_shalist(pack_writer, files,
+                                         keep_boundaries=opt.keep_boundaries)
     tree = pack_writer.new_tree(shalist)
 else:
     last = 0
-    for (blob, bits) in hashsplit.hashsplit_iter(files):
+    for (blob, bits) in hashsplit.hashsplit_iter(files,
+                                    keep_boundaries=opt.keep_boundaries):
         hashsplit.total_split += len(blob)
         if opt.copy:
             sys.stdout.write(str(blob))
index 61840ada73de6256fd14e709bd1764e2cb43af06..b9896a096ff84e8813842011d8cd170913cb20a3 100644 (file)
@@ -72,7 +72,7 @@ def drainbuf(buf, finalize):
         yield (buf.get(buf.used()), 0)
 
 
-def hashsplit_iter(files):
+def _hashsplit_iter(files):
     assert(BLOB_HWM > BLOB_MAX)
     buf = Buf()
     fi = blobiter(files)
@@ -89,10 +89,23 @@ def hashsplit_iter(files):
             buf.put(bnew)
 
 
+def _hashsplit_iter_keep_boundaries(files):
+    for f in files:
+        for i in _hashsplit_iter([f]):
+            yield i
+
+
+def hashsplit_iter(files, keep_boundaries):
+    if keep_boundaries:
+        return _hashsplit_iter_keep_boundaries(files)
+    else:
+        return _hashsplit_iter(files)
+
+
 total_split = 0
-def _split_to_blobs(w, files):
+def _split_to_blobs(w, files, keep_boundaries):
     global total_split
-    for (blob, bits) in hashsplit_iter(files):
+    for (blob, bits) in hashsplit_iter(files, keep_boundaries):
         sha = w.new_blob(blob)
         total_split += len(blob)
         if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
@@ -127,8 +140,8 @@ def _squish(w, stacks, n):
         i += 1
 
 
-def split_to_shalist(w, files):
-    sl = _split_to_blobs(w, files)
+def split_to_shalist(w, files, keep_boundaries):
+    sl = _split_to_blobs(w, files, keep_boundaries)
     if not fanout:
         shal = []
         for (sha,size,bits) in sl:
@@ -152,8 +165,8 @@ def split_to_shalist(w, files):
         return _make_shalist(stacks[-1])[0]
 
 
-def split_to_blob_or_tree(w, files):
-    shalist = list(split_to_shalist(w, files))
+def split_to_blob_or_tree(w, files, keep_boundaries):
+    shalist = list(split_to_shalist(w, files, keep_boundaries))
     if len(shalist) == 1:
         return (shalist[0][0], shalist[0][2])
     elif len(shalist) == 0:
index acfdda5093a3ab3a961a57bd8476381637f692d8..1f2f916b8ca6bcb6b4769b5083d52910aae7ab8a 100755 (executable)
--- a/t/test.sh
+++ b/t/test.sh
@@ -122,6 +122,10 @@ WVFAIL bup save -r :$BUP_DIR/fake/path -n r-test $D
 WVFAIL bup save -r :$BUP_DIR -n r-test $D/fake/path
 
 WVSTART "split"
+echo a >a.tmp
+echo b >b.tmp
+WVPASSEQ $(bup split -b a.tmp b.tmp | wc -l) 1
+WVPASSEQ $(bup split -b --keep-boundaries a.tmp b.tmp | wc -l) 2
 WVPASS bup split --bench -b <t/testfile1 >tags1.tmp
 WVPASS bup split -vvvv -b t/testfile2 >tags2.tmp
 WVPASS bup margin