From b26f361cd6210d746e9764140630c87dc23f3da5 Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Wed, 22 Sep 2010 05:07:33 -0700
Subject: [PATCH] cmd/split: add a new --keep-boundaries option.

If you provide multiple input files on the command line, sometimes you want
to merge them together into a single file before re-chunking them (the
default).  But sometimes you want all the files to be treated separately for
chunking purposes, i.e. when you know that some of the files will never
change, so there's never any point in merging them with previous/subsequent
files.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 Documentation/bup-split.md | 15 ++++++++++++++-
 cmd/save-cmd.py            |  2 +-
 cmd/split-cmd.py           |  7 +++++--
 lib/bup/hashsplit.py       | 27 ++++++++++++++++++++-------
 t/test.sh                  |  4 ++++
 5 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/Documentation/bup-split.md b/Documentation/bup-split.md
index bf219bc..0ab5d09 100644
--- a/Documentation/bup-split.md
+++ b/Documentation/bup-split.md
@@ -10,7 +10,8 @@ bup-split - save individual files to bup backup sets
 
 bup split [-r *host*:*path*] <-b|-t|-c|-n *name*> [-v] [-q]
   [--bench] [--max-pack-size=*bytes*]
-  [--max-pack-objects=*n*] [--fanout=*count] [filenames...]
+  [--max-pack-objects=*n*] [--fanout=*count*]
+  [--keep-boundaries] [filenames...]
 
 # DESCRIPTION
 
@@ -72,6 +73,18 @@ To get the data back, use `bup-join`(1).
 -v, --verbose
 :   increase verbosity (can be used more than once).
 
+--keep-boundaries
+:   if multiple filenames are given on the command line,
+    they are normally concatenated together as if the
+    content all came from a single file.  That is, the
+    set of blobs/trees produced is identical to what it
+    would have been if there had been a single input file. 
+    However, if you use `--keep-boundaries`, each file is
+    split separately.  You still only get a single tree or
+    commit or series of blobs, but each blob comes from
+    only one of the files; the end of one of the input
+    files always ends a blob.
+
 --noop
 :   read the data and split it into blocks based on the "bupsplit"
     rolling checksum algorithm, but don't do anything with
diff --git a/cmd/save-cmd.py b/cmd/save-cmd.py
index 5b48afd..e9b6e9e 100755
--- a/cmd/save-cmd.py
+++ b/cmd/save-cmd.py
@@ -239,7 +239,7 @@ for (transname,ent) in r.filter(extra, wantrecurse=wantrecurse_during):
                 add_error(e)
                 lastskip_name = ent.name
             else:
-                (mode, id) = hashsplit.split_to_blob_or_tree(w, [f])
+                (mode, id) = hashsplit.split_to_blob_or_tree(w, [f], False)
         else:
             if stat.S_ISDIR(ent.mode):
                 assert(0)  # handled above
diff --git a/cmd/split-cmd.py b/cmd/split-cmd.py
index 94fba53..44c3cdc 100755
--- a/cmd/split-cmd.py
+++ b/cmd/split-cmd.py
@@ -15,6 +15,7 @@ n,name=    name of backup set to update (if any)
 d,date=    date for the commit (seconds since the epoch)
 q,quiet    don't print progress messages
 v,verbose  increase log output (can be used more than once)
+keep-boundaries  don't let one chunk span two input files
 noop       don't actually save the data anywhere
 copy       just copy input to output, hashsplitting along the way
 bench      print benchmark timings to stderr
@@ -78,11 +79,13 @@ else:
 
 files = extra and (open(fn) for fn in extra) or [sys.stdin]
 if pack_writer:
-    shalist = hashsplit.split_to_shalist(pack_writer, files)
+    shalist = hashsplit.split_to_shalist(pack_writer, files,
+                                         keep_boundaries=opt.keep_boundaries)
     tree = pack_writer.new_tree(shalist)
 else:
     last = 0
-    for (blob, bits) in hashsplit.hashsplit_iter(files):
+    for (blob, bits) in hashsplit.hashsplit_iter(files,
+                                    keep_boundaries=opt.keep_boundaries):
         hashsplit.total_split += len(blob)
         if opt.copy:
             sys.stdout.write(str(blob))
diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py
index 61840ad..b9896a0 100644
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -72,7 +72,7 @@ def drainbuf(buf, finalize):
         yield (buf.get(buf.used()), 0)
 
 
-def hashsplit_iter(files):
+def _hashsplit_iter(files):
     assert(BLOB_HWM > BLOB_MAX)
     buf = Buf()
     fi = blobiter(files)
@@ -89,10 +89,23 @@ def hashsplit_iter(files):
             buf.put(bnew)
 
 
+def _hashsplit_iter_keep_boundaries(files):
+    for f in files:
+        for i in _hashsplit_iter([f]):
+            yield i
+
+
+def hashsplit_iter(files, keep_boundaries):
+    if keep_boundaries:
+        return _hashsplit_iter_keep_boundaries(files)
+    else:
+        return _hashsplit_iter(files)
+
+
 total_split = 0
-def _split_to_blobs(w, files):
+def _split_to_blobs(w, files, keep_boundaries):
     global total_split
-    for (blob, bits) in hashsplit_iter(files):
+    for (blob, bits) in hashsplit_iter(files, keep_boundaries):
         sha = w.new_blob(blob)
         total_split += len(blob)
         if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
@@ -127,8 +140,8 @@ def _squish(w, stacks, n):
         i += 1
 
 
-def split_to_shalist(w, files):
-    sl = _split_to_blobs(w, files)
+def split_to_shalist(w, files, keep_boundaries):
+    sl = _split_to_blobs(w, files, keep_boundaries)
     if not fanout:
         shal = []
         for (sha,size,bits) in sl:
@@ -152,8 +165,8 @@ def split_to_shalist(w, files):
         return _make_shalist(stacks[-1])[0]
 
 
-def split_to_blob_or_tree(w, files):
-    shalist = list(split_to_shalist(w, files))
+def split_to_blob_or_tree(w, files, keep_boundaries):
+    shalist = list(split_to_shalist(w, files, keep_boundaries))
     if len(shalist) == 1:
         return (shalist[0][0], shalist[0][2])
     elif len(shalist) == 0:
diff --git a/t/test.sh b/t/test.sh
index acfdda5..1f2f916 100755
--- a/t/test.sh
+++ b/t/test.sh
@@ -122,6 +122,10 @@ WVFAIL bup save -r :$BUP_DIR/fake/path -n r-test $D
 WVFAIL bup save -r :$BUP_DIR -n r-test $D/fake/path
 
 WVSTART "split"
+echo a >a.tmp
+echo b >b.tmp
+WVPASSEQ $(bup split -b a.tmp b.tmp | wc -l) 1
+WVPASSEQ $(bup split -b --keep-boundaries a.tmp b.tmp | wc -l) 2
 WVPASS bup split --bench -b <t/testfile1 >tags1.tmp
 WVPASS bup split -vvvv -b t/testfile2 >tags2.tmp
 WVPASS bup margin
-- 
2.39.2