cmd/split: add a new --keep-boundaries option.

author Avery Pennarun <apenwarr@gmail.com>

Wed, 22 Sep 2010 12:07:33 +0000 (05:07 -0700)

committer Avery Pennarun <apenwarr@gmail.com>

Wed, 22 Sep 2010 12:43:59 +0000 (05:43 -0700)
author Avery Pennarun <apenwarr@gmail.com>
Wed, 22 Sep 2010 12:07:33 +0000 (05:07 -0700)
committer Avery Pennarun <apenwarr@gmail.com>
Wed, 22 Sep 2010 12:43:59 +0000 (05:43 -0700)
diff --git a/Documentation/bup-split.md b/Documentation/bup-split.md

index bf219bc4757af4877d8d0ff388d6477246448544..0ab5d091ce7d76bd049ead974b7da10f3cbc48ba 100644 (file)
--- a/Documentation/bup-split.md
+++ b/Documentation/bup-split.md
@@ -10,7 +10,8 @@ bup-split - save individual files to bup backup sets
  
  bup split [-r *host*:*path*] <-b|-t|-c|-n *name*> [-v] [-q]
    [--bench] [--max-pack-size=*bytes*]
-  [--max-pack-objects=*n*] [--fanout=*count] [filenames...]
+  [--max-pack-objects=*n*] [--fanout=*count*]
+  [--keep-boundaries] [filenames...]
  
  # DESCRIPTION
  
@@ -72,6 +73,18 @@ To get the data back, use `bup-join`(1).
  -v, --verbose
  :   increase verbosity (can be used more than once).
  
+--keep-boundaries
+:   if multiple filenames are given on the command line,
+    they are normally concatenated together as if the
+    content all came from a single file.  That is, the
+    set of blobs/trees produced is identical to what it
+    would have been if there had been a single input file. 
+    However, if you use `--keep-boundaries`, each file is
+    split separately.  You still only get a single tree or
+    commit or series of blobs, but each blob comes from
+    only one of the files; the end of one of the input
+    files always ends a blob.
+
  --noop
  :   read the data and split it into blocks based on the "bupsplit"
      rolling checksum algorithm, but don't do anything with
diff --git a/cmd/save-cmd.py b/cmd/save-cmd.py

index 5b48afdc9079bfa0119f341cd0950c8a7cf506a0..e9b6e9e9ec8e7738e21712918e2067fd6eebe000 100755 (executable)
--- a/cmd/save-cmd.py
+++ b/cmd/save-cmd.py
@@ -239,7 +239,7 @@ for (transname,ent) in r.filter(extra, wantrecurse=wantrecurse_during):
                  add_error(e)
                  lastskip_name = ent.name
              else:
-                (mode, id) = hashsplit.split_to_blob_or_tree(w, [f])
+                (mode, id) = hashsplit.split_to_blob_or_tree(w, [f], False)
          else:
              if stat.S_ISDIR(ent.mode):
                  assert(0)  # handled above
diff --git a/cmd/split-cmd.py b/cmd/split-cmd.py

index 94fba53bda7793f76ef1cdec467c5fb96059a762..44c3cdc0b960b8e7b6589c05f1ddb0d1bc72480c 100755 (executable)
--- a/cmd/split-cmd.py
+++ b/cmd/split-cmd.py
@@ -15,6 +15,7 @@ n,name=    name of backup set to update (if any)
  d,date=    date for the commit (seconds since the epoch)
  q,quiet    don't print progress messages
  v,verbose  increase log output (can be used more than once)
+keep-boundaries  don't let one chunk span two input files
  noop       don't actually save the data anywhere
  copy       just copy input to output, hashsplitting along the way
  bench      print benchmark timings to stderr
@@ -78,11 +79,13 @@ else:
  
  files = extra and (open(fn) for fn in extra) or [sys.stdin]
  if pack_writer:
-    shalist = hashsplit.split_to_shalist(pack_writer, files)
+    shalist = hashsplit.split_to_shalist(pack_writer, files,
+                                         keep_boundaries=opt.keep_boundaries)
      tree = pack_writer.new_tree(shalist)
  else:
      last = 0
-    for (blob, bits) in hashsplit.hashsplit_iter(files):
+    for (blob, bits) in hashsplit.hashsplit_iter(files,
+                                    keep_boundaries=opt.keep_boundaries):
          hashsplit.total_split += len(blob)
          if opt.copy:
              sys.stdout.write(str(blob))
diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py

index 61840ada73de6256fd14e709bd1764e2cb43af06..b9896a096ff84e8813842011d8cd170913cb20a3 100644 (file)
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -72,7 +72,7 @@ def drainbuf(buf, finalize):
          yield (buf.get(buf.used()), 0)
  
  
-def hashsplit_iter(files):
+def _hashsplit_iter(files):
      assert(BLOB_HWM > BLOB_MAX)
      buf = Buf()
      fi = blobiter(files)
@@ -89,10 +89,23 @@ def hashsplit_iter(files):
              buf.put(bnew)
  
  
+def _hashsplit_iter_keep_boundaries(files):
+    for f in files:
+        for i in _hashsplit_iter([f]):
+            yield i
+
+
+def hashsplit_iter(files, keep_boundaries):
+    if keep_boundaries:
+        return _hashsplit_iter_keep_boundaries(files)
+    else:
+        return _hashsplit_iter(files)
+
+
  total_split = 0
-def _split_to_blobs(w, files):
+def _split_to_blobs(w, files, keep_boundaries):
      global total_split
-    for (blob, bits) in hashsplit_iter(files):
+    for (blob, bits) in hashsplit_iter(files, keep_boundaries):
          sha = w.new_blob(blob)
          total_split += len(blob)
          if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
@@ -127,8 +140,8 @@ def _squish(w, stacks, n):
          i += 1
  
  
-def split_to_shalist(w, files):
-    sl = _split_to_blobs(w, files)
+def split_to_shalist(w, files, keep_boundaries):
+    sl = _split_to_blobs(w, files, keep_boundaries)
      if not fanout:
          shal = []
          for (sha,size,bits) in sl:
@@ -152,8 +165,8 @@ def split_to_shalist(w, files):
          return _make_shalist(stacks[-1])[0]
  
  
-def split_to_blob_or_tree(w, files):
-    shalist = list(split_to_shalist(w, files))
+def split_to_blob_or_tree(w, files, keep_boundaries):
+    shalist = list(split_to_shalist(w, files, keep_boundaries))
      if len(shalist) == 1:
          return (shalist[0][0], shalist[0][2])
      elif len(shalist) == 0:
diff --git a/t/test.sh b/t/test.sh

index acfdda5093a3ab3a961a57bd8476381637f692d8..1f2f916b8ca6bcb6b4769b5083d52910aae7ab8a 100755 (executable)
--- a/t/test.sh
+++ b/t/test.sh
@@ -122,6 +122,10 @@ WVFAIL bup save -r :$BUP_DIR/fake/path -n r-test $D
  WVFAIL bup save -r :$BUP_DIR -n r-test $D/fake/path
  
  WVSTART "split"
+echo a >a.tmp
+echo b >b.tmp
+WVPASSEQ $(bup split -b a.tmp b.tmp | wc -l) 1
+WVPASSEQ $(bup split -b --keep-boundaries a.tmp b.tmp | wc -l) 2
  WVPASS bup split --bench -b <t/testfile1 >tags1.tmp
  WVPASS bup split -vvvv -b t/testfile2 >tags2.tmp
  WVPASS bup margin
author	Avery Pennarun <apenwarr@gmail.com>
	Wed, 22 Sep 2010 12:07:33 +0000 (05:07 -0700)
committer	Avery Pennarun <apenwarr@gmail.com>
	Wed, 22 Sep 2010 12:43:59 +0000 (05:43 -0700)
Documentation/bup-split.md		patch \| blob \| history
cmd/save-cmd.py		patch \| blob \| history
cmd/split-cmd.py		patch \| blob \| history
lib/bup/hashsplit.py		patch \| blob \| history
t/test.sh		patch \| blob \| history