-#!/usr/bin/env python
-import sys, time
+#!/bin/sh
+"""": # -*-python-*-
+bup_python="$(dirname "$0")/bup-python" || exit $?
+exec "$bup_python" "$0" ${1+"$@"}
+"""
+# end of bup preamble
+
+import os, sys, time
+
from bup import hashsplit, git, options, client
-from bup.helpers import *
+from bup.helpers import (add_error, handle_ctrl_c, hostname, log,
+                         parse_date_or_fatal, parse_num, qprogress,
+                         reprogress, saved_errors, userfullname, username,
+                         valid_save_name)
optspec = """
-bup split [-tcb] [-n name] [--bench] [filenames...]
+bup split [-t] [-c] [-n name] OPTIONS [--git-ids | filenames...]
+bup split -b OPTIONS [--git-ids | filenames...]
+bup split <--noop [--copy]|--copy> OPTIONS [--git-ids | filenames...]
--
-r,remote= remote repository path
-b,blobs output a series of blob ids
+ Modes:
+b,blobs output a series of blob ids. Implies --fanout=0.
t,tree output a tree id
c,commit output a commit id
-n,name= name of backup set to update (if any)
-N,noop don't actually save the data anywhere
+n,name= save the result under the given name
+noop split the input, but throw away the result
+copy split the input, copy it to stdout, don't save to repo
+ Options:
+r,remote= remote repository path
+d,date= date for the commit (seconds since the epoch)
q,quiet don't print progress messages
v,verbose increase log output (can be used more than once)
-copy just copy input to output, hashsplitting along the way
+git-ids read a list of git object ids from stdin and split their contents
+keep-boundaries don't let one chunk span two input files
bench print benchmark timings to stderr
max-pack-size= maximum bytes in a single pack
max-pack-objects= maximum number of objects in a single pack
-fanout= maximum number of blobs in a single tree
+fanout= average number of blobs in a single tree
bwlimit= maximum bytes/sec to transmit to server
+#,compress= set compression level to # (0-9, 9 is highest) [1]
"""
-o = options.Options('bup split', optspec)
+o = options.Options(optspec)
(opt, flags, extra) = o.parse(sys.argv[1:])
handle_ctrl_c()
git.check_repo_or_die()
if not (opt.blobs or opt.tree or opt.commit or opt.name or
opt.noop or opt.copy):
- o.fatal("use one or more of -b, -t, -c, -n, -N, --copy")
-if (opt.noop or opt.copy) and (opt.blobs or opt.tree or
+ o.fatal("use one or more of -b, -t, -c, -n, --noop, --copy")
+if (opt.noop or opt.copy) and (opt.blobs or opt.tree or
opt.commit or opt.name):
- o.fatal('-N and --copy are incompatible with -b, -t, -c, -n')
+ o.fatal('--noop and --copy are incompatible with -b, -t, -c, -n')
+if opt.blobs and (opt.tree or opt.commit or opt.name):
+ o.fatal('-b is incompatible with -t, -c, -n')
+if extra and opt.git_ids:
+ o.fatal("don't provide filenames when using --git-ids")
if opt.verbose >= 2:
git.verbose = opt.verbose - 1
opt.bench = 1
+
+max_pack_size = None
if opt.max_pack_size:
- hashsplit.max_pack_size = parse_num(opt.max_pack_size)
+ max_pack_size = parse_num(opt.max_pack_size)
+max_pack_objects = None
if opt.max_pack_objects:
- hashsplit.max_pack_objects = parse_num(opt.max_pack_objects)
+ max_pack_objects = parse_num(opt.max_pack_objects)
+
if opt.fanout:
hashsplit.fanout = parse_num(opt.fanout)
if opt.blobs:
hashsplit.fanout = 0
if opt.bwlimit:
client.bwlimit = parse_num(opt.bwlimit)
+if opt.date:
+ date = parse_date_or_fatal(opt.date, o.fatal)
+else:
+ date = time.time()
+
+total_bytes = 0
+def prog(filenum, nbytes):
+ global total_bytes
+ total_bytes += nbytes
+ if filenum > 0:
+ qprogress('Splitting: file #%d, %d kbytes\r'
+ % (filenum+1, total_bytes/1024))
+ else:
+ qprogress('Splitting: %d kbytes\r' % (total_bytes/1024))
+
is_reverse = os.environ.get('BUP_SERVER_REVERSE')
if is_reverse and opt.remote:
o.fatal("don't use -r in reverse mode; it's automatic")
start_time = time.time()
+if opt.name and not valid_save_name(opt.name):
+ o.fatal("'%s' is not a valid branch name." % opt.name)
refname = opt.name and 'refs/heads/%s' % opt.name or None
if opt.noop or opt.copy:
cli = pack_writer = oldref = None
elif opt.remote or is_reverse:
cli = client.Client(opt.remote)
oldref = refname and cli.read_ref(refname) or None
- pack_writer = cli.new_packwriter()
+ pack_writer = cli.new_packwriter(compression_level=opt.compress,
+ max_pack_size=max_pack_size,
+ max_pack_objects=max_pack_objects)
else:
cli = None
oldref = refname and git.read_ref(refname) or None
- pack_writer = git.PackWriter()
+ pack_writer = git.PackWriter(compression_level=opt.compress,
+ max_pack_size=max_pack_size,
+ max_pack_objects=max_pack_objects)
-files = extra and (open(fn) for fn in extra) or [sys.stdin]
-if pack_writer:
- shalist = hashsplit.split_to_shalist(pack_writer, files)
+if opt.git_ids:
+ # the input is actually a series of git object ids that we should retrieve
+ # and split.
+ #
+ # This is a bit messy, but basically it converts from a series of
+ # CatPipe.get() iterators into a series of file-type objects.
+ # It would be less ugly if either CatPipe.get() returned a file-like object
+ # (not very efficient), or split_to_shalist() expected an iterator instead
+ # of a file.
+ cp = git.CatPipe()
+ class IterToFile:
+ def __init__(self, it):
+ self.it = iter(it)
+ def read(self, size):
+ v = next(self.it, None)
+ return v or ''
+ def read_ids():
+ while 1:
+ line = sys.stdin.readline()
+ if not line:
+ break
+ if line:
+ line = line.strip()
+ try:
+ it = cp.get(line.strip())
+ next(it, None) # skip the file info
+ except KeyError as e:
+ add_error('error: %s' % e)
+ continue
+ yield IterToFile(it)
+ files = read_ids()
+else:
+ # the input either comes from a series of files or from stdin.
+ files = extra and (open(fn) for fn in extra) or [sys.stdin]
+
+if pack_writer and opt.blobs:
+ shalist = hashsplit.split_to_blobs(pack_writer.new_blob, files,
+ keep_boundaries=opt.keep_boundaries,
+ progress=prog)
+ for (sha, size, level) in shalist:
+ print sha.encode('hex')
+ reprogress()
+elif pack_writer: # tree or commit or name
+ if opt.name: # insert dummy_name which may be used as a restore target
+ mode, sha = \
+ hashsplit.split_to_blob_or_tree(pack_writer.new_blob,
+ pack_writer.new_tree,
+ files,
+ keep_boundaries=opt.keep_boundaries,
+ progress=prog)
+ splitfile_name = git.mangle_name('data', hashsplit.GIT_MODE_FILE, mode)
+ shalist = [(mode, splitfile_name, sha)]
+ else:
+ shalist = hashsplit.split_to_shalist(
+ pack_writer.new_blob, pack_writer.new_tree, files,
+ keep_boundaries=opt.keep_boundaries, progress=prog)
tree = pack_writer.new_tree(shalist)
else:
last = 0
- for (blob, bits) in hashsplit.hashsplit_iter(files):
+ it = hashsplit.hashsplit_iter(files,
+ keep_boundaries=opt.keep_boundaries,
+ progress=prog)
+ for (blob, level) in it:
hashsplit.total_split += len(blob)
if opt.copy:
sys.stdout.write(str(blob))
megs = hashsplit.total_split/1024/1024
if not opt.quiet and last != megs:
- progress('%d Mbytes read\r' % megs)
last = megs
- progress('%d Mbytes read, done.\n' % megs)
if opt.verbose:
log('\n')
-if opt.blobs:
- for (mode,name,bin) in shalist:
- print bin.encode('hex')
if opt.tree:
print tree.encode('hex')
if opt.commit or opt.name:
- msg = 'bup split\n\nGenerated by command:\n%r' % sys.argv
+ msg = 'bup split\n\nGenerated by command:\n%r\n' % sys.argv
ref = opt.name and ('refs/heads/%s' % opt.name) or None
- commit = pack_writer.new_commit(oldref, tree, msg)
+ userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
+ commit = pack_writer.new_commit(tree, oldref, userline, date, None,
+ userline, date, None, msg)
if opt.commit:
print commit.encode('hex')
secs = time.time() - start_time
size = hashsplit.total_split
if opt.bench:
- log('\nbup: %.2fkbytes in %.2f secs = %.2f kbytes/sec\n'
+ log('bup: %.2fkbytes in %.2f secs = %.2f kbytes/sec\n'
% (size/1024., secs, size/1024./secs))
+
+if saved_errors:
+ log('WARNING: %d errors encountered while saving.\n' % len(saved_errors))
+ sys.exit(1)