cmd-split and hashsplit: cleaning up in preparation for refactoring.

author Avery Pennarun <apenwarr@gmail.com>

Fri, 12 Feb 2010 04:50:18 +0000 (23:50 -0500)

committer Avery Pennarun <apenwarr@gmail.com>

Fri, 12 Feb 2010 04:50:19 +0000 (23:50 -0500)
author Avery Pennarun <apenwarr@gmail.com>
Fri, 12 Feb 2010 04:50:18 +0000 (23:50 -0500)
committer Avery Pennarun <apenwarr@gmail.com>
Fri, 12 Feb 2010 04:50:19 +0000 (23:50 -0500)
diff --git a/_hashsplit.c b/_hashsplit.c

index 732149f52507fd4b7847edcc09a1fdb60e99b09b..e93eff20cd20985e8dc7334447eeac4052316160 100644 (file)
--- a/_hashsplit.c
+++ b/_hashsplit.c
@@ -2,8 +2,8 @@
  #include <assert.h>
  #include <stdint.h>
  
-#define BLOBBITS (14)
-#define BLOBSIZE (1<<(BLOBBITS-1))
+#define BLOBBITS (13)
+#define BLOBSIZE (1<<BLOBBITS)
  #define WINDOWBITS (7)
  #define WINDOWSIZE (1<<(WINDOWBITS-1))
  
diff --git a/cmd-split.py b/cmd-split.py

index 8bc267e8a38cb8f2fc7fe407981c8974de283ec9..21c0ed3b700af5bcd6b9585b8f969598c1c40253 100755 (executable)
--- a/cmd-split.py
+++ b/cmd-split.py
@@ -13,6 +13,8 @@ b,blobs    output a series of blob ids
  t,tree     output a tree id
  c,commit   output a commit id
  n,name=    name of backup set to update (if any)
+N,noop     don't actually save the data anywhere
+q,quiet    don't print progress messages
  v,verbose  increase log output (can be used more than once)
  bench      print benchmark timings to stderr
  max-pack-size=  maximum bytes in a single pack
@@ -23,11 +25,13 @@ o = options.Options('bup split', optspec)
  (opt, flags, extra) = o.parse(sys.argv[1:])
  
  git.check_repo_or_die()
-if not (opt.blobs or opt.tree or opt.commit or opt.name):
+if not (opt.blobs or opt.tree or opt.commit or opt.name or opt.noop):
      log("bup split: use one or more of -b, -t, -c, -n\n")
      o.usage()
+if opt.noop and (opt.blobs or opt.tree or opt.commit or opt.name):
+    log('bup split: -N is incompabile with -b, -t, -c, -n\n')
+    o.usage()
  
-hashsplit.split_verbosely = opt.verbose
  if opt.verbose >= 2:
      git.verbose = opt.verbose - 1
      opt.bench = 1
@@ -43,7 +47,9 @@ if opt.blobs:
  start_time = time.time()
  
  refname = opt.name and 'refs/heads/%s' % opt.name or None
-if opt.remote:
+if opt.noop:
+    cli = w = oldref = None
+elif opt.remote:
      cli = client.Client(opt.remote)
      oldref = refname and cli.read_ref(refname) or None
      w = cli.new_packwriter()
@@ -51,9 +57,20 @@ else:
      cli = None
      oldref = refname and git.read_ref(refname) or None
      w = git.PackWriter()
-    
-shalist = hashsplit.split_to_shalist(w, hashsplit.autofiles(extra))
-tree = w.new_tree(shalist)
+
+files = extra and (open(fn) for fn in extra) or [sys.stdin]
+if w:
+    shalist = hashsplit.split_to_shalist(w, files)
+    tree = w.new_tree(shalist)
+else:
+    last = 0
+    for blob in hashsplit.hashsplit_iter(files):
+        hashsplit.total_split += len(blob)
+        megs = hashsplit.total_split/1024/1024
+        if not opt.quiet and last != megs:
+            progress('%d Mbytes read\r' % megs)
+            last = megs
+    progress('%d Mbytes read, done.\n' % megs)
  
  if opt.verbose:
      log('\n')
@@ -69,7 +86,8 @@ if opt.commit or opt.name:
      if opt.commit:
          print commit.encode('hex')
  
-w.close()  # must close before we can update the ref
+if w:
+    w.close()  # must close before we can update the ref
          
  if opt.name:
      if cli:
diff --git a/hashsplit.py b/hashsplit.py

index c82896f0fe39ab812fdb4a6a8af1c860a594458c..8fe5771a68001c7d0e79182a8063132486753e11 100644 (file)
--- a/hashsplit.py
+++ b/hashsplit.py
@@ -5,7 +5,6 @@ from helpers import *
  BLOB_LWM = 8192*2
  BLOB_MAX = BLOB_LWM*2
  BLOB_HWM = 1024*1024
-split_verbosely = 0
  progress_callback = None
  max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
  max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
@@ -41,8 +40,6 @@ def splitbuf(buf):
      b = buf.peek(buf.used())
      ofs = _hashsplit.splitbuf(b)
      if ofs:
-        if split_verbosely >= 2:
-            log('.')
          buf.eat(ofs)
          return buffer(b, 0, ofs)
      return None
@@ -50,89 +47,52 @@ def splitbuf(buf):
  
  def blobiter(files):
      for f in files:
-        b = 1
-        while b:
+        while 1:
              b = f.read(BLOB_HWM)
-            if b:
-                yield b
-    yield '' # EOF indicator
+            if not b:
+                break
+            yield b
  
  
-def autofiles(filenames):
-    if not filenames:
-        yield sys.stdin
-    else:
-        for n in filenames:
-            yield open(n)
-            
-    
-def hashsplit_iter(w, files):
-    ofs = 0
+def hashsplit_iter(files):
+    assert(BLOB_HWM > BLOB_MAX)
      buf = Buf()
      fi = blobiter(files)
-    blob = 1
-
-    eof = 0
-    lv = 0
-    while blob or not eof:
-        if not eof and (buf.used() < BLOB_LWM or not blob):
-            bnew = fi.next()
-            if not bnew: eof = 1
-            #log('got %d, total %d\n' % (len(bnew), buf.used()))
-            buf.put(bnew)
-
+    while 1:
          blob = splitbuf(buf)
-        if eof and not blob:
-            blob = buf.get(buf.used())
-        if not blob and buf.used() >= BLOB_MAX:
-            blob = buf.get(buf.used())  # limit max blob size
-        if not blob and not eof:
-            continue
-
          if blob:
-            if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
-                w.breakpoint()
-            yield (ofs, len(blob), w.new_blob(blob))
-            ofs += len(blob)
-          
-        nv = (ofs + buf.used())/1000000
-        if nv != lv:
-            if split_verbosely >= 1:
-                log('%d\t' % nv)
-            lv = nv
+            yield blob
+        else:
+            if buf.used() >= BLOB_MAX:
+                # limit max blob size
+                yield (buf.get(buf.used()), 0)
+            while buf.used() < BLOB_HWM:
+                bnew = next(fi)
+                if not bnew:
+                    # eof
+                    if buf.used():
+                        yield buf.get(buf.used())
+                    return
+                buf.put(bnew)
  
  
  total_split = 0
  def _split_to_shalist(w, files):
      global total_split
      ofs = 0
-    last_ofs = 0
-    for (ofs, size, sha) in hashsplit_iter(w, files):
-        #log('SPLIT @ %-8d size=%-8d\n' % (ofs, size))
-        # this silliness keeps chunk filenames "similar" when a file changes
-        # slightly.
-        bm = BLOB_MAX
-        while 1:
-            cn = ofs / bm * bm
-            #log('%x,%x,%x,%x\n' % (last_ofs,ofs,cn,bm))
-            if cn > last_ofs or ofs == last_ofs: break
-            bm /= 2
-        last_ofs = cn
-        total_split += size
+    for blob in hashsplit_iter(files):
+        sha = w.new_blob(blob)
+        total_split += len(blob)
+        if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
+            w.breakpoint()
          if progress_callback:
-            progress_callback(size)
-        yield ('100644', 'bup.chunk.%016x' % cn, sha)
-
-
-def _next(i):
-    try:
-        return i.next()
-    except StopIteration:
-        return None
+            progress_callback(len(blob))
+        yield ('100644', '%016x' % ofs, sha)
+        ofs += len(blob)
  
  
  def split_to_shalist(w, files):
-    sl = iter(_split_to_shalist(w, files))
+    sl = _split_to_shalist(w, files)
      if not fanout:
          shalist = list(sl)
      else:
author	Avery Pennarun <apenwarr@gmail.com>
	Fri, 12 Feb 2010 04:50:18 +0000 (23:50 -0500)
committer	Avery Pennarun <apenwarr@gmail.com>
	Fri, 12 Feb 2010 04:50:19 +0000 (23:50 -0500)
_hashsplit.c		patch | blob | history
cmd-split.py		patch | blob | history
hashsplit.py		patch | blob | history