cmd-split and hashsplit: cleaning up in preparation for refactoring.
diff --git a/hashsplit.py b/hashsplit.py
index c82896f0fe39ab812fdb4a6a8af1c860a594458c..8fe5771a68001c7d0e79182a8063132486753e11 100644
--- a/hashsplit.py
+++ b/hashsplit.py
@@ -5,7 +5,6 @@ from helpers import *
 BLOB_LWM = 8192*2
 BLOB_MAX = BLOB_LWM*2
 BLOB_HWM = 1024*1024
-split_verbosely = 0
 progress_callback = None
 max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
 max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
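
(For scale, the constants above work out to: BLOB_LWM = 16 KiB, BLOB_MAX = 32 KiB, BLOB_HWM = 1 MiB; and the max_pack_objects cap, at the roughly 83 bytes of cache per object noted in the comment, comes to about 200,000 × 83 ≈ 16.6 MB of cache memory per pack.)
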
@@ -41,8 +40,6 @@ def splitbuf(buf):
     b = buf.peek(buf.used())
     ofs = _hashsplit.splitbuf(b)
     if ofs:
-        if split_verbosely >= 2:
-            log('.')
         buf.eat(ofs)
         return buffer(b, 0, ofs)
     return None
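
(Side note on the splitbuf() contract above, illustration only, not part of the patch: it either consumes the buffered data up to the next rolling-checksum split point and returns it as a buffer, or returns None when no split point is buffered yet. A minimal usage sketch, assuming the Buf class from this module and a hypothetical input file name:

    buf = Buf()
    buf.put(open('somefile').read())   # 'somefile' is a placeholder, not a real path
    while 1:
        blob = splitbuf(buf)
        if not blob:
            break                      # no split point left in the buffered data
        print len(blob)                # each blob ends at a rolling-checksum boundary
    # whatever remains in buf has no split point; hashsplit_iter flushes it at EOF

)
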
@@ -50,89 +47,52 @@ def splitbuf(buf):
 
 def blobiter(files):
     for f in files:
-        b = 1
-        while b:
+        while 1:
             b = f.read(BLOB_HWM)
-            if b:
-                yield b
-    yield '' # EOF indicator
+            if not b:
+                break
+            yield b
 
 
-def autofiles(filenames):
-    if not filenames:
-        yield sys.stdin
-    else:
-        for n in filenames:
-            yield open(n)
-            
-    
-def hashsplit_iter(w, files):
-    ofs = 0
+def hashsplit_iter(files):
+    assert(BLOB_HWM > BLOB_MAX)
     buf = Buf()
     fi = blobiter(files)
-    blob = 1
-
-    eof = 0
-    lv = 0
-    while blob or not eof:
-        if not eof and (buf.used() < BLOB_LWM or not blob):
-            bnew = fi.next()
-            if not bnew: eof = 1
-            #log('got %d, total %d\n' % (len(bnew), buf.used()))
-            buf.put(bnew)
-
+    while 1:
         blob = splitbuf(buf)
-        if eof and not blob:
-            blob = buf.get(buf.used())
-        if not blob and buf.used() >= BLOB_MAX:
-            blob = buf.get(buf.used())  # limit max blob size
-        if not blob and not eof:
-            continue
-
         if blob:
-            if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
-                w.breakpoint()
-            yield (ofs, len(blob), w.new_blob(blob))
-            ofs += len(blob)
-          
-        nv = (ofs + buf.used())/1000000
-        if nv != lv:
-            if split_verbosely >= 1:
-                log('%d\t' % nv)
-            lv = nv
+            yield blob
+        else:
+            if buf.used() >= BLOB_MAX:
+                # limit max blob size
+                yield buf.get(buf.used())
+            while buf.used() < BLOB_HWM:
+                bnew = next(fi)
+                if not bnew:
+                    # eof
+                    if buf.used():
+                        yield buf.get(buf.used())
+                    return
+                buf.put(bnew)
 
 
 total_split = 0
 def _split_to_shalist(w, files):
     global total_split
     ofs = 0
-    last_ofs = 0
-    for (ofs, size, sha) in hashsplit_iter(w, files):
-        #log('SPLIT @ %-8d size=%-8d\n' % (ofs, size))
-        # this silliness keeps chunk filenames "similar" when a file changes
-        # slightly.
-        bm = BLOB_MAX
-        while 1:
-            cn = ofs / bm * bm
-            #log('%x,%x,%x,%x\n' % (last_ofs,ofs,cn,bm))
-            if cn > last_ofs or ofs == last_ofs: break
-            bm /= 2
-        last_ofs = cn
-        total_split += size
+    for blob in hashsplit_iter(files):
+        sha = w.new_blob(blob)
+        total_split += len(blob)
+        if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
+            w.breakpoint()
         if progress_callback:
-            progress_callback(size)
-        yield ('100644', 'bup.chunk.%016x' % cn, sha)
-
-
-def _next(i):
-    try:
-        return i.next()
-    except StopIteration:
-        return None
+            progress_callback(len(blob))
+        yield ('100644', '%016x' % ofs, sha)
+        ofs += len(blob)
 
 
 def split_to_shalist(w, files):
-    sl = iter(_split_to_shalist(w, files))
+    sl = _split_to_shalist(w, files)
     if not fanout:
         shalist = list(sl)
     else: