Greatly improved progress reporting during index/save.

author Avery Pennarun <apenwarr@gmail.com>
Thu, 4 Feb 2010 06:12:06 +0000 (01:12 -0500)

committer Avery Pennarun <apenwarr@gmail.com>
Thu, 4 Feb 2010 06:12:06 +0000 (01:12 -0500)

diff --git a/cmd-fsck.py b/cmd-fsck.py

index 1818a9f0eec1343a276f7cb968ca7a14c8964633..30f9b07c770dfa790a0b2f27574cc8edaad14de5 100755 (executable)
--- a/cmd-fsck.py
+++ b/cmd-fsck.py
@@ -3,7 +3,6 @@ import sys, os, glob, subprocess, time, sha
  import options, git
  from helpers import *
  
-istty = os.isatty(1)
  par2_ok = 0
  nullf = open('/dev/null')
  
@@ -163,8 +162,8 @@ for name in extra:
      sys.stdout.flush()
      debug('fsck: checking %s (%s)\n' 
            % (last, par2_ok and par2_exists and 'par2' or 'git'))
-    if not opt.verbose and istty:
-        log('fsck (%d/%d)\r' % (count, len(extra)))
+    if not opt.verbose:
+        progress('fsck (%d/%d)\r' % (count, len(extra)))
      
      if not opt.jobs:
          nc = do_pack(base, last)
@@ -195,8 +194,8 @@ while len(outstanding):
          del outstanding[pid]
          code = code or nc
          count += 1
-    if not opt.verbose and istty:
-        log('fsck (%d/%d)\r' % (count, len(extra)))
+    if not opt.verbose:
+        progress('fsck (%d/%d)\r' % (count, len(extra)))
  
  if not opt.verbose and istty:
      log('fsck done.           \n')
diff --git a/cmd-index.py b/cmd-index.py

index d2eeaa7e3aa08a038b1c687ddb1869078cb81211..da0bda760ec7452ce99382f0e7923cc4f5aff554 100755 (executable)
--- a/cmd-index.py
+++ b/cmd-index.py
@@ -5,11 +5,15 @@ from helpers import *
  
  
  def _simplify_iter(iters):
+    total = sum([len(it) for it in iters])
      l = list([iter(it) for it in iters])
-    l = list([(next(it),it) for it in l])
      del iters
+    l = list([(next(it),it) for it in l])
      l = filter(lambda x: x[0], l)
+    count = 0
      while l:
+        if not (count % 1024):
+            progress('bup: merging indexes (%d/%d)\r' % (count, total))
          l.sort()
          (e,it) = l.pop()
          if not e:
@@ -20,10 +24,11 @@ def _simplify_iter(iters):
          n = next(it)
          if n:
              l.append((n,it))
+        count += 1
+    log('bup: merging indexes (%d/%d), done.\n' % (count, total))
  
  
  def merge_indexes(out, r1, r2):
-    log('bup: merging indexes.\n')
      for e in _simplify_iter([r1, r2]):
          #if e.flags & index.IX_EXISTS:
              out.add_ixentry(e)
@@ -81,8 +86,12 @@ def update_index(top):
  
      #log('doing: %r\n' % paths)
  
+    total = 0
      for (path,pst) in drecurse.recursive_dirlist([top], xdev=opt.xdev):
          #log('got: %r\n' % path)
+        if not (total % 128):
+            progress('Indexing: %d\r' % total)
+        total += 1
          if opt.verbose>=2 or (opt.verbose==1 and stat.S_ISDIR(pst.st_mode)):
              sys.stdout.write('%s\n' % path)
              sys.stdout.flush()
@@ -102,6 +111,7 @@ def update_index(top):
          else:  # new paths
              #log('adding: %r\n' % path)
              wi.add(path, pst, hashgen = hashgen)
+    progress('Indexing: %d, done.\n' % total)
      
      if ri.exists():
          ri.save()
diff --git a/cmd-save.py b/cmd-save.py

index 429d6460746a11f312203d54ddcf0d4bf36ce41f..a88684be22ae470136d837df878cf1b773c9805e 100755 (executable)
--- a/cmd-save.py
+++ b/cmd-save.py
@@ -18,6 +18,7 @@ t,tree     output a tree id
  c,commit   output a commit id
  n,name=    name of backup set to update (if any)
  v,verbose  increase log output (can be used more than once)
+q,quiet    don't show progress meter
  smaller=   only back up files smaller than n bytes
  """
  o = options.Options('bup save', optspec)
@@ -31,9 +32,7 @@ if not extra:
      log("bup save: no filenames given.\n")
      o.usage()
  
-if opt.verbose >= 2:
-    git.verbose = opt.verbose - 1
-    hashsplit.split_verbosely = opt.verbose - 1
+opt.progress = (istty and not opt.quiet)
  
  refname = opt.name and 'refs/heads/%s' % opt.name or None
  if opt.remote:
@@ -68,8 +67,31 @@ def _pop():
      tree = w.new_tree(shalist)
      shalists[-1].append(('40000', part, tree))
  
-
-for (transname,ent) in index.Reader(git.repo('bupindex')).filter(extra):
+def progress_report(n):
+    global count
+    count += n
+    pct = count*100.0/total
+    progress('Saving: %.2f%% (%d/%dk, %d/%d files)\r'
+             % (pct, count/1024, total/1024, fcount, ftotal))
+
+
+r = index.Reader(git.repo('bupindex'))
+
+total = ftotal = 0
+if opt.progress:
+    for (transname,ent) in r.filter(extra):
+        if not (ftotal % 10024):
+            progress('Reading index: %d\r' % ftotal)
+        exists = (ent.flags & index.IX_EXISTS)
+        hashvalid = (ent.flags & index.IX_HASHVALID) and w.exists(ent.sha)
+        if exists and not hashvalid:
+            total += ent.size
+        ftotal += 1
+    progress('Reading index: %d, done.\n' % ftotal)
+    hashsplit.progress_callback = progress_report
+
+count = fcount = 0
+for (transname,ent) in r.filter(extra):
      (dir, file) = os.path.split(ent.name)
      exists = (ent.flags & index.IX_EXISTS)
      hashvalid = (ent.flags & index.IX_HASHVALID) and w.exists(ent.sha)
@@ -83,10 +105,13 @@ for (transname,ent) in index.Reader(git.repo('bupindex')).filter(extra):
                  status = 'M'
          else:
              status = ' '
-        if opt.verbose >= 2 or (status in ['A','M'] 
-                                and not stat.S_ISDIR(ent.mode)):
-            log('\n%s %s ' % (status, ent.name))
+        if opt.verbose >= 2 or stat.S_ISDIR(ent.mode):
+            log('%s %-70s\n' % (status, ent.name))
  
+    if opt.progress:
+        progress_report(0)
+    fcount += 1
+    
      if not exists:
          continue
  
@@ -102,6 +127,7 @@ for (transname,ent) in index.Reader(git.repo('bupindex')).filter(extra):
          # directory already handled.
          # FIXME: not using the indexed tree sha1's for anything, which is
          # a waste.  That's a potential optimization...
+        count += ent.size
          continue  
  
      id = None
@@ -116,12 +142,14 @@ for (transname,ent) in index.Reader(git.repo('bupindex')).filter(extra):
              if stat.S_ISREG(ent.mode):
                  f = open(ent.name)
                  (mode, id) = hashsplit.split_to_blob_or_tree(w, [f])
-            elif stat.S_ISDIR(ent.mode):
-                assert(0)  # handled above
-            elif stat.S_ISLNK(ent.mode):
-                (mode, id) = ('120000', w.new_blob(os.readlink(ent.name)))
              else:
-                add_error(Exception('skipping special file "%s"' % ent.name))
+                if stat.S_ISDIR(ent.mode):
+                    assert(0)  # handled above
+                elif stat.S_ISLNK(ent.mode):
+                    (mode, id) = ('120000', w.new_blob(os.readlink(ent.name)))
+                else:
+                    add_error(Exception('skipping special file "%s"' % ent.name))
+                count += ent.size
          except IOError, e:
              add_error(e)
          except OSError, e:
@@ -130,6 +158,12 @@ for (transname,ent) in index.Reader(git.repo('bupindex')).filter(extra):
              ent.validate(id)
              ent.repack()
              shalists[-1].append((mode, file, id))
+
+if opt.progress:
+    pct = total and count*100.0/total or 100
+    progress('Saving: %.2f%% (%d/%dk, %d/%d files), done.\n'
+             % (pct, count/1024, total/1024, fcount, ftotal))
+
  #log('parts out: %r\n' % parts)
  #log('stk out: %r\n' % shalists)
  while len(parts) > 1:
@@ -138,8 +172,6 @@ while len(parts) > 1:
  #log('stk out: %r\n' % shalists)
  assert(len(shalists) == 1)
  tree = w.new_tree(shalists[-1])
-if opt.verbose:
-    log('\n')
  if opt.tree:
      print tree.encode('hex')
  if opt.commit or opt.name:
@@ -147,8 +179,6 @@ if opt.commit or opt.name:
      ref = opt.name and ('refs/heads/%s' % opt.name) or None
      commit = w.new_commit(oldref, tree, msg)
      if opt.commit:
-        if opt.verbose:
-            log('\n')
          print commit.encode('hex')
  
  w.close()  # must close before we can update the ref
diff --git a/git.py b/git.py

index 43191f57767b77f6dafa5638d4d8f144b4df04d6..2ac3cf6d4b7e9f86176f92ab457a715c11a41c9b 100644 (file)
--- a/git.py
+++ b/git.py
@@ -286,9 +286,9 @@ def idxmerge(idxlist):
      heapq.heapify(heap)
      count = 0
      while heap:
-        if (count % 10000) == 0:
-            log('Merging: %.2f%% (%d/%d)\r'
-                % (count*100.0/total, count, total))
+        if (count % 10024) == 0:
+            progress('Creating midx: %.2f%% (%d/%d)\r'
+                     % (count*100.0/total, count, total))
          (e, it) = heap[0]
          yield e
          count += 1
@@ -297,7 +297,7 @@ def idxmerge(idxlist):
              heapq.heapreplace(heap, (e, it))
          else:
              heapq.heappop(heap)
-    log('Merging: %.2f%% (%d/%d), done.\n' % (100, total, total))
+    log('Creating midx: %.2f%% (%d/%d), done.\n' % (100, total, total))
  
      
  class PackWriter:
diff --git a/hashsplit.py b/hashsplit.py

index 15efd08c6a533a669270e0011870149ca98df3a2..c82896f0fe39ab812fdb4a6a8af1c860a594458c 100644 (file)
--- a/hashsplit.py
+++ b/hashsplit.py
@@ -6,6 +6,7 @@ BLOB_LWM = 8192*2
  BLOB_MAX = BLOB_LWM*2
  BLOB_HWM = 1024*1024
  split_verbosely = 0
+progress_callback = None
  max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
  max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
  fanout = 4096
@@ -37,7 +38,6 @@ class Buf:
  
  
  def splitbuf(buf):
-    global split_verbosely
      b = buf.peek(buf.used())
      ofs = _hashsplit.splitbuf(b)
      if ofs:
@@ -67,7 +67,6 @@ def autofiles(filenames):
              
      
  def hashsplit_iter(w, files):
-    global split_verbosely
      ofs = 0
      buf = Buf()
      fi = blobiter(files)
@@ -120,6 +119,8 @@ def _split_to_shalist(w, files):
              bm /= 2
          last_ofs = cn
          total_split += size
+        if progress_callback:
+            progress_callback(size)
          yield ('100644', 'bup.chunk.%016x' % cn, sha)
  
  
diff --git a/helpers.py b/helpers.py

index a7a6cc6d249465c09f017556ea60c850c1891119..ac04a957ded08a90a52b34d886479a4b3e993ef8 100644 (file)
--- a/helpers.py
+++ b/helpers.py
@@ -199,7 +199,18 @@ def mmap_readwrite(f, len = 0):
      return _mmap_do(f, len, mmap.MAP_SHARED, mmap.PROT_READ|mmap.PROT_WRITE)
  
  
+# count the number of elements in an iterator (consumes the iterator)
+def count(l):
+    return reduce(lambda x,y: x+1, l)
+
+
  saved_errors = []
  def add_error(e):
      saved_errors.append(e)
-    log('\n%s\n' % e)
+    log('%-70s\n' % e)
+
+
+istty = os.isatty(2)
+def progress(s):
+    if istty:
+        log(s)
diff --git a/index.py b/index.py

index 5ba57cf3c670d2cff7fefb59eea16341e288b228..c07c7fece9495f9809f775515eaf32f34160689d 100644 (file)
--- a/index.py
+++ b/index.py
@@ -6,6 +6,8 @@ FAKE_SHA = '\x01'*20
  INDEX_HDR = 'BUPI\0\0\0\2'
  INDEX_SIG = '!IIIIIQII20sHII'
  ENTLEN = struct.calcsize(INDEX_SIG)
+FOOTER_SIG = '!Q'
+FOOTLEN = struct.calcsize(FOOTER_SIG)
  
  IX_EXISTS = 0x8000
  IX_HASHVALID = 0x4000
@@ -151,7 +153,8 @@ class ExistingEntry(Entry):
              dname += '/'
          ofs = self.children_ofs
          assert(ofs <= len(self._m))
-        for i in range(self.children_n):
+        assert(self.children_n < 1000000)
+        for i in xrange(self.children_n):
              eon = self._m.find('\0', ofs)
              assert(eon >= 0)
              assert(eon >= ofs)
@@ -177,6 +180,7 @@ class Reader:
          self.filename = filename
          self.m = ''
          self.writable = False
+        self.count = 0
          f = None
          try:
              f = open(filename, 'r+')
@@ -195,13 +199,18 @@ class Reader:
                  if st.st_size:
                      self.m = mmap_readwrite(f)
                      self.writable = True
+                    self.count = struct.unpack(FOOTER_SIG,
+                          str(buffer(self.m, st.st_size-FOOTLEN, FOOTLEN)))[0]
  
      def __del__(self):
          self.close()
  
+    def __len__(self):
+        return self.count
+
      def forward_iter(self):
          ofs = len(INDEX_HDR)
-        while ofs+ENTLEN <= len(self.m):
+        while ofs+ENTLEN <= len(self.m)-FOOTLEN:
              eon = self.m.find('\0', ofs)
              assert(eon >= 0)
              assert(eon >= ofs)
@@ -215,7 +224,7 @@ class Reader:
              dname = name
              if dname and not dname.endswith('/'):
                  dname += '/'
-            root = ExistingEntry('/', '/', self.m, len(self.m)-ENTLEN)
+            root = ExistingEntry('/', '/', self.m, len(self.m)-FOOTLEN-ENTLEN)
              for sub in root.iter(name=name):
                  yield sub
              if not dname or dname == root.name:
@@ -271,9 +280,12 @@ class Writer:
      def flush(self):
          if self.level:
              self.level = _golevel(self.level, self.f, [], None)
+            self.count = self.rootlevel.count
+            if self.count:
+                self.count += 1
+            self.f.write(struct.pack(FOOTER_SIG, self.count))
+            self.f.flush()
          assert(self.level == None)
-        self.count = self.rootlevel.count
-        self.f.flush()
  
      def close(self):
          self.flush()
author	Avery Pennarun <apenwarr@gmail.com>
	Thu, 4 Feb 2010 06:12:06 +0000 (01:12 -0500)
committer	Avery Pennarun <apenwarr@gmail.com>
	Thu, 4 Feb 2010 06:12:06 +0000 (01:12 -0500)
cmd-fsck.py	patch | blob | history
cmd-index.py	patch | blob | history
cmd-save.py	patch | blob | history
git.py	patch | blob | history
hashsplit.py	patch | blob | history
helpers.py	patch | blob | history
index.py	patch | blob | history