Migrate all xrange calls to the range provided by bup.compat

[bup.git] / lib / bup / git.py
diff --git a/lib/bup/git.py b/lib/bup/git.py

index 2fc155f6a49ed8a81ef5b5580d7c19fa231334f4..8b37e616ccb5fa19841c626c78373f13bb2d1f11 100644 (file)
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -2,18 +2,27 @@
  bup repositories are in Git format. This library allows us to
  interact with the Git data structures.
  """
  bup repositories are in Git format. This library allows us to
  interact with the Git data structures.
  """
-import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
-from collections import namedtuple
-
-from bup.helpers import *
-from bup import _helpers, path, midx, bloom, xstat
  
  
-max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
-max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
+from __future__ import absolute_import
+import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
+from collections import namedtuple
+from itertools import islice
+from numbers import Integral
+
+from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
+from bup.compat import range
+from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
+                         fdatasync,
+                         hostname, localtime, log, merge_iter,
+                         mmap_read, mmap_readwrite,
+                         parse_num,
+                         progress, qprogress, shstr, stat_if_exists,
+                         unlink, username, userfullname,
+                         utc_offset_str)
  
  verbose = 0
  ignore_midx = 0
  
  verbose = 0
  ignore_midx = 0
-repodir = None
+repodir = None  # The default repository, once initialized
  
  _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  
  _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
@@ -26,6 +35,30 @@ class GitError(Exception):
      pass
  
  
      pass
  
  
+def _git_wait(cmd, p):
+    rv = p.wait()
+    if rv != 0:
+        raise GitError('%s returned %d' % (shstr(cmd), rv))
+
+def _git_capture(argv):
+    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
+    r = p.stdout.read()
+    _git_wait(repr(argv), p)
+    return r
+
+def git_config_get(option, repo_dir=None):
+    cmd = ('git', 'config', '--get', option)
+    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                         preexec_fn=_gitenv(repo_dir=repo_dir))
+    r = p.stdout.read()
+    rc = p.wait()
+    if rc == 0:
+        return r
+    if rc != 1:
+        raise GitError('%s returned %d' % (cmd, rc))
+    return None
+
+
  def parse_tz_offset(s):
      """UTC offset in seconds."""
      tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  def parse_tz_offset(s):
      """UTC offset in seconds."""
      tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
@@ -81,14 +114,27 @@ def parse_commit(content):
  
  def get_commit_items(id, cp):
      commit_it = cp.get(id)
  
  def get_commit_items(id, cp):
      commit_it = cp.get(id)
-    assert(commit_it.next() == 'commit')
+    _, typ, _ = next(commit_it)
+    assert(typ == 'commit')
      commit_content = ''.join(commit_it)
      return parse_commit(commit_content)
  
  
      commit_content = ''.join(commit_it)
      return parse_commit(commit_content)
  
  
+def _local_git_date_str(epoch_sec):
+    return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
+
+
+def _git_date_str(epoch_sec, tz_offset_sec):
+    offs =  tz_offset_sec // 60
+    return '%d %s%02d%02d' \
+        % (epoch_sec,
+           '+' if offs >= 0 else '-',
+           abs(offs) // 60,
+           abs(offs) % 60)
+
+
  def repo(sub = '', repo_dir=None):
      """Get the path to the git repository or one of its subdirectories."""
  def repo(sub = '', repo_dir=None):
      """Get the path to the git repository or one of its subdirectories."""
-    global repodir
      repo_dir = repo_dir or repodir
      if not repo_dir:
          raise GitError('You should call check_repo_or_die()')
      repo_dir = repo_dir or repodir
      if not repo_dir:
          raise GitError('You should call check_repo_or_die()')
@@ -96,7 +142,7 @@ def repo(sub = '', repo_dir=None):
      # If there's a .git subdirectory, then the actual repo is in there.
      gd = os.path.join(repo_dir, '.git')
      if os.path.exists(gd):
      # If there's a .git subdirectory, then the actual repo is in there.
      gd = os.path.join(repo_dir, '.git')
      if os.path.exists(gd):
-        repodir = gd
+        repo_dir = gd
  
      return os.path.join(repo_dir, sub)
  
  
      return os.path.join(repo_dir, sub)
  
@@ -128,7 +174,7 @@ def auto_midx(objdir):
      args = [path.exe(), 'midx', '--auto', '--dir', objdir]
      try:
          rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
      args = [path.exe(), 'midx', '--auto', '--dir', objdir]
      try:
          rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
-    except OSError, e:
+    except OSError as e:
          # make sure 'args' gets printed to help with debugging
          add_error('%r: exception: %s' % (args, e))
          raise
          # make sure 'args' gets printed to help with debugging
          add_error('%r: exception: %s' % (args, e))
          raise
@@ -138,7 +184,7 @@ def auto_midx(objdir):
      args = [path.exe(), 'bloom', '--dir', objdir]
      try:
          rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
      args = [path.exe(), 'bloom', '--dir', objdir]
      try:
          rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
-    except OSError, e:
+    except OSError as e:
          # make sure 'args' gets printed to help with debugging
          add_error('%r: exception: %s' % (args, e))
          raise
          # make sure 'args' gets printed to help with debugging
          add_error('%r: exception: %s' % (args, e))
          raise
@@ -162,7 +208,7 @@ def mangle_name(name, mode, gitmode):
  
  
  (BUP_NORMAL, BUP_CHUNKED) = (0,1)
  
  
  (BUP_NORMAL, BUP_CHUNKED) = (0,1)
-def demangle_name(name):
+def demangle_name(name, mode):
      """Remove name mangling from a file name, if necessary.
  
      The return value is a tuple (demangled_filename,mode), where mode is one of
      """Remove name mangling from a file name, if necessary.
  
      The return value is a tuple (demangled_filename,mode), where mode is one of
@@ -177,6 +223,9 @@ def demangle_name(name):
          return (name[:-5], BUP_NORMAL)
      elif name.endswith('.bup'):
          return (name[:-4], BUP_CHUNKED)
          return (name[:-5], BUP_NORMAL)
      elif name.endswith('.bup'):
          return (name[:-4], BUP_CHUNKED)
+    elif name.endswith('.bupm'):
+        return (name[:-5],
+                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
      else:
          return (name, BUP_NORMAL)
  
      else:
          return (name, BUP_NORMAL)
  
@@ -228,6 +277,8 @@ def tree_decode(buf):
  
  
  def _encode_packobj(type, content, compression_level=1):
  
  
  def _encode_packobj(type, content, compression_level=1):
+    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
+        raise ValueError('invalid compression level %s' % compression_level)
      szout = ''
      sz = len(content)
      szbits = (sz & 0x0f) | (_typemap[type]<<4)
      szout = ''
      sz = len(content)
      szbits = (sz & 0x0f) | (_typemap[type]<<4)
@@ -239,10 +290,6 @@ def _encode_packobj(type, content, compression_level=1):
              break
          szbits = sz & 0x7f
          sz >>= 7
              break
          szbits = sz & 0x7f
          sz >>= 7
-    if compression_level > 9:
-        compression_level = 9
-    elif compression_level < 0:
-        compression_level = 0
      z = zlib.compressobj(compression_level)
      yield szout
      yield z.compress(content)
      z = zlib.compressobj(compression_level)
      yield szout
      yield z.compress(content)
@@ -349,7 +396,7 @@ class PackIdxV1(PackIdx):
          return str(self.shatable[idx*24+4 : idx*24+24])
  
      def __iter__(self):
          return str(self.shatable[idx*24+4 : idx*24+24])
  
      def __iter__(self):
-        for i in xrange(self.fanout[255]):
+        for i in range(self.fanout[255]):
              yield buffer(self.map, 256*4 + 24*i + 4, 20)
  
  
              yield buffer(self.map, 256*4 + 24*i + 4, 20)
  
  
@@ -384,7 +431,7 @@ class PackIdxV2(PackIdx):
          return str(self.shatable[idx*20:(idx+1)*20])
  
      def __iter__(self):
          return str(self.shatable[idx*20:(idx+1)*20])
  
      def __iter__(self):
-        for i in xrange(self.fanout[255]):
+        for i in range(self.fanout[255]):
              yield buffer(self.map, 8 + 256*4 + 20*i, 20)
  
  
              yield buffer(self.map, 8 + 256*4 + 20*i, 20)
  
  
@@ -498,7 +545,7 @@ class PackIdxList:
                  if not d.get(full):
                      try:
                          ix = open_idx(full)
                  if not d.get(full):
                      try:
                          ix = open_idx(full)
-                    except GitError, e:
+                    except GitError as e:
                          add_error(e)
                          continue
                      d[full] = ix
                          add_error(e)
                          continue
                      d[full] = ix
@@ -506,7 +553,7 @@ class PackIdxList:
              if self.bloom is None and os.path.exists(bfull):
                  self.bloom = bloom.ShaBloom(bfull)
              self.packs = list(set(d.values()))
              if self.bloom is None and os.path.exists(bfull):
                  self.bloom = bloom.ShaBloom(bfull)
              self.packs = list(set(d.values()))
-            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
+            self.packs.sort(reverse=True, key=lambda x: len(x))
              if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                  self.do_bloom = True
              else:
              if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                  self.do_bloom = True
              else:
@@ -555,25 +602,64 @@ def idxmerge(idxlist, final_progress=True):
  def _make_objcache():
      return PackIdxList(repo('objects/pack'))
  
  def _make_objcache():
      return PackIdxList(repo('objects/pack'))
  
+# bup-gc assumes that it can disable all PackWriter activities
+# (bloom/midx/cache) via the constructor and close() arguments.
+
  class PackWriter:
      """Writes Git objects inside a pack file."""
  class PackWriter:
      """Writes Git objects inside a pack file."""
-    def __init__(self, objcache_maker=_make_objcache, compression_level=1):
+    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
+                 run_midx=True, on_pack_finish=None,
+                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
+        self.repo_dir = repo_dir or repo()
+        self.file = None
+        self.parentfd = None
          self.count = 0
          self.outbytes = 0
          self.filename = None
          self.count = 0
          self.outbytes = 0
          self.filename = None
-        self.file = None
          self.idx = None
          self.objcache_maker = objcache_maker
          self.objcache = None
          self.compression_level = compression_level
          self.idx = None
          self.objcache_maker = objcache_maker
          self.objcache = None
          self.compression_level = compression_level
+        self.run_midx=run_midx
+        self.on_pack_finish = on_pack_finish
+        if not max_pack_size:
+            max_pack_size = git_config_get('pack.packSizeLimit',
+                                           repo_dir=self.repo_dir)
+            if max_pack_size is not None:
+                max_pack_size = parse_num(max_pack_size)
+            if not max_pack_size:
+                # larger packs slow down pruning
+                max_pack_size = 1000 * 1000 * 1000
+        self.max_pack_size = max_pack_size
+        # cache memory usage is about 83 bytes per object
+        self.max_pack_objects = max_pack_objects if max_pack_objects \
+                                else max(1, self.max_pack_size // 5000)
  
      def __del__(self):
          self.close()
  
  
      def __del__(self):
          self.close()
  
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.close()
+
      def _open(self):
          if not self.file:
      def _open(self):
          if not self.file:
-            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
-            self.file = os.fdopen(fd, 'w+b')
+            objdir = dir = os.path.join(self.repo_dir, 'objects')
+            fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
+            try:
+                self.file = os.fdopen(fd, 'w+b')
+            except:
+                os.close(fd)
+                raise
+            try:
+                self.parentfd = os.open(objdir, os.O_RDONLY)
+            except:
+                f = self.file
+                self.file = None
+                f.close()
+                raise
              assert(name.endswith('.pack'))
              self.filename = name[:-5]
              self.file.write('PACK\0\0\0\2\0\0\0\0')
              assert(name.endswith('.pack'))
              self.filename = name[:-5]
              self.file.write('PACK\0\0\0\2\0\0\0\0')
@@ -590,7 +676,7 @@ class PackWriter:
          oneblob = ''.join(datalist)
          try:
              f.write(oneblob)
          oneblob = ''.join(datalist)
          try:
              f.write(oneblob)
-        except IOError, e:
+        except IOError as e:
              raise GitError, e, sys.exc_info()[2]
          nw = len(oneblob)
          crc = zlib.crc32(oneblob) & 0xffffffff
              raise GitError, e, sys.exc_info()[2]
          nw = len(oneblob)
          crc = zlib.crc32(oneblob) & 0xffffffff
@@ -612,13 +698,14 @@ class PackWriter:
          size, crc = self._raw_write(_encode_packobj(type, content,
                                                      self.compression_level),
                                      sha=sha)
          size, crc = self._raw_write(_encode_packobj(type, content,
                                                      self.compression_level),
                                      sha=sha)
-        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
+        if self.outbytes >= self.max_pack_size \
+           or self.count >= self.max_pack_objects:
              self.breakpoint()
          return sha
  
      def breakpoint(self):
          """Clear byte and object counts and return the last processed id."""
              self.breakpoint()
          return sha
  
      def breakpoint(self):
          """Clear byte and object counts and return the last processed id."""
-        id = self._end()
+        id = self._end(self.run_midx)
          self.outbytes = self.count = 0
          return id
  
          self.outbytes = self.count = 0
          return id
  
@@ -634,11 +721,16 @@ class PackWriter:
          self._require_objcache()
          return self.objcache.exists(id, want_source=want_source)
  
          self._require_objcache()
          return self.objcache.exists(id, want_source=want_source)
  
+    def just_write(self, sha, type, content):
+        """Write an object to the pack file, bypassing the objcache.  Fails if
+        sha exists()."""
+        self._write(sha, type, content)
+
      def maybe_write(self, type, content):
          """Write an object to the pack file if not present and return its id."""
          sha = calc_hash(type, content)
          if not self.exists(sha):
      def maybe_write(self, type, content):
          """Write an object to the pack file if not present and return its id."""
          sha = calc_hash(type, content)
          if not self.exists(sha):
-            self._write(sha, type, content)
+            self.just_write(sha, type, content)
              self._require_objcache()
              self.objcache.add(sha)
          return sha
              self._require_objcache()
              self.objcache.add(sha)
          return sha
@@ -652,66 +744,90 @@ class PackWriter:
          content = tree_encode(shalist)
          return self.maybe_write('tree', content)
  
          content = tree_encode(shalist)
          return self.maybe_write('tree', content)
  
-    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
+    def new_commit(self, tree, parent,
+                   author, adate_sec, adate_tz,
+                   committer, cdate_sec, cdate_tz,
+                   msg):
+        """Create a commit object in the pack.  The date_sec values must be
+        epoch-seconds, and if a tz is None, the local timezone is assumed."""
+        if adate_tz:
+            adate_str = _git_date_str(adate_sec, adate_tz)
+        else:
+            adate_str = _local_git_date_str(adate_sec)
+        if cdate_tz:
+            cdate_str = _git_date_str(cdate_sec, cdate_tz)
+        else:
+            cdate_str = _local_git_date_str(cdate_sec)
          l = []
          if tree: l.append('tree %s' % tree.encode('hex'))
          if parent: l.append('parent %s' % parent.encode('hex'))
          l = []
          if tree: l.append('tree %s' % tree.encode('hex'))
          if parent: l.append('parent %s' % parent.encode('hex'))
-        if author: l.append('author %s %s' % (author, _git_date(adate)))
-        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
+        if author: l.append('author %s %s' % (author, adate_str))
+        if committer: l.append('committer %s %s' % (committer, cdate_str))
          l.append('')
          l.append(msg)
          return self.maybe_write('commit', '\n'.join(l))
  
          l.append('')
          l.append(msg)
          return self.maybe_write('commit', '\n'.join(l))
  
-    def new_commit(self, parent, tree, date, msg):
-        """Create a commit object in the pack."""
-        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
-        commit = self._new_commit(tree, parent,
-                                  userline, date, userline, date,
-                                  msg)
-        return commit
-
      def abort(self):
          """Remove the pack file from disk."""
          f = self.file
          if f:
      def abort(self):
          """Remove the pack file from disk."""
          f = self.file
          if f:
-            self.idx = None
+            pfd = self.parentfd
              self.file = None
              self.file = None
-            f.close()
-            os.unlink(self.filename + '.pack')
+            self.parentfd = None
+            self.idx = None
+            try:
+                try:
+                    os.unlink(self.filename + '.pack')
+                finally:
+                    f.close()
+            finally:
+                if pfd is not None:
+                    os.close(pfd)
  
      def _end(self, run_midx=True):
          f = self.file
          if not f: return None
          self.file = None
  
      def _end(self, run_midx=True):
          f = self.file
          if not f: return None
          self.file = None
-        self.objcache = None
-        idx = self.idx
-        self.idx = None
+        try:
+            self.objcache = None
+            idx = self.idx
+            self.idx = None
  
  
-        # update object count
-        f.seek(8)
-        cp = struct.pack('!i', self.count)
-        assert(len(cp) == 4)
-        f.write(cp)
-
-        # calculate the pack sha1sum
-        f.seek(0)
-        sum = Sha1()
-        for b in chunkyreader(f):
-            sum.update(b)
-        packbin = sum.digest()
-        f.write(packbin)
-        f.close()
+            # update object count
+            f.seek(8)
+            cp = struct.pack('!i', self.count)
+            assert(len(cp) == 4)
+            f.write(cp)
+
+            # calculate the pack sha1sum
+            f.seek(0)
+            sum = Sha1()
+            for b in chunkyreader(f):
+                sum.update(b)
+            packbin = sum.digest()
+            f.write(packbin)
+            fdatasync(f.fileno())
+        finally:
+            f.close()
  
          obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
  
          obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
-
-        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
+        nameprefix = os.path.join(self.repo_dir,
+                                  'objects/pack/pack-' +  obj_list_sha)
          if os.path.exists(self.filename + '.map'):
              os.unlink(self.filename + '.map')
          os.rename(self.filename + '.pack', nameprefix + '.pack')
          os.rename(self.filename + '.idx', nameprefix + '.idx')
          if os.path.exists(self.filename + '.map'):
              os.unlink(self.filename + '.map')
          os.rename(self.filename + '.pack', nameprefix + '.pack')
          os.rename(self.filename + '.idx', nameprefix + '.idx')
+        try:
+            os.fsync(self.parentfd)
+        finally:
+            os.close(self.parentfd)
  
          if run_midx:
  
          if run_midx:
-            auto_midx(repo('objects/pack'))
+            auto_midx(os.path.join(self.repo_dir, 'objects/pack'))
+
+        if self.on_pack_finish:
+            self.on_pack_finish(nameprefix)
+
          return nameprefix
  
      def close(self, run_midx=True):
          return nameprefix
  
      def close(self, run_midx=True):
@@ -731,11 +847,15 @@ class PackWriter:
          idx_f = open(filename, 'w+b')
          try:
              idx_f.truncate(index_len)
          idx_f = open(filename, 'w+b')
          try:
              idx_f.truncate(index_len)
+            fdatasync(idx_f.fileno())
              idx_map = mmap_readwrite(idx_f, close=False)
              idx_map = mmap_readwrite(idx_f, close=False)
-            count = _helpers.write_idx(filename, idx_map, idx, self.count)
-            assert(count == self.count)
+            try:
+                count = _helpers.write_idx(filename, idx_map, idx, self.count)
+                assert(count == self.count)
+                idx_map.flush()
+            finally:
+                idx_map.close()
          finally:
          finally:
-            if idx_map: idx_map.close()
              idx_f.close()
  
          idx_f = open(filename, 'a+b')
              idx_f.close()
  
          idx_f = open(filename, 'a+b')
@@ -755,15 +875,12 @@ class PackWriter:
              for b in chunkyreader(idx_f):
                  idx_sum.update(b)
              idx_f.write(idx_sum.digest())
              for b in chunkyreader(idx_f):
                  idx_sum.update(b)
              idx_f.write(idx_sum.digest())
+            fdatasync(idx_f.fileno())
              return namebase
          finally:
              idx_f.close()
  
  
              return namebase
          finally:
              idx_f.close()
  
  
-def _git_date(date):
-    return '%d %s' % (date, utc_offset_str(date))
-
-
  def _gitenv(repo_dir = None):
      if not repo_dir:
          repo_dir = repo()
  def _gitenv(repo_dir = None):
      if not repo_dir:
          repo_dir = repo()
@@ -772,13 +889,23 @@ def _gitenv(repo_dir = None):
      return env
  
  
      return env
  
  
-def list_refs(refname = None, repo_dir = None):
-    """Generate a list of tuples in the form (refname,hash).
-    If a ref name is specified, list only this particular ref.
+def list_refs(patterns=None, repo_dir=None,
+              limit_to_heads=False, limit_to_tags=False):
+    """Yield (refname, hash) tuples for all repository refs unless
+    patterns are specified.  In that case, only include tuples for
+    refs matching those patterns (cf. git-show-ref(1)).  The limits
+    restrict the result items to refs/heads or refs/tags.  If both
+    limits are specified, items from both sources will be included.
+
      """
      """
-    argv = ['git', 'show-ref', '--']
-    if refname:
-        argv += [refname]
+    argv = ['git', 'show-ref']
+    if limit_to_heads:
+        argv.append('--heads')
+    if limit_to_tags:
+        argv.append('--tags')
+    argv.append('--')
+    if patterns:
+        argv.extend(patterns)
      p = subprocess.Popen(argv,
                           preexec_fn = _gitenv(repo_dir),
                           stdout = subprocess.PIPE)
      p = subprocess.Popen(argv,
                           preexec_fn = _gitenv(repo_dir),
                           stdout = subprocess.PIPE)
@@ -794,7 +921,8 @@ def list_refs(refname = None, repo_dir = None):
  
  def read_ref(refname, repo_dir = None):
      """Get the commit id of the most recent commit made on a given ref."""
  
  def read_ref(refname, repo_dir = None):
      """Get the commit id of the most recent commit made on a given ref."""
-    l = list(list_refs(refname, repo_dir))
+    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
+    l = tuple(islice(refs, 2))
      if l:
          assert(len(l) == 1)
          return l[0][1]
      if l:
          assert(len(l) == 1)
          return l[0][1]
@@ -802,32 +930,52 @@ def read_ref(refname, repo_dir = None):
          return None
  
  
          return None
  
  
-def rev_list(ref, count=None, repo_dir=None):
-    """Generate a list of reachable commits in reverse chronological order.
+def rev_list_invocation(ref_or_refs, count=None, format=None):
+    if isinstance(ref_or_refs, compat.str_type):
+        refs = (ref_or_refs,)
+    else:
+        refs = ref_or_refs
+    argv = ['git', 'rev-list']
+    if isinstance(count, Integral):
+        argv.extend(['-n', str(count)])
+    elif count:
+        raise ValueError('unexpected count argument %r' % count)
+
+    if format:
+        argv.append('--pretty=format:' + format)
+    for ref in refs:
+        assert not ref.startswith('-')
+        argv.append(ref)
+    argv.append('--')
+    return argv
+
  
  
-    This generator walks through commits, from child to parent, that are
-    reachable via the specified ref and yields a series of tuples of the form
-    (date,hash).
+def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
+    """Yield information about commits as per "git rev-list".  If a format
+    is not provided, yield one hex hash at a time.  If a format is
+    provided, pass it to rev-list and call parse(git_stdout) for each
+    commit with the stream positioned just after the rev-list "commit
+    HASH" header line.  When a format is provided yield (oidx,
+    parse(git_stdout)) for each commit.
  
  
-    If count is a non-zero integer, limit the number of commits to "count"
-    objects.
      """
      """
-    assert(not ref.startswith('-'))
-    opts = []
-    if count:
-        opts += ['-n', str(atoi(count))]
-    argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
-    p = subprocess.Popen(argv,
+    assert bool(parse) == bool(format)
+    p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
+                                             format=format),
                           preexec_fn = _gitenv(repo_dir),
                           stdout = subprocess.PIPE)
                           preexec_fn = _gitenv(repo_dir),
                           stdout = subprocess.PIPE)
-    commit = None
-    for row in p.stdout:
-        s = row.strip()
-        if s.startswith('commit '):
-            commit = s[7:].decode('hex')
-        else:
-            date = int(s)
-            yield (date, commit)
+    if not format:
+        for line in p.stdout:
+            yield line.strip()
+    else:
+        line = p.stdout.readline()
+        while line:
+            s = line.strip()
+            if not s.startswith('commit '):
+                raise Exception('unexpected line ' + s)
+            yield s[7:], parse(p.stdout)
+            line = p.stdout.readline()
+
      rv = p.wait()  # not fatal
      if rv:
          raise GitError, 'git rev-list returned error %d' % rv
      rv = p.wait()  # not fatal
      if rv:
          raise GitError, 'git rev-list returned error %d' % rv
@@ -883,10 +1031,11 @@ def update_ref(refname, newval, oldval, repo_dir=None):
      _git_wait('git update-ref', p)
  
  
      _git_wait('git update-ref', p)
  
  
-def delete_ref(refname):
-    """Delete a repository reference."""
+def delete_ref(refname, oldvalue=None):
+    """Delete a repository reference (see git update-ref(1))."""
      assert(refname.startswith('refs/'))
      assert(refname.startswith('refs/'))
-    p = subprocess.Popen(['git', 'update-ref', '-d', refname],
+    oldvalue = [] if not oldvalue else [oldvalue]
+    p = subprocess.Popen(['git', 'update-ref', '-d', refname] + oldvalue,
                           preexec_fn = _gitenv())
      _git_wait('git update-ref', p)
  
                           preexec_fn = _gitenv())
      _git_wait('git update-ref', p)
  
@@ -931,21 +1080,20 @@ def init_repo(path=None):
  
  
  def check_repo_or_die(path=None):
  
  
  def check_repo_or_die(path=None):
-    """Make sure a bup repository exists, and abort if not.
-    If the path to a particular repository was not specified, this function
-    initializes the default repository automatically.
-    """
+    """Check to see if a bup repository probably exists, and abort if not."""
      guess_repo(path)
      guess_repo(path)
-    try:
-        os.stat(repo('objects/pack/.'))
-    except OSError, e:
-        if e.errno == errno.ENOENT:
-            log('error: %r is not a bup repository; run "bup init"\n'
-                % repo())
+    top = repo()
+    pst = stat_if_exists(top + '/objects/pack')
+    if pst and stat.S_ISDIR(pst.st_mode):
+        return
+    if not pst:
+        top_st = stat_if_exists(top)
+        if not top_st:
+            log('error: repository %r does not exist (see "bup help init")\n'
+                % top)
              sys.exit(15)
              sys.exit(15)
-        else:
-            log('error: %s\n' % e)
-            sys.exit(14)
+    log('error: %r is not a repository\n' % top)
+    sys.exit(14)
  
  
  _ver = None
  
  
  _ver = None
@@ -975,19 +1123,6 @@ def ver():
      return _ver
  
  
      return _ver
  
  
-def _git_wait(cmd, p):
-    rv = p.wait()
-    if rv != 0:
-        raise GitError('%s returned %d' % (cmd, rv))
-
-
-def _git_capture(argv):
-    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
-    r = p.stdout.read()
-    _git_wait(repr(argv), p)
-    return r
-
-
  class _AbortableIter:
      def __init__(self, it, onabort = None):
          self.it = it
  class _AbortableIter:
      def __init__(self, it, onabort = None):
          self.it = it
@@ -999,8 +1134,8 @@ class _AbortableIter:
  
      def next(self):
          try:
  
      def next(self):
          try:
-            return self.it.next()
-        except StopIteration, e:
+            return next(self.it)
+        except StopIteration as e:
              self.done = True
              raise
          except:
              self.done = True
              raise
          except:
@@ -1026,14 +1161,9 @@ class CatPipe:
          self.repo_dir = repo_dir
          wanted = ('1','5','6')
          if ver() < wanted:
          self.repo_dir = repo_dir
          wanted = ('1','5','6')
          if ver() < wanted:
-            if not _ver_warned:
-                log('warning: git version < %s; bup will be slow.\n'
-                    % '.'.join(wanted))
-                _ver_warned = 1
-            self.get = self._slow_get
-        else:
-            self.p = self.inprogress = None
-            self.get = self._fast_get
+            log('error: git version must be at least 1.5.6\n')
+            sys.exit(1)
+        self.p = self.inprogress = None
  
      def _abort(self):
          if self.p:
  
      def _abort(self):
          if self.p:
@@ -1042,7 +1172,7 @@ class CatPipe:
          self.p = None
          self.inprogress = None
  
          self.p = None
          self.inprogress = None
  
-    def _restart(self):
+    def restart(self):
          self._abort()
          self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                    stdin=subprocess.PIPE,
          self._abort()
          self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                    stdin=subprocess.PIPE,
@@ -1051,76 +1181,66 @@ class CatPipe:
                                    bufsize = 4096,
                                    preexec_fn = _gitenv(self.repo_dir))
  
                                    bufsize = 4096,
                                    preexec_fn = _gitenv(self.repo_dir))
  
-    def _fast_get(self, id):
+    def get(self, ref):
+        """Yield (oidx, type, size), followed by the data referred to by ref.
+        If ref does not exist, only yield (None, None, None).
+
+        """
          if not self.p or self.p.poll() != None:
          if not self.p or self.p.poll() != None:
-            self._restart()
+            self.restart()
          assert(self.p)
          poll_result = self.p.poll()
          assert(poll_result == None)
          if self.inprogress:
          assert(self.p)
          poll_result = self.p.poll()
          assert(poll_result == None)
          if self.inprogress:
-            log('_fast_get: opening %r while %r is open\n'
-                % (id, self.inprogress))
+            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
          assert(not self.inprogress)
          assert(not self.inprogress)
-        assert(id.find('\n') < 0)
-        assert(id.find('\r') < 0)
-        assert(not id.startswith('-'))
-        self.inprogress = id
-        self.p.stdin.write('%s\n' % id)
+        assert(ref.find('\n') < 0)
+        assert(ref.find('\r') < 0)
+        assert(not ref.startswith('-'))
+        self.inprogress = ref
+        self.p.stdin.write('%s\n' % ref)
          self.p.stdin.flush()
          hdr = self.p.stdout.readline()
          if hdr.endswith(' missing\n'):
              self.inprogress = None
          self.p.stdin.flush()
          hdr = self.p.stdout.readline()
          if hdr.endswith(' missing\n'):
              self.inprogress = None
-            raise KeyError('blob %r is missing' % id)
-        spl = hdr.split(' ')
-        if len(spl) != 3 or len(spl[0]) != 40:
-            raise GitError('expected blob, got %r' % spl)
-        (hex, type, size) = spl
-
-        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
-                           onabort = self._abort)
+            yield None, None, None
+            return
+        info = hdr.split(' ')
+        if len(info) != 3 or len(info[0]) != 40:
+            raise GitError('expected object (id, type, size), got %r' % info)
+        oidx, typ, size = info
+        size = int(size)
+        it = _AbortableIter(chunkyreader(self.p.stdout, size),
+                            onabort=self._abort)
          try:
          try:
-            yield type
+            yield oidx, typ, size
              for blob in it:
                  yield blob
              readline_result = self.p.stdout.readline()
              assert(readline_result == '\n')
              self.inprogress = None
              for blob in it:
                  yield blob
              readline_result = self.p.stdout.readline()
              assert(readline_result == '\n')
              self.inprogress = None
-        except Exception, e:
+        except Exception as e:
              it.abort()
              raise
  
              it.abort()
              raise
  
-    def _slow_get(self, id):
-        assert(id.find('\n') < 0)
-        assert(id.find('\r') < 0)
-        assert(id[0] != '-')
-        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
-        yield type
-
-        p = subprocess.Popen(['git', 'cat-file', type, id],
-                             stdout=subprocess.PIPE,
-                             preexec_fn = _gitenv(self.repo_dir))
-        for blob in chunkyreader(p.stdout):
-            yield blob
-        _git_wait('git cat-file', p)
-
      def _join(self, it):
      def _join(self, it):
-        type = it.next()
-        if type == 'blob':
+        _, typ, _ = next(it)
+        if typ == 'blob':
              for blob in it:
                  yield blob
              for blob in it:
                  yield blob
-        elif type == 'tree':
+        elif typ == 'tree':
              treefile = ''.join(it)
              for (mode, name, sha) in tree_decode(treefile):
                  for blob in self.join(sha.encode('hex')):
                      yield blob
              treefile = ''.join(it)
              for (mode, name, sha) in tree_decode(treefile):
                  for blob in self.join(sha.encode('hex')):
                      yield blob
-        elif type == 'commit':
+        elif typ == 'commit':
              treeline = ''.join(it).split('\n')[0]
              assert(treeline.startswith('tree '))
              for blob in self.join(treeline[5:]):
                  yield blob
          else:
              raise GitError('invalid object type %r: expected blob/tree/commit'
              treeline = ''.join(it).split('\n')[0]
              assert(treeline.startswith('tree '))
              for blob in self.join(treeline[5:]):
                  yield blob
          else:
              raise GitError('invalid object type %r: expected blob/tree/commit'
-                           % type)
+                           % typ)
  
      def join(self, id):
          """Generate a list of the content of all blobs that can be reached
  
      def join(self, id):
          """Generate a list of the content of all blobs that can be reached
@@ -1139,9 +1259,9 @@ _cp = {}
  
  def cp(repo_dir=None):
      """Create a CatPipe object or reuse the already existing one."""
  
  def cp(repo_dir=None):
      """Create a CatPipe object or reuse the already existing one."""
-    global _cp
+    global _cp, repodir
      if not repo_dir:
      if not repo_dir:
-        repo_dir = repo()
+        repo_dir = repodir or repo()
      repo_dir = os.path.abspath(repo_dir)
      cp = _cp.get(repo_dir)
      if not cp:
      repo_dir = os.path.abspath(repo_dir)
      cp = _cp.get(repo_dir)
      if not cp:
@@ -1153,11 +1273,100 @@ def cp(repo_dir=None):
  def tags(repo_dir = None):
      """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
      tags = {}
  def tags(repo_dir = None):
      """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
      tags = {}
-    for (n,c) in list_refs(repo_dir = repo_dir):
-        if n.startswith('refs/tags/'):
-            name = n[10:]
-            if not c in tags:
-                tags[c] = []
-
-            tags[c].append(name)  # more than one tag can point at 'c'
+    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
+        assert(n.startswith('refs/tags/'))
+        name = n[10:]
+        if not c in tags:
+            tags[c] = []
+        tags[c].append(name)  # more than one tag can point at 'c'
      return tags
      return tags
+
+
+class MissingObject(KeyError):
+    def __init__(self, oid):
+        self.oid = oid
+        KeyError.__init__(self, 'object %r is missing' % oid.encode('hex'))
+
+
+WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
+                                   'path', 'chunk_path', 'data'])
+# The path is the mangled path, and if an item represents a fragment
+# of a chunked file, the chunk_path will be the chunked subtree path
+# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
+# chunked file will have a chunk_path of [''].  So some chunk subtree
+# of the file '/foo/bar/baz' might look like this:
+#
+#   item.path = ['foo', 'bar', 'baz.bup']
+#   item.chunk_path = ['', '2d3115e', '016b097']
+#   item.type = 'tree'
+#   ...
+
+
+def walk_object(cat_pipe, oidx,
+                stop_at=None,
+                include_data=None):
+    """Yield everything reachable from oidx via cat_pipe as a WalkItem,
+    stopping whenever stop_at(oidx) returns true.  Throw MissingObject
+    if a hash encountered is missing from the repository, and don't
+    read or return blob content in the data field unless include_data
+    is set.
+    """
+    # Maintain the pending stack on the heap to avoid stack overflow
+    pending = [(oidx, [], [], None)]
+    while len(pending):
+        oidx, parent_path, chunk_path, mode = pending.pop()
+        oid = oidx.decode('hex')
+        if stop_at and stop_at(oidx):
+            continue
+
+        if (not include_data) and mode and stat.S_ISREG(mode):
+            # If the object is a "regular file", then it's a leaf in
+            # the graph, so we can skip reading the data if the caller
+            # hasn't requested it.
+            yield WalkItem(oid=oid, type='blob',
+                           chunk_path=chunk_path, path=parent_path,
+                           mode=mode,
+                           data=None)
+            continue
+
+        item_it = cat_pipe.get(oidx)
+        get_oidx, typ, _ = next(item_it)
+        if not get_oidx:
+            raise MissingObject(oidx.decode('hex'))
+        if typ not in ('blob', 'commit', 'tree'):
+            raise Exception('unexpected repository object type %r' % typ)
+
+        # FIXME: set the mode based on the type when the mode is None
+        if typ == 'blob' and not include_data:
+            # Dump data until we can ask cat_pipe not to fetch it
+            for ignored in item_it:
+                pass
+            data = None
+        else:
+            data = ''.join(item_it)
+
+        yield WalkItem(oid=oid, type=typ,
+                       chunk_path=chunk_path, path=parent_path,
+                       mode=mode,
+                       data=(data if include_data else None))
+
+        if typ == 'commit':
+            commit_items = parse_commit(data)
+            for pid in commit_items.parents:
+                pending.append((pid, parent_path, chunk_path, mode))
+            pending.append((commit_items.tree, parent_path, chunk_path,
+                            hashsplit.GIT_MODE_TREE))
+        elif typ == 'tree':
+            for mode, name, ent_id in tree_decode(data):
+                demangled, bup_type = demangle_name(name, mode)
+                if chunk_path:
+                    sub_path = parent_path
+                    sub_chunk_path = chunk_path + [name]
+                else:
+                    sub_path = parent_path + [name]
+                    if bup_type == BUP_CHUNKED:
+                        sub_chunk_path = ['']
+                    else:
+                        sub_chunk_path = chunk_path
+                pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,
+                                mode))