git.cp(): don't repeatedly recompute default repo

[bup.git] / lib / bup / git.py
diff --git a/lib/bup/git.py b/lib/bup/git.py

index fc48e51ee8eee4122aa3ae175bc433827de188e5..31ae34216e6dde4b6a6c375d4f02db883ce6d7b1 100644 (file)
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -2,16 +2,27 @@
  bup repositories are in Git format. This library allows us to
  interact with the Git data structures.
  """
  bup repositories are in Git format. This library allows us to
  interact with the Git data structures.
  """
-import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, heapq
-from bup.helpers import *
-from bup import _helpers, path
  
  
-MIDX_VERSION = 2
+import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
+from collections import namedtuple
+from itertools import islice
+
+from bup import _helpers, hashsplit, path, midx, bloom, xstat
+from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
+                         fdatasync,
+                         hostname, localtime, log, merge_iter,
+                         mmap_read, mmap_readwrite,
+                         progress, qprogress, stat_if_exists,
+                         unlink, username, userfullname,
+                         utc_offset_str)
+
+
+max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
+max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
  
  verbose = 0
  ignore_midx = 0
  
  verbose = 0
  ignore_midx = 0
-home_repodir = os.path.expanduser('~/.bup')
-repodir = None
+repodir = None  # The default repository, once initialized
  
  _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  
  _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
@@ -24,25 +35,132 @@ class GitError(Exception):
      pass
  
  
      pass
  
  
-def repo(sub = ''):
+def parse_tz_offset(s):
+    """UTC offset in seconds."""
+    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
+    if s[0] == '-':
+        return - tz_off
+    return tz_off
+
+
+# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
+# Make sure that's authoritative.
+_start_end_char = r'[^ .,:;<>"\'\0\n]'
+_content_char = r'[^\0\n<>]'
+_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
+    % (_start_end_char,
+       _start_end_char, _content_char, _start_end_char)
+_tz_rx = r'[-+]\d\d[0-5]\d'
+_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
+_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
+(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
+committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)
+
+(?P<message>(?:.|\n)*)''' % (_parent_rx,
+                             _safe_str_rx, _safe_str_rx, _tz_rx,
+                             _safe_str_rx, _safe_str_rx, _tz_rx))
+_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
+
+
+# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
+CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
+                                       'author_name', 'author_mail',
+                                       'author_sec', 'author_offset',
+                                       'committer_name', 'committer_mail',
+                                       'committer_sec', 'committer_offset',
+                                       'message'])
+
+def parse_commit(content):
+    commit_match = re.match(_commit_rx, content)
+    if not commit_match:
+        raise Exception('cannot parse commit %r' % content)
+    matches = commit_match.groupdict()
+    return CommitInfo(tree=matches['tree'],
+                      parents=re.findall(_parent_hash_rx, matches['parents']),
+                      author_name=matches['author_name'],
+                      author_mail=matches['author_mail'],
+                      author_sec=int(matches['asec']),
+                      author_offset=parse_tz_offset(matches['atz']),
+                      committer_name=matches['committer_name'],
+                      committer_mail=matches['committer_mail'],
+                      committer_sec=int(matches['csec']),
+                      committer_offset=parse_tz_offset(matches['ctz']),
+                      message=matches['message'])
+
+
+def get_commit_items(id, cp):
+    commit_it = cp.get(id)
+    assert(commit_it.next() == 'commit')
+    commit_content = ''.join(commit_it)
+    return parse_commit(commit_content)
+
+
+def _local_git_date_str(epoch_sec):
+    return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
+
+
+def _git_date_str(epoch_sec, tz_offset_sec):
+    offs =  tz_offset_sec // 60
+    return '%d %s%02d%02d' \
+        % (epoch_sec,
+           '+' if offs >= 0 else '-',
+           abs(offs) // 60,
+           abs(offs) % 60)
+
+
+def repo(sub = '', repo_dir=None):
      """Get the path to the git repository or one of its subdirectories."""
      global repodir
      """Get the path to the git repository or one of its subdirectories."""
      global repodir
-    if not repodir:
+    repo_dir = repo_dir or repodir
+    if not repo_dir:
          raise GitError('You should call check_repo_or_die()')
  
      # If there's a .git subdirectory, then the actual repo is in there.
          raise GitError('You should call check_repo_or_die()')
  
      # If there's a .git subdirectory, then the actual repo is in there.
-    gd = os.path.join(repodir, '.git')
+    gd = os.path.join(repo_dir, '.git')
      if os.path.exists(gd):
          repodir = gd
  
      if os.path.exists(gd):
          repodir = gd
  
-    return os.path.join(repodir, sub)
+    return os.path.join(repo_dir, sub)
+
+
+def shorten_hash(s):
+    return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
+                  r'\1\2*\3', s)
+
+
+def repo_rel(path):
+    full = os.path.abspath(path)
+    fullrepo = os.path.abspath(repo(''))
+    if not fullrepo.endswith('/'):
+        fullrepo += '/'
+    if full.startswith(fullrepo):
+        path = full[len(fullrepo):]
+    if path.startswith('index-cache/'):
+        path = path[len('index-cache/'):]
+    return shorten_hash(path)
+
+
+def all_packdirs():
+    paths = [repo('objects/pack')]
+    paths += glob.glob(repo('index-cache/*/.'))
+    return paths
  
  
  def auto_midx(objdir):
      args = [path.exe(), 'midx', '--auto', '--dir', objdir]
      try:
          rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  
  
  def auto_midx(objdir):
      args = [path.exe(), 'midx', '--auto', '--dir', objdir]
      try:
          rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
-    except OSError, e:
+    except OSError as e:
+        # make sure 'args' gets printed to help with debugging
+        add_error('%r: exception: %s' % (args, e))
+        raise
+    if rv:
+        add_error('%r: returned %d' % (args, rv))
+
+    args = [path.exe(), 'bloom', '--dir', objdir]
+    try:
+        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
+    except OSError as e:
          # make sure 'args' gets printed to help with debugging
          add_error('%r: exception: %s' % (args, e))
          raise
          # make sure 'args' gets printed to help with debugging
          add_error('%r: exception: %s' % (args, e))
          raise
@@ -54,9 +172,10 @@ def mangle_name(name, mode, gitmode):
      """Mangle a file name to present an abstract name for segmented files.
      Mangled file names will have the ".bup" extension added to them. If a
      file's name already ends with ".bup", a ".bupl" extension is added to
      """Mangle a file name to present an abstract name for segmented files.
      Mangled file names will have the ".bup" extension added to them. If a
      file's name already ends with ".bup", a ".bupl" extension is added to
-    disambiguate normal files from semgmented ones.
+    disambiguate normal files from segmented ones.
      """
      if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
      """
      if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
+        assert(stat.S_ISDIR(gitmode))
          return name + '.bup'
      elif name.endswith('.bup') or name[:-1].endswith('.bup'):
          return name + '.bupl'
          return name + '.bup'
      elif name.endswith('.bup') or name[:-1].endswith('.bup'):
          return name + '.bupl'
@@ -65,26 +184,77 @@ def mangle_name(name, mode, gitmode):
  
  
  (BUP_NORMAL, BUP_CHUNKED) = (0,1)
  
  
  (BUP_NORMAL, BUP_CHUNKED) = (0,1)
-def demangle_name(name):
+def demangle_name(name, mode):
      """Remove name mangling from a file name, if necessary.
  
      The return value is a tuple (demangled_filename,mode), where mode is one of
      the following:
  
      * BUP_NORMAL  : files that should be read as-is from the repository
      """Remove name mangling from a file name, if necessary.
  
      The return value is a tuple (demangled_filename,mode), where mode is one of
      the following:
  
      * BUP_NORMAL  : files that should be read as-is from the repository
-    * BUP_CHUNKED : files that were chunked and need to be assembled
+    * BUP_CHUNKED : files that were chunked and need to be reassembled
  
  
-    For more information on the name mangling algorythm, see mangle_name()
+    For more information on the name mangling algorithm, see mangle_name()
      """
      if name.endswith('.bupl'):
          return (name[:-5], BUP_NORMAL)
      elif name.endswith('.bup'):
          return (name[:-4], BUP_CHUNKED)
      """
      if name.endswith('.bupl'):
          return (name[:-5], BUP_NORMAL)
      elif name.endswith('.bup'):
          return (name[:-4], BUP_CHUNKED)
+    elif name.endswith('.bupm'):
+        return (name[:-5],
+                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
      else:
          return (name, BUP_NORMAL)
  
  
      else:
          return (name, BUP_NORMAL)
  
  
-def _encode_packobj(type, content):
+def calc_hash(type, content):
+    """Calculate some content's hash in the Git fashion."""
+    header = '%s %d\0' % (type, len(content))
+    sum = Sha1(header)
+    sum.update(content)
+    return sum.digest()
+
+
+def shalist_item_sort_key(ent):
+    (mode, name, id) = ent
+    assert(mode+0 == mode)
+    if stat.S_ISDIR(mode):
+        return name + '/'
+    else:
+        return name
+
+
+def tree_encode(shalist):
+    """Generate a git tree object from (mode,name,hash) tuples."""
+    shalist = sorted(shalist, key = shalist_item_sort_key)
+    l = []
+    for (mode,name,bin) in shalist:
+        assert(mode)
+        assert(mode+0 == mode)
+        assert(name)
+        assert(len(bin) == 20)
+        s = '%o %s\0%s' % (mode,name,bin)
+        assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
+        l.append(s)
+    return ''.join(l)
+
+
+def tree_decode(buf):
+    """Generate a list of (mode,name,hash) from the git tree object in buf."""
+    ofs = 0
+    while ofs < len(buf):
+        z = buf.find('\0', ofs)
+        assert(z > ofs)
+        spl = buf[ofs:z].split(' ', 1)
+        assert(len(spl) == 2)
+        mode,name = spl
+        sha = buf[z+1:z+1+20]
+        ofs = z+1+20
+        yield (int(mode, 8), name, sha)
+
+
+def _encode_packobj(type, content, compression_level=1):
+    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
+        raise ValueError('invalid compression level %s' % compression_level)
      szout = ''
      sz = len(content)
      szbits = (sz & 0x0f) | (_typemap[type]<<4)
      szout = ''
      sz = len(content)
      szbits = (sz & 0x0f) | (_typemap[type]<<4)
@@ -96,14 +266,14 @@ def _encode_packobj(type, content):
              break
          szbits = sz & 0x7f
          sz >>= 7
              break
          szbits = sz & 0x7f
          sz >>= 7
-    z = zlib.compressobj(1)
+    z = zlib.compressobj(compression_level)
      yield szout
      yield z.compress(content)
      yield z.flush()
  
  
      yield szout
      yield z.compress(content)
      yield z.flush()
  
  
-def _encode_looseobj(type, content):
-    z = zlib.compressobj(1)
+def _encode_looseobj(type, content, compression_level=1):
+    z = zlib.compressobj(compression_level)
      yield z.compress('%s %d\0' % (type, len(content)))
      yield z.compress(content)
      yield z.flush()
      yield z.compress('%s %d\0' % (type, len(content)))
      yield z.compress(content)
      yield z.flush()
@@ -151,9 +321,11 @@ class PackIdx:
              return self._ofs_from_idx(idx)
          return None
  
              return self._ofs_from_idx(idx)
          return None
  
-    def exists(self, hash):
+    def exists(self, hash, want_source=False):
          """Return nonempty if the object exists in this index."""
          """Return nonempty if the object exists in this index."""
-        return hash and (self._idx_from_hash(hash) != None) and True or None
+        if hash and (self._idx_from_hash(hash) != None):
+            return want_source and os.path.basename(self.name) or True
+        return None
  
      def __len__(self):
          return int(self.fanout[255])
  
      def __len__(self):
          return int(self.fanout[255])
@@ -190,7 +362,8 @@ class PackIdxV1(PackIdx):
                                           str(buffer(self.map, 0, 256*4))))
          self.fanout.append(0)  # entry "-1"
          nsha = self.fanout[255]
                                           str(buffer(self.map, 0, 256*4))))
          self.fanout.append(0)  # entry "-1"
          nsha = self.fanout[255]
-        self.shatable = buffer(self.map, 256*4, nsha*24)
+        self.sha_ofs = 256*4
+        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
  
      def _ofs_from_idx(self, idx):
          return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
  
      def _ofs_from_idx(self, idx):
          return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
@@ -214,9 +387,10 @@ class PackIdxV2(PackIdx):
                                           str(buffer(self.map, 8, 256*4))))
          self.fanout.append(0)  # entry "-1"
          nsha = self.fanout[255]
                                           str(buffer(self.map, 8, 256*4))))
          self.fanout.append(0)  # entry "-1"
          nsha = self.fanout[255]
-        self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
+        self.sha_ofs = 8 + 256*4
+        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
          self.ofstable = buffer(self.map,
          self.ofstable = buffer(self.map,
-                               8 + 256*4 + nsha*20 + nsha*4,
+                               self.sha_ofs + nsha*20 + nsha*4,
                                 nsha*4)
          self.ofs64table = buffer(self.map,
                                   8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
                                 nsha*4)
          self.ofs64table = buffer(self.map,
                                   8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
@@ -237,101 +411,6 @@ class PackIdxV2(PackIdx):
              yield buffer(self.map, 8 + 256*4 + 20*i, 20)
  
  
              yield buffer(self.map, 8 + 256*4 + 20*i, 20)
  
  
-extract_bits = _helpers.extract_bits
-
-
-class PackMidx:
-    """Wrapper which contains data from multiple index files.
-    Multiple index (.midx) files constitute a wrapper around index (.idx) files
-    and make it possible for bup to expand Git's indexing capabilities to vast
-    amounts of files.
-    """
-    def __init__(self, filename):
-        self.name = filename
-        self.force_keep = False
-        assert(filename.endswith('.midx'))
-        self.map = mmap_read(open(filename))
-        if str(self.map[0:4]) != 'MIDX':
-            log('Warning: skipping: invalid MIDX header in %r\n' % filename)
-            self.force_keep = True
-            return self._init_failed()
-        ver = struct.unpack('!I', self.map[4:8])[0]
-        if ver < MIDX_VERSION:
-            log('Warning: ignoring old-style (v%d) midx %r\n' 
-                % (ver, filename))
-            self.force_keep = False  # old stuff is boring  
-            return self._init_failed()
-        if ver > MIDX_VERSION:
-            log('Warning: ignoring too-new (v%d) midx %r\n'
-                % (ver, filename))
-            self.force_keep = True  # new stuff is exciting
-            return self._init_failed()
-
-        self.bits = _helpers.firstword(self.map[8:12])
-        self.entries = 2**self.bits
-        self.fanout = buffer(self.map, 12, self.entries*4)
-        shaofs = 12 + self.entries*4
-        nsha = self._fanget(self.entries-1)
-        self.shalist = buffer(self.map, shaofs, nsha*20)
-        self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
-
-    def _init_failed(self):
-        self.bits = 0
-        self.entries = 1
-        self.fanout = buffer('\0\0\0\0')
-        self.shalist = buffer('\0'*20)
-        self.idxnames = []
-
-    def _fanget(self, i):
-        start = i*4
-        s = self.fanout[start:start+4]
-        return _helpers.firstword(s)
-
-    def _get(self, i):
-        return str(self.shalist[i*20:(i+1)*20])
-
-    def exists(self, hash):
-        """Return nonempty if the object exists in the index files."""
-        global _total_searches, _total_steps
-        _total_searches += 1
-        want = str(hash)
-        el = extract_bits(want, self.bits)
-        if el:
-            start = self._fanget(el-1)
-            startv = el << (32-self.bits)
-        else:
-            start = 0
-            startv = 0
-        end = self._fanget(el)
-        endv = (el+1) << (32-self.bits)
-        _total_steps += 1   # lookup table is a step
-        hashv = _helpers.firstword(hash)
-        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
-        while start < end:
-            _total_steps += 1
-            #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
-            mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
-            #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
-            v = self._get(mid)
-            #print '    %08x' % self._num(v)
-            if v < want:
-                start = mid+1
-                startv = _helpers.firstword(v)
-            elif v > want:
-                end = mid
-                endv = _helpers.firstword(v)
-            else: # got it!
-                return True
-        return None
-
-    def __iter__(self):
-        for i in xrange(self._fanget(self.entries-1)):
-            yield buffer(self.shalist, i*20, 20)
-
-    def __len__(self):
-        return int(self._fanget(self.entries-1))
-
-
  _mpi_count = 0
  class PackIdxList:
      def __init__(self, dir):
  _mpi_count = 0
  class PackIdxList:
      def __init__(self, dir):
@@ -339,8 +418,10 @@ class PackIdxList:
          assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
          _mpi_count += 1
          self.dir = dir
          assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
          _mpi_count += 1
          self.dir = dir
-        self.also = {}
+        self.also = set()
          self.packs = []
          self.packs = []
+        self.do_bloom = False
+        self.bloom = None
          self.refresh()
  
      def __del__(self):
          self.refresh()
  
      def __del__(self):
@@ -354,19 +435,27 @@ class PackIdxList:
      def __len__(self):
          return sum(len(pack) for pack in self.packs)
  
      def __len__(self):
          return sum(len(pack) for pack in self.packs)
  
-    def exists(self, hash):
+    def exists(self, hash, want_source=False):
          """Return nonempty if the object exists in the index files."""
          global _total_searches
          _total_searches += 1
          if hash in self.also:
              return True
          """Return nonempty if the object exists in the index files."""
          global _total_searches
          _total_searches += 1
          if hash in self.also:
              return True
-        for i in range(len(self.packs)):
+        if self.do_bloom and self.bloom:
+            if self.bloom.exists(hash):
+                self.do_bloom = False
+            else:
+                _total_searches -= 1  # was counted by bloom
+                return None
+        for i in xrange(len(self.packs)):
              p = self.packs[i]
              _total_searches -= 1  # will be incremented by sub-pack
              p = self.packs[i]
              _total_searches -= 1  # will be incremented by sub-pack
-            if p.exists(hash):
+            ix = p.exists(hash, want_source=want_source)
+            if ix:
                  # reorder so most recently used packs are searched first
                  self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                  # reorder so most recently used packs are searched first
                  self.packs = [p] + self.packs[:i] + self.packs[i+1:]
-                return p.name
+                return ix
+        self.do_bloom = True
          return None
  
      def refresh(self, skip_midx = False):
          return None
  
      def refresh(self, skip_midx = False):
@@ -381,100 +470,76 @@ class PackIdxList:
          The module-global variable 'ignore_midx' can force this function to
          always act as if skip_midx was True.
          """
          The module-global variable 'ignore_midx' can force this function to
          always act as if skip_midx was True.
          """
+        self.bloom = None # Always reopen the bloom as it may have been replaced
+        self.do_bloom = False
          skip_midx = skip_midx or ignore_midx
          d = dict((p.name, p) for p in self.packs
          skip_midx = skip_midx or ignore_midx
          d = dict((p.name, p) for p in self.packs
-                 if not skip_midx or not isinstance(p, PackMidx))
+                 if not skip_midx or not isinstance(p, midx.PackMidx))
          if os.path.exists(self.dir):
              if not skip_midx:
                  midxl = []
                  for ix in self.packs:
          if os.path.exists(self.dir):
              if not skip_midx:
                  midxl = []
                  for ix in self.packs:
-                    if isinstance(ix, PackMidx):
+                    if isinstance(ix, midx.PackMidx):
                          for name in ix.idxnames:
                              d[os.path.join(self.dir, name)] = ix
                          for name in ix.idxnames:
                              d[os.path.join(self.dir, name)] = ix
-                for f in os.listdir(self.dir):
-                    full = os.path.join(self.dir, f)
-                    if f.endswith('.midx') and not d.get(full):
-                        mx = PackMidx(full)
+                for full in glob.glob(os.path.join(self.dir,'*.midx')):
+                    if not d.get(full):
+                        mx = midx.PackMidx(full)
                          (mxd, mxf) = os.path.split(mx.name)
                          (mxd, mxf) = os.path.split(mx.name)
-                        broken = 0
+                        broken = False
                          for n in mx.idxnames:
                              if not os.path.exists(os.path.join(mxd, n)):
                                  log(('warning: index %s missing\n' +
                                      '  used by %s\n') % (n, mxf))
                          for n in mx.idxnames:
                              if not os.path.exists(os.path.join(mxd, n)):
                                  log(('warning: index %s missing\n' +
                                      '  used by %s\n') % (n, mxf))
-                                broken += 1
+                                broken = True
                          if broken:
                          if broken:
+                            mx.close()
                              del mx
                              unlink(full)
                          else:
                              midxl.append(mx)
                              del mx
                              unlink(full)
                          else:
                              midxl.append(mx)
-                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
+                midxl.sort(key=lambda ix:
+                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                  for ix in midxl:
                  for ix in midxl:
-                    any = 0
+                    any_needed = False
                      for sub in ix.idxnames:
                          found = d.get(os.path.join(self.dir, sub))
                          if not found or isinstance(found, PackIdx):
                              # doesn't exist, or exists but not in a midx
                      for sub in ix.idxnames:
                          found = d.get(os.path.join(self.dir, sub))
                          if not found or isinstance(found, PackIdx):
                              # doesn't exist, or exists but not in a midx
-                            d[ix.name] = ix
-                            for name in ix.idxnames:
-                                d[os.path.join(self.dir, name)] = ix
-                            any += 1
+                            any_needed = True
                              break
                              break
-                    if not any and not ix.force_keep:
+                    if any_needed:
+                        d[ix.name] = ix
+                        for name in ix.idxnames:
+                            d[os.path.join(self.dir, name)] = ix
+                    elif not ix.force_keep:
                          debug1('midx: removing redundant: %s\n'
                                 % os.path.basename(ix.name))
                          debug1('midx: removing redundant: %s\n'
                                 % os.path.basename(ix.name))
+                        ix.close()
                          unlink(ix.name)
                          unlink(ix.name)
-            for f in os.listdir(self.dir):
-                full = os.path.join(self.dir, f)
-                if f.endswith('.idx') and not d.get(full):
+            for full in glob.glob(os.path.join(self.dir,'*.idx')):
+                if not d.get(full):
                      try:
                          ix = open_idx(full)
                      try:
                          ix = open_idx(full)
-                    except GitError, e:
+                    except GitError as e:
                          add_error(e)
                          continue
                      d[full] = ix
                          add_error(e)
                          continue
                      d[full] = ix
+            bfull = os.path.join(self.dir, 'bup.bloom')
+            if self.bloom is None and os.path.exists(bfull):
+                self.bloom = bloom.ShaBloom(bfull)
              self.packs = list(set(d.values()))
              self.packs = list(set(d.values()))
+            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
+            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
+                self.do_bloom = True
+            else:
+                self.bloom = None
          debug1('PackIdxList: using %d index%s.\n'
              % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
  
          debug1('PackIdxList: using %d index%s.\n'
              % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
  
-    def packname_containing(self, hash):
-        # figure out which pack contains a given hash.
-        # FIXME: if the midx file format would just *store* this information,
-        # we could calculate it a lot more efficiently.  But it's not needed
-        # often, so let's do it like this.
-        for f in os.listdir(self.dir):
-            if f.endswith('.idx'):
-                full = os.path.join(self.dir, f)
-                try:
-                    ix = open_idx(full)
-                except GitError, e:
-                    add_error(e)
-                    continue
-                if ix.exists(hash):
-                    return full
-
      def add(self, hash):
          """Insert an additional object in the list."""
      def add(self, hash):
          """Insert an additional object in the list."""
-        self.also[hash] = 1
-
-    def zap_also(self):
-        """Remove all additional objects from the list."""
-        self.also = {}
-
-
-def calc_hash(type, content):
-    """Calculate some content's hash in the Git fashion."""
-    header = '%s %d\0' % (type, len(content))
-    sum = Sha1(header)
-    sum.update(content)
-    return sum.digest()
-
-
-def _shalist_sort_key(ent):
-    (mode, name, id) = ent
-    if stat.S_ISDIR(int(mode, 8)):
-        return name + '/'
-    else:
-        return name
+        self.also.add(hash)
  
  
  def open_idx(filename):
  
  
  def open_idx(filename):
@@ -493,65 +558,69 @@ def open_idx(filename):
          else:
              raise GitError('%s: unrecognized idx file header' % filename)
      elif filename.endswith('.midx'):
          else:
              raise GitError('%s: unrecognized idx file header' % filename)
      elif filename.endswith('.midx'):
-        return PackMidx(filename)
+        return midx.PackMidx(filename)
      else:
          raise GitError('idx filenames must end with .idx or .midx')
  
  
  def idxmerge(idxlist, final_progress=True):
      """Generate a list of all the objects reachable in a PackIdxList."""
      else:
          raise GitError('idx filenames must end with .idx or .midx')
  
  
  def idxmerge(idxlist, final_progress=True):
      """Generate a list of all the objects reachable in a PackIdxList."""
-    total = sum(len(i) for i in idxlist)
-    iters = (iter(i) for i in idxlist)
-    heap = [(next(it), it) for it in iters]
-    heapq.heapify(heap)
-    count = 0
-    last = None
-    while heap:
-        if (count % 10024) == 0:
-            progress('Reading indexes: %.2f%% (%d/%d)\r'
-                     % (count*100.0/total, count, total))
-        (e, it) = heap[0]
-        if e != last:
-            yield e
-            last = e
-        count += 1
-        e = next(it)
-        if e:
-            heapq.heapreplace(heap, (e, it))
-        else:
-            heapq.heappop(heap)
-    if final_progress:
-        log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
+    def pfunc(count, total):
+        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
+                  % (count*100.0/total, count, total))
+    def pfinal(count, total):
+        if final_progress:
+            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
+                     % (100, total, total))
+    return merge_iter(idxlist, 10024, pfunc, pfinal)
  
  
  def _make_objcache():
      return PackIdxList(repo('objects/pack'))
  
  
  
  def _make_objcache():
      return PackIdxList(repo('objects/pack'))
  
+# bup-gc assumes that it can disable all PackWriter activities
+# (bloom/midx/cache) via the constructor and close() arguments.
+
  class PackWriter:
  class PackWriter:
-    """Writes Git objects insid a pack file."""
-    def __init__(self, objcache_maker=_make_objcache):
+    """Writes Git objects inside a pack file."""
+    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
+                 run_midx=True, on_pack_finish=None):
+        self.file = None
+        self.parentfd = None
          self.count = 0
          self.outbytes = 0
          self.filename = None
          self.count = 0
          self.outbytes = 0
          self.filename = None
-        self.file = None
          self.idx = None
          self.objcache_maker = objcache_maker
          self.objcache = None
          self.idx = None
          self.objcache_maker = objcache_maker
          self.objcache = None
+        self.compression_level = compression_level
+        self.run_midx=run_midx
+        self.on_pack_finish = on_pack_finish
  
      def __del__(self):
          self.close()
  
      def _open(self):
          if not self.file:
  
      def __del__(self):
          self.close()
  
      def _open(self):
          if not self.file:
-            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
-            self.file = os.fdopen(fd, 'w+b')
+            objdir = dir=repo('objects')
+            fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
+            try:
+                self.file = os.fdopen(fd, 'w+b')
+            except:
+                os.close(fd)
+                raise
+            try:
+                self.parentfd = os.open(objdir, os.O_RDONLY)
+            except:
+                f = self.file
+                self.file = None
+                f.close()
+                raise
              assert(name.endswith('.pack'))
              self.filename = name[:-5]
              self.file.write('PACK\0\0\0\2\0\0\0\0')
              self.idx = list(list() for i in xrange(256))
  
              assert(name.endswith('.pack'))
              self.filename = name[:-5]
              self.file.write('PACK\0\0\0\2\0\0\0\0')
              self.idx = list(list() for i in xrange(256))
  
-    # the 'sha' parameter is used in client.py's _raw_write(), but not needed
-    # in this basic version.
      def _raw_write(self, datalist, sha):
          self._open()
          f = self.file
      def _raw_write(self, datalist, sha):
          self._open()
          f = self.file
@@ -563,7 +632,7 @@ class PackWriter:
          oneblob = ''.join(datalist)
          try:
              f.write(oneblob)
          oneblob = ''.join(datalist)
          try:
              f.write(oneblob)
-        except IOError, e:
+        except IOError as e:
              raise GitError, e, sys.exc_info()[2]
          nw = len(oneblob)
          crc = zlib.crc32(oneblob) & 0xffffffff
              raise GitError, e, sys.exc_info()[2]
          nw = len(oneblob)
          crc = zlib.crc32(oneblob) & 0xffffffff
@@ -582,19 +651,19 @@ class PackWriter:
              log('>')
          if not sha:
              sha = calc_hash(type, content)
              log('>')
          if not sha:
              sha = calc_hash(type, content)
-        size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
+        size, crc = self._raw_write(_encode_packobj(type, content,
+                                                    self.compression_level),
+                                    sha=sha)
+        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
+            self.breakpoint()
          return sha
  
      def breakpoint(self):
          """Clear byte and object counts and return the last processed id."""
          return sha
  
      def breakpoint(self):
          """Clear byte and object counts and return the last processed id."""
-        id = self._end()
+        id = self._end(self.run_midx)
          self.outbytes = self.count = 0
          return id
  
          self.outbytes = self.count = 0
          return id
  
-    def write(self, type, content):
-        """Write an object in this pack file."""
-        return self._write(calc_hash(type, content), type, content)
-
      def _require_objcache(self):
          if self.objcache is None and self.objcache_maker:
              self.objcache = self.objcache_maker()
      def _require_objcache(self):
          if self.objcache is None and self.objcache_maker:
              self.objcache = self.objcache_maker()
@@ -602,17 +671,22 @@ class PackWriter:
              raise GitError(
                      "PackWriter not opened or can't check exists w/o objcache")
  
              raise GitError(
                      "PackWriter not opened or can't check exists w/o objcache")
  
-    def exists(self, id):
+    def exists(self, id, want_source=False):
          """Return non-empty if an object is found in the object cache."""
          self._require_objcache()
          """Return non-empty if an object is found in the object cache."""
          self._require_objcache()
-        return self.objcache.exists(id)
+        return self.objcache.exists(id, want_source=want_source)
+
+    def just_write(self, sha, type, content):
+        """Write an object to the pack file, bypassing the objcache.  Fails if
+        sha exists()."""
+        self._write(sha, type, content)
  
      def maybe_write(self, type, content):
          """Write an object to the pack file if not present and return its id."""
  
      def maybe_write(self, type, content):
          """Write an object to the pack file if not present and return its id."""
-        self._require_objcache()
          sha = calc_hash(type, content)
          if not self.exists(sha):
          sha = calc_hash(type, content)
          if not self.exists(sha):
-            self._write(sha, type, content)
+            self.just_write(sha, type, content)
+            self._require_objcache()
              self.objcache.add(sha)
          return sha
  
              self.objcache.add(sha)
          return sha
  
@@ -622,140 +696,174 @@ class PackWriter:
  
      def new_tree(self, shalist):
          """Create a tree object in the pack."""
  
      def new_tree(self, shalist):
          """Create a tree object in the pack."""
-        shalist = sorted(shalist, key = _shalist_sort_key)
-        l = []
-        for (mode,name,bin) in shalist:
-            assert(mode)
-            assert(mode != '0')
-            assert(mode[0] != '0')
-            assert(name)
-            assert(len(bin) == 20)
-            l.append('%s %s\0%s' % (mode,name,bin))
-        return self.maybe_write('tree', ''.join(l))
-
-    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
+        content = tree_encode(shalist)
+        return self.maybe_write('tree', content)
+
+    def new_commit(self, tree, parent,
+                   author, adate_sec, adate_tz,
+                   committer, cdate_sec, cdate_tz,
+                   msg):
+        """Create a commit object in the pack.  The date_sec values must be
+        epoch-seconds, and if a tz is None, the local timezone is assumed."""
+        if adate_tz:
+            adate_str = _git_date_str(adate_sec, adate_tz)
+        else:
+            adate_str = _local_git_date_str(adate_sec)
+        if cdate_tz:
+            cdate_str = _git_date_str(cdate_sec, cdate_tz)
+        else:
+            cdate_str = _local_git_date_str(cdate_sec)
          l = []
          if tree: l.append('tree %s' % tree.encode('hex'))
          if parent: l.append('parent %s' % parent.encode('hex'))
          l = []
          if tree: l.append('tree %s' % tree.encode('hex'))
          if parent: l.append('parent %s' % parent.encode('hex'))
-        if author: l.append('author %s %s' % (author, _git_date(adate)))
-        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
+        if author: l.append('author %s %s' % (author, adate_str))
+        if committer: l.append('committer %s %s' % (committer, cdate_str))
          l.append('')
          l.append(msg)
          return self.maybe_write('commit', '\n'.join(l))
  
          l.append('')
          l.append(msg)
          return self.maybe_write('commit', '\n'.join(l))
  
-    def new_commit(self, parent, tree, date, msg):
-        """Create a commit object in the pack."""
-        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
-        commit = self._new_commit(tree, parent,
-                                  userline, date, userline, date,
-                                  msg)
-        return commit
-
      def abort(self):
          """Remove the pack file from disk."""
          f = self.file
          if f:
      def abort(self):
          """Remove the pack file from disk."""
          f = self.file
          if f:
-            self.idx = None
+            pfd = self.parentfd
              self.file = None
              self.file = None
-            f.close()
-            os.unlink(self.filename + '.pack')
+            self.parentfd = None
+            self.idx = None
+            try:
+                try:
+                    os.unlink(self.filename + '.pack')
+                finally:
+                    f.close()
+            finally:
+                if pfd is not None:
+                    os.close(pfd)
  
      def _end(self, run_midx=True):
          f = self.file
          if not f: return None
          self.file = None
  
      def _end(self, run_midx=True):
          f = self.file
          if not f: return None
          self.file = None
-        self.objcache = None
-        idx = self.idx
-        self.idx = None
+        try:
+            self.objcache = None
+            idx = self.idx
+            self.idx = None
+
+            # update object count
+            f.seek(8)
+            cp = struct.pack('!i', self.count)
+            assert(len(cp) == 4)
+            f.write(cp)
+
+            # calculate the pack sha1sum
+            f.seek(0)
+            sum = Sha1()
+            for b in chunkyreader(f):
+                sum.update(b)
+            packbin = sum.digest()
+            f.write(packbin)
+            fdatasync(f.fileno())
+        finally:
+            f.close()
  
  
-        # update object count
-        f.seek(8)
-        cp = struct.pack('!i', self.count)
-        assert(len(cp) == 4)
-        f.write(cp)
-
-        # calculate the pack sha1sum
-        f.seek(0)
-        sum = Sha1()
-        for b in chunkyreader(f):
-            sum.update(b)
-        packbin = sum.digest()
-        f.write(packbin)
-        f.close()
-
-        idx_f = open(self.filename + '.idx', 'wb')
-        obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
-        idx_f.close()
+        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
  
          nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
          if os.path.exists(self.filename + '.map'):
              os.unlink(self.filename + '.map')
          os.rename(self.filename + '.pack', nameprefix + '.pack')
          os.rename(self.filename + '.idx', nameprefix + '.idx')
  
          nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
          if os.path.exists(self.filename + '.map'):
              os.unlink(self.filename + '.map')
          os.rename(self.filename + '.pack', nameprefix + '.pack')
          os.rename(self.filename + '.idx', nameprefix + '.idx')
+        try:
+            os.fsync(self.parentfd)
+        finally:
+            os.close(self.parentfd)
  
          if run_midx:
              auto_midx(repo('objects/pack'))
  
          if run_midx:
              auto_midx(repo('objects/pack'))
+
+        if self.on_pack_finish:
+            self.on_pack_finish(nameprefix)
+
          return nameprefix
  
      def close(self, run_midx=True):
          """Close the pack file and move it to its definitive path."""
          return self._end(run_midx=run_midx)
  
          return nameprefix
  
      def close(self, run_midx=True):
          """Close the pack file and move it to its definitive path."""
          return self._end(run_midx=run_midx)
  
-    def _write_pack_idx_v2(self, file, idx, packbin):
-        sum = Sha1()
-
-        def write(data):
-            file.write(data)
-            sum.update(data)
-
-        write('\377tOc\0\0\0\2')
-
-        n = 0
-        for part in idx:
-            n += len(part)
-            write(struct.pack('!i', n))
-            part.sort(key=lambda x: x[0])
-
-        obj_list_sum = Sha1()
-        for part in idx:
-            for entry in part:
-                write(entry[0])
-                obj_list_sum.update(entry[0])
-        for part in idx:
-            for entry in part:
-                write(struct.pack('!I', entry[1]))
-        ofs64_list = []
-        for part in idx:
-            for entry in part:
-                if entry[2] & 0x80000000:
-                    write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
-                    ofs64_list.append(struct.pack('!Q', entry[2]))
-                else:
-                    write(struct.pack('!i', entry[2]))
-        for ofs64 in ofs64_list:
-            write(ofs64)
-
-        write(packbin)
-        file.write(sum.digest())
-        return obj_list_sum.hexdigest()
-
-
-def _git_date(date):
-    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
-
-
-def _gitenv():
-    os.environ['GIT_DIR'] = os.path.abspath(repo())
-
+    def _write_pack_idx_v2(self, filename, idx, packbin):
+        ofs64_count = 0
+        for section in idx:
+            for entry in section:
+                if entry[2] >= 2**31:
+                    ofs64_count += 1
+
+        # Length: header + fan-out + shas-and-crcs + overflow-offsets
+        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
+        idx_map = None
+        idx_f = open(filename, 'w+b')
+        try:
+            idx_f.truncate(index_len)
+            fdatasync(idx_f.fileno())
+            idx_map = mmap_readwrite(idx_f, close=False)
+            try:
+                count = _helpers.write_idx(filename, idx_map, idx, self.count)
+                assert(count == self.count)
+                idx_map.flush()
+            finally:
+                idx_map.close()
+        finally:
+            idx_f.close()
+
+        idx_f = open(filename, 'a+b')
+        try:
+            idx_f.write(packbin)
+            idx_f.seek(0)
+            idx_sum = Sha1()
+            b = idx_f.read(8 + 4*256)
+            idx_sum.update(b)
+
+            obj_list_sum = Sha1()
+            for b in chunkyreader(idx_f, 20*self.count):
+                idx_sum.update(b)
+                obj_list_sum.update(b)
+            namebase = obj_list_sum.hexdigest()
+
+            for b in chunkyreader(idx_f):
+                idx_sum.update(b)
+            idx_f.write(idx_sum.digest())
+            fdatasync(idx_f.fileno())
+            return namebase
+        finally:
+            idx_f.close()
+
+
+def _gitenv(repo_dir = None):
+    if not repo_dir:
+        repo_dir = repo()
+    def env():
+        os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
+    return env
+
+
+def list_refs(refnames=None, repo_dir=None,
+              limit_to_heads=False, limit_to_tags=False):
+    """Yield (refname, hash) tuples for all repository refs unless
+    refnames are specified.  In that case, only include tuples for
+    those refs.  The limits restrict the result items to refs/heads or
+    refs/tags.  If both limits are specified, items from both sources
+    will be included.
  
  
-def list_refs(refname = None):
-    """Generate a list of tuples in the form (refname,hash).
-    If a ref name is specified, list only this particular ref.
      """
      """
-    argv = ['git', 'show-ref', '--']
-    if refname:
-        argv += [refname]
-    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
+    argv = ['git', 'show-ref']
+    if limit_to_heads:
+        argv.append('--heads')
+    if limit_to_tags:
+        argv.append('--tags')
+    argv.append('--')
+    if refnames:
+        argv += refnames
+    p = subprocess.Popen(argv,
+                         preexec_fn = _gitenv(repo_dir),
+                         stdout = subprocess.PIPE)
      out = p.stdout.read().strip()
      rv = p.wait()  # not fatal
      if rv:
      out = p.stdout.read().strip()
      rv = p.wait()  # not fatal
      if rv:
@@ -766,9 +874,10 @@ def list_refs(refname = None):
              yield (name, sha.decode('hex'))
  
  
              yield (name, sha.decode('hex'))
  
  
-def read_ref(refname):
+def read_ref(refname, repo_dir = None):
      """Get the commit id of the most recent commit made on a given ref."""
      """Get the commit id of the most recent commit made on a given ref."""
-    l = list(list_refs(refname))
+    refs = list_refs(refnames=[refname], repo_dir=repo_dir, limit_to_heads=True)
+    l = tuple(islice(refs, 2))
      if l:
          assert(len(l) == 1)
          return l[0][1]
      if l:
          assert(len(l) == 1)
          return l[0][1]
@@ -776,7 +885,7 @@ def read_ref(refname):
          return None
  
  
          return None
  
  
-def rev_list(ref, count=None):
+def rev_list(ref, count=None, repo_dir=None):
      """Generate a list of reachable commits in reverse chronological order.
  
      This generator walks through commits, from child to parent, that are
      """Generate a list of reachable commits in reverse chronological order.
  
      This generator walks through commits, from child to parent, that are
@@ -790,8 +899,10 @@ def rev_list(ref, count=None):
      opts = []
      if count:
          opts += ['-n', str(atoi(count))]
      opts = []
      if count:
          opts += ['-n', str(atoi(count))]
-    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
-    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
+    argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
+    p = subprocess.Popen(argv,
+                         preexec_fn = _gitenv(repo_dir),
+                         stdout = subprocess.PIPE)
      commit = None
      for row in p.stdout:
          s = row.strip()
      commit = None
      for row in p.stdout:
          s = row.strip()
@@ -805,14 +916,18 @@ def rev_list(ref, count=None):
          raise GitError, 'git rev-list returned error %d' % rv
  
  
          raise GitError, 'git rev-list returned error %d' % rv
  
  
-def rev_get_date(ref):
-    """Get the date of the latest commit on the specified ref."""
-    for (date, commit) in rev_list(ref, count=1):
-        return date
-    raise GitError, 'no such commit %r' % ref
+def get_commit_dates(refs, repo_dir=None):
+    """Get the dates for the specified commit refs.  For now, every unique
+       string in refs must resolve to a different commit or this
+       function will fail."""
+    result = []
+    for ref in refs:
+        commit = get_commit_items(ref, cp(repo_dir))
+        result.append(commit.author_sec)
+    return result
  
  
  
  
-def rev_parse(committish):
+def rev_parse(committish, repo_dir=None):
      """Resolve the full hash for 'committish', if it exists.
  
      Should be roughly equivalent to 'git rev-parse'.
      """Resolve the full hash for 'committish', if it exists.
  
      Should be roughly equivalent to 'git rev-parse'.
@@ -820,12 +935,12 @@ def rev_parse(committish):
      Returns the hex value of the hash if it is found, None if 'committish' does
      not correspond to anything.
      """
      Returns the hex value of the hash if it is found, None if 'committish' does
      not correspond to anything.
      """
-    head = read_ref(committish)
+    head = read_ref(committish, repo_dir=repo_dir)
      if head:
          debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
          return head
  
      if head:
          debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
          return head
  
-    pL = PackIdxList(repo('objects/pack'))
+    pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
  
      if len(committish) == 40:
          try:
  
      if len(committish) == 40:
          try:
@@ -839,14 +954,24 @@ def rev_parse(committish):
      return None
  
  
      return None
  
  
-def update_ref(refname, newval, oldval):
-    """Change the commit pointed to by a branch."""
+def update_ref(refname, newval, oldval, repo_dir=None):
+    """Update a repository reference."""
      if not oldval:
          oldval = ''
      if not oldval:
          oldval = ''
-    assert(refname.startswith('refs/heads/'))
+    assert(refname.startswith('refs/heads/') \
+           or refname.startswith('refs/tags/'))
      p = subprocess.Popen(['git', 'update-ref', refname,
                            newval.encode('hex'), oldval.encode('hex')],
      p = subprocess.Popen(['git', 'update-ref', refname,
                            newval.encode('hex'), oldval.encode('hex')],
-                         preexec_fn = _gitenv)
+                         preexec_fn = _gitenv(repo_dir))
+    _git_wait('git update-ref', p)
+
+
+def delete_ref(refname, oldvalue=None):
+    """Delete a repository reference (see git update-ref(1))."""
+    assert(refname.startswith('refs/'))
+    oldvalue = [] if not oldvalue else [oldvalue]
+    p = subprocess.Popen(['git', 'update-ref', '-d', refname] + oldvalue,
+                         preexec_fn = _gitenv())
      _git_wait('git update-ref', p)
  
  
      _git_wait('git update-ref', p)
  
  
@@ -874,42 +999,36 @@ def init_repo(path=None):
      if parent and not os.path.exists(parent):
          raise GitError('parent directory "%s" does not exist\n' % parent)
      if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
      if parent and not os.path.exists(parent):
          raise GitError('parent directory "%s" does not exist\n' % parent)
      if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
-        raise GitError('"%d" exists but is not a directory\n' % d)
+        raise GitError('"%s" exists but is not a directory\n' % d)
      p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
      p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
-                         preexec_fn = _gitenv)
+                         preexec_fn = _gitenv())
      _git_wait('git init', p)
      # Force the index version configuration in order to ensure bup works
      # regardless of the version of the installed Git binary.
      p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
      _git_wait('git init', p)
      # Force the index version configuration in order to ensure bup works
      # regardless of the version of the installed Git binary.
      p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
-                         stdout=sys.stderr, preexec_fn = _gitenv)
+                         stdout=sys.stderr, preexec_fn = _gitenv())
+    _git_wait('git config', p)
+    # Enable the reflog
+    p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
+                         stdout=sys.stderr, preexec_fn = _gitenv())
      _git_wait('git config', p)
  
  
  def check_repo_or_die(path=None):
      _git_wait('git config', p)
  
  
  def check_repo_or_die(path=None):
-    """Make sure a bup repository exists, and abort if not.
-    If the path to a particular repository was not specified, this function
-    initializes the default repository automatically.
-    """
+    """Check to see if a bup repository probably exists, and abort if not."""
      guess_repo(path)
      guess_repo(path)
-    if not os.path.isdir(repo('objects/pack/.')):
-        if repodir == home_repodir:
-            init_repo()
-        else:
-            log('error: %r is not a bup/git repository\n' % repo())
+    top = repo()
+    pst = stat_if_exists(top + '/objects/pack')
+    if pst and stat.S_ISDIR(pst.st_mode):
+        return
+    if not pst:
+        top_st = stat_if_exists(top)
+        if not top_st:
+            log('error: repository %r does not exist (see "bup help init")\n'
+                % top)
              sys.exit(15)
              sys.exit(15)
-
-
-def treeparse(buf):
-    """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
-    ofs = 0
-    while ofs < len(buf):
-        z = buf[ofs:].find('\0')
-        assert(z > 0)
-        spl = buf[ofs:ofs+z].split(' ', 1)
-        assert(len(spl) == 2)
-        sha = buf[ofs+z+1:ofs+z+1+20]
-        ofs += z+1+20
-        yield (spl[0], spl[1], sha)
+    log('error: %r is not a repository\n' % top)
+    sys.exit(14)
  
  
  _ver = None
  
  
  _ver = None
@@ -946,7 +1065,7 @@ def _git_wait(cmd, p):
  
  
  def _git_capture(argv):
  
  
  def _git_capture(argv):
-    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
+    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
      r = p.stdout.read()
      _git_wait(repr(argv), p)
      return r
      r = p.stdout.read()
      _git_wait(repr(argv), p)
      return r
@@ -964,7 +1083,7 @@ class _AbortableIter:
      def next(self):
          try:
              return self.it.next()
      def next(self):
          try:
              return self.it.next()
-        except StopIteration, e:
+        except StopIteration as e:
              self.done = True
              raise
          except:
              self.done = True
              raise
          except:
@@ -982,11 +1101,18 @@ class _AbortableIter:
          self.abort()
  
  
          self.abort()
  
  
+class MissingObject(KeyError):
+    def __init__(self, id):
+        self.id = id
+        KeyError.__init__(self, 'object %r is missing' % id.encode('hex'))
+
+
  _ver_warned = 0
  class CatPipe:
      """Link to 'git cat-file' that is used to retrieve blob data."""
  _ver_warned = 0
  class CatPipe:
      """Link to 'git cat-file' that is used to retrieve blob data."""
-    def __init__(self):
+    def __init__(self, repo_dir = None):
          global _ver_warned
          global _ver_warned
+        self.repo_dir = repo_dir
          wanted = ('1','5','6')
          if ver() < wanted:
              if not _ver_warned:
          wanted = ('1','5','6')
          if ver() < wanted:
              if not _ver_warned:
@@ -1005,22 +1131,23 @@ class CatPipe:
          self.p = None
          self.inprogress = None
  
          self.p = None
          self.inprogress = None
  
-    def _restart(self):
+    def restart(self):
          self._abort()
          self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    close_fds = True,
                                    bufsize = 4096,
          self._abort()
          self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    close_fds = True,
                                    bufsize = 4096,
-                                  preexec_fn = _gitenv)
+                                  preexec_fn = _gitenv(self.repo_dir))
  
      def _fast_get(self, id):
          if not self.p or self.p.poll() != None:
  
      def _fast_get(self, id):
          if not self.p or self.p.poll() != None:
-            self._restart()
+            self.restart()
          assert(self.p)
          assert(self.p)
-        assert(self.p.poll() == None)
+        poll_result = self.p.poll()
+        assert(poll_result == None)
          if self.inprogress:
          if self.inprogress:
-            log('_fast_get: opening %r while %r is open'
+            log('_fast_get: opening %r while %r is open\n'
                  % (id, self.inprogress))
          assert(not self.inprogress)
          assert(id.find('\n') < 0)
                  % (id, self.inprogress))
          assert(not self.inprogress)
          assert(id.find('\n') < 0)
@@ -1032,7 +1159,7 @@ class CatPipe:
          hdr = self.p.stdout.readline()
          if hdr.endswith(' missing\n'):
              self.inprogress = None
          hdr = self.p.stdout.readline()
          if hdr.endswith(' missing\n'):
              self.inprogress = None
-            raise KeyError('blob %r is missing' % id)
+            raise MissingObject(id.decode('hex'))
          spl = hdr.split(' ')
          if len(spl) != 3 or len(spl[0]) != 40:
              raise GitError('expected blob, got %r' % spl)
          spl = hdr.split(' ')
          if len(spl) != 3 or len(spl[0]) != 40:
              raise GitError('expected blob, got %r' % spl)
@@ -1044,9 +1171,10 @@ class CatPipe:
              yield type
              for blob in it:
                  yield blob
              yield type
              for blob in it:
                  yield blob
-            assert(self.p.stdout.readline() == '\n')
+            readline_result = self.p.stdout.readline()
+            assert(readline_result == '\n')
              self.inprogress = None
              self.inprogress = None
-        except Exception, e:
+        except Exception as e:
              it.abort()
              raise
  
              it.abort()
              raise
  
@@ -1059,7 +1187,7 @@ class CatPipe:
  
          p = subprocess.Popen(['git', 'cat-file', type, id],
                               stdout=subprocess.PIPE,
  
          p = subprocess.Popen(['git', 'cat-file', type, id],
                               stdout=subprocess.PIPE,
-                             preexec_fn = _gitenv)
+                             preexec_fn = _gitenv(self.repo_dir))
          for blob in chunkyreader(p.stdout):
              yield blob
          _git_wait('git cat-file', p)
          for blob in chunkyreader(p.stdout):
              yield blob
          _git_wait('git cat-file', p)
@@ -1071,7 +1199,7 @@ class CatPipe:
                  yield blob
          elif type == 'tree':
              treefile = ''.join(it)
                  yield blob
          elif type == 'tree':
              treefile = ''.join(it)
-            for (mode, name, sha) in treeparse(treefile):
+            for (mode, name, sha) in tree_decode(treefile):
                  for blob in self.join(sha.encode('hex')):
                      yield blob
          elif type == 'commit':
                  for blob in self.join(sha.encode('hex')):
                      yield blob
          elif type == 'commit':
@@ -1095,15 +1223,110 @@ class CatPipe:
          except StopIteration:
              log('booger!\n')
  
          except StopIteration:
              log('booger!\n')
  
-def tags():
+
+_cp = {}
+
+def cp(repo_dir=None):
+    """Create a CatPipe object or reuse the already existing one."""
+    global _cp, repodir
+    if not repo_dir:
+        repo_dir = repodir or repo()
+    repo_dir = os.path.abspath(repo_dir)
+    cp = _cp.get(repo_dir)
+    if not cp:
+        cp = CatPipe(repo_dir)
+        _cp[repo_dir] = cp
+    return cp
+
+
+def tags(repo_dir = None):
      """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
      tags = {}
      """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
      tags = {}
-    for (n,c) in list_refs():
-        if n.startswith('refs/tags/'):
-            name = n[10:]
-            if not c in tags:
-                tags[c] = []
+    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
+        assert(n.startswith('refs/tags/'))
+        name = n[10:]
+        if not c in tags:
+            tags[c] = []
+        tags[c].append(name)  # more than one tag can point at 'c'
+    return tags
  
  
-            tags[c].append(name)  # more than one tag can point at 'c'
  
  
-    return tags
+WalkItem = namedtuple('WalkItem', ['id', 'type', 'mode',
+                                   'path', 'chunk_path', 'data'])
+# The path is the mangled path, and if an item represents a fragment
+# of a chunked file, the chunk_path will be the chunked subtree path
+# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
+# chunked file will have a chunk_path of [''].  So some chunk subtree
+# of the file '/foo/bar/baz' might look like this:
+#
+#   item.path = ['foo', 'bar', 'baz.bup']
+#   item.chunk_path = ['', '2d3115e', '016b097']
+#   item.type = 'tree'
+#   ...
+
+
+def walk_object(cat_pipe, id,
+                stop_at=None,
+                include_data=None):
+    """Yield everything reachable from id via cat_pipe as a WalkItem,
+    stopping whenever stop_at(id) returns true.  Throw MissingObject
+    if a hash encountered is missing from the repository, and don't
+    read or return blob content in the data field unless include_data
+    is set.
+    """
+    # Maintain the pending stack on the heap to avoid stack overflow
+    pending = [(id, [], [], None)]
+    while len(pending):
+        id, parent_path, chunk_path, mode = pending.pop()
+        if stop_at and stop_at(id):
+            continue
+
+        if (not include_data) and mode and stat.S_ISREG(mode):
+            # If the object is a "regular file", then it's a leaf in
+            # the graph, so we can skip reading the data if the caller
+            # hasn't requested it.
+            yield WalkItem(id=id, type='blob',
+                           chunk_path=chunk_path, path=parent_path,
+                           mode=mode,
+                           data=None)
+            continue
+
+        item_it = cat_pipe.get(id)
+        type = item_it.next()
+        if type not in ('blob', 'commit', 'tree'):
+            raise Exception('unexpected repository object type %r' % type)
+
+        # FIXME: set the mode based on the type when the mode is None
+        if type == 'blob' and not include_data:
+            # Dump data until we can ask cat_pipe not to fetch it
+            for ignored in item_it:
+                pass
+            data = None
+        else:
+            data = ''.join(item_it)
+
+        yield WalkItem(id=id, type=type,
+                       chunk_path=chunk_path, path=parent_path,
+                       mode=mode,
+                       data=(data if include_data else None))
+
+        if type == 'commit':
+            commit_items = parse_commit(data)
+            for pid in commit_items.parents:
+                pending.append((pid, parent_path, chunk_path, mode))
+            pending.append((commit_items.tree, parent_path, chunk_path,
+                            hashsplit.GIT_MODE_TREE))
+        elif type == 'tree':
+            for mode, name, ent_id in tree_decode(data):
+                demangled, bup_type = demangle_name(name, mode)
+                if chunk_path:
+                    sub_path = parent_path
+                    sub_chunk_path = chunk_path + [name]
+                else:
+                    sub_path = parent_path + [name]
+                    if bup_type == BUP_CHUNKED:
+                        sub_chunk_path = ['']
+                    else:
+                        sub_chunk_path = chunk_path
+                pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,
+                                mode))