Allow per-instance specification of the repo_dir for CatPipe

[bup.git] / lib / bup / git.py
diff --git a/lib/bup/git.py b/lib/bup/git.py

index 5ec94ebfa3f3c1f85705fc72e47ac24e723e3446..a8f3729f05f439ef545e3ecb3be5cfa316f493b2 100644 (file)
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -2,14 +2,17 @@
  bup repositories are in Git format. This library allows us to
  interact with the Git data structures.
  """
  bup repositories are in Git format. This library allows us to
  interact with the Git data structures.
  """
-import os, zlib, time, subprocess, struct, stat, re, tempfile
-import heapq
+import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
+from collections import namedtuple
+
  from bup.helpers import *
  from bup.helpers import *
-from bup import _helpers
+from bup import _helpers, path, midx, bloom, xstat
+
+max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
+max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
  
  verbose = 0
  ignore_midx = 0
  
  verbose = 0
  ignore_midx = 0
-home_repodir = os.path.expanduser('~/.bup')
  repodir = None
  
  _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  repodir = None
  
  _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
@@ -23,6 +26,66 @@ class GitError(Exception):
      pass
  
  
      pass
  
  
+def parse_tz_offset(s):
+    """UTC offset in seconds."""
+    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
+    if s[0] == '-':
+        return - tz_off
+    return tz_off
+
+
+# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
+# Make sure that's authoritative.
+_start_end_char = r'[^ .,:;<>"\'\0\n]'
+_content_char = r'[^\0\n<>]'
+_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
+    % (_start_end_char,
+       _start_end_char, _content_char, _start_end_char)
+_tz_rx = r'[-+]\d\d[0-5]\d'
+_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
+_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
+(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
+committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)
+
+(?P<message>(?:.|\n)*)''' % (_parent_rx,
+                             _safe_str_rx, _safe_str_rx, _tz_rx,
+                             _safe_str_rx, _safe_str_rx, _tz_rx))
+_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
+
+
+# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
+CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
+                                       'author_name', 'author_mail',
+                                       'author_sec', 'author_offset',
+                                       'committer_name', 'committer_mail',
+                                       'committer_sec', 'committer_offset',
+                                       'message'])
+
+def parse_commit(content):
+    commit_match = re.match(_commit_rx, content)
+    if not commit_match:
+        raise Exception('cannot parse commit %r' % content)
+    matches = commit_match.groupdict()
+    return CommitInfo(tree=matches['tree'],
+                      parents=re.findall(_parent_hash_rx, matches['parents']),
+                      author_name=matches['author_name'],
+                      author_mail=matches['author_mail'],
+                      author_sec=int(matches['asec']),
+                      author_offset=parse_tz_offset(matches['atz']),
+                      committer_name=matches['committer_name'],
+                      committer_mail=matches['committer_mail'],
+                      committer_sec=int(matches['csec']),
+                      committer_offset=parse_tz_offset(matches['ctz']),
+                      message=matches['message'])
+
+
+def get_commit_items(id, cp):
+    commit_it = cp.get(id)
+    assert(commit_it.next() == 'commit')
+    commit_content = ''.join(commit_it)
+    return parse_commit(commit_content)
+
+
  def repo(sub = ''):
      """Get the path to the git repository or one of its subdirectories."""
      global repodir
  def repo(sub = ''):
      """Get the path to the git repository or one of its subdirectories."""
      global repodir
@@ -37,13 +100,59 @@ def repo(sub = ''):
      return os.path.join(repodir, sub)
  
  
      return os.path.join(repodir, sub)
  
  
+def shorten_hash(s):
+    return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
+                  r'\1\2*\3', s)
+
+
+def repo_rel(path):
+    full = os.path.abspath(path)
+    fullrepo = os.path.abspath(repo(''))
+    if not fullrepo.endswith('/'):
+        fullrepo += '/'
+    if full.startswith(fullrepo):
+        path = full[len(fullrepo):]
+    if path.startswith('index-cache/'):
+        path = path[len('index-cache/'):]
+    return shorten_hash(path)
+
+
+def all_packdirs():
+    paths = [repo('objects/pack')]
+    paths += glob.glob(repo('index-cache/*/.'))
+    return paths
+
+
+def auto_midx(objdir):
+    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
+    try:
+        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
+    except OSError, e:
+        # make sure 'args' gets printed to help with debugging
+        add_error('%r: exception: %s' % (args, e))
+        raise
+    if rv:
+        add_error('%r: returned %d' % (args, rv))
+
+    args = [path.exe(), 'bloom', '--dir', objdir]
+    try:
+        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
+    except OSError, e:
+        # make sure 'args' gets printed to help with debugging
+        add_error('%r: exception: %s' % (args, e))
+        raise
+    if rv:
+        add_error('%r: returned %d' % (args, rv))
+
+
  def mangle_name(name, mode, gitmode):
      """Mangle a file name to present an abstract name for segmented files.
      Mangled file names will have the ".bup" extension added to them. If a
      file's name already ends with ".bup", a ".bupl" extension is added to
  def mangle_name(name, mode, gitmode):
      """Mangle a file name to present an abstract name for segmented files.
      Mangled file names will have the ".bup" extension added to them. If a
      file's name already ends with ".bup", a ".bupl" extension is added to
-    disambiguate normal files from semgmented ones.
+    disambiguate normal files from segmented ones.
      """
      if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
      """
      if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
+        assert(stat.S_ISDIR(gitmode))
          return name + '.bup'
      elif name.endswith('.bup') or name[:-1].endswith('.bup'):
          return name + '.bupl'
          return name + '.bup'
      elif name.endswith('.bup') or name[:-1].endswith('.bup'):
          return name + '.bupl'
@@ -59,9 +168,9 @@ def demangle_name(name):
      the following:
  
      * BUP_NORMAL  : files that should be read as-is from the repository
      the following:
  
      * BUP_NORMAL  : files that should be read as-is from the repository
-    * BUP_CHUNKED : files that were chunked and need to be assembled
+    * BUP_CHUNKED : files that were chunked and need to be reassembled
  
  
-    For more information on the name mangling algorythm, see mangle_name()
+    For more information on the name mangling algorithm, see mangle_name()
      """
      if name.endswith('.bupl'):
          return (name[:-5], BUP_NORMAL)
      """
      if name.endswith('.bupl'):
          return (name[:-5], BUP_NORMAL)
@@ -71,7 +180,53 @@ def demangle_name(name):
          return (name, BUP_NORMAL)
  
  
          return (name, BUP_NORMAL)
  
  
-def _encode_packobj(type, content):
+def calc_hash(type, content):
+    """Calculate some content's hash in the Git fashion."""
+    header = '%s %d\0' % (type, len(content))
+    sum = Sha1(header)
+    sum.update(content)
+    return sum.digest()
+
+
+def shalist_item_sort_key(ent):
+    (mode, name, id) = ent
+    assert(mode+0 == mode)
+    if stat.S_ISDIR(mode):
+        return name + '/'
+    else:
+        return name
+
+
+def tree_encode(shalist):
+    """Generate a git tree object from (mode,name,hash) tuples."""
+    shalist = sorted(shalist, key = shalist_item_sort_key)
+    l = []
+    for (mode,name,bin) in shalist:
+        assert(mode)
+        assert(mode+0 == mode)
+        assert(name)
+        assert(len(bin) == 20)
+        s = '%o %s\0%s' % (mode,name,bin)
+        assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
+        l.append(s)
+    return ''.join(l)
+
+
+def tree_decode(buf):
+    """Generate a list of (mode,name,hash) from the git tree object in buf."""
+    ofs = 0
+    while ofs < len(buf):
+        z = buf.find('\0', ofs)
+        assert(z > ofs)
+        spl = buf[ofs:z].split(' ', 1)
+        assert(len(spl) == 2)
+        mode,name = spl
+        sha = buf[z+1:z+1+20]
+        ofs = z+1+20
+        yield (int(mode, 8), name, sha)
+
+
+def _encode_packobj(type, content, compression_level=1):
      szout = ''
      sz = len(content)
      szbits = (sz & 0x0f) | (_typemap[type]<<4)
      szout = ''
      sz = len(content)
      szbits = (sz & 0x0f) | (_typemap[type]<<4)
@@ -83,14 +238,18 @@ def _encode_packobj(type, content):
              break
          szbits = sz & 0x7f
          sz >>= 7
              break
          szbits = sz & 0x7f
          sz >>= 7
-    z = zlib.compressobj(1)
+    if compression_level > 9:
+        compression_level = 9
+    elif compression_level < 0:
+        compression_level = 0
+    z = zlib.compressobj(compression_level)
      yield szout
      yield z.compress(content)
      yield z.flush()
  
  
      yield szout
      yield z.compress(content)
      yield z.flush()
  
  
-def _encode_looseobj(type, content):
-    z = zlib.compressobj(1)
+def _encode_looseobj(type, content, compression_level=1):
+    z = zlib.compressobj(compression_level)
      yield z.compress('%s %d\0' % (type, len(content)))
      yield z.compress(content)
      yield z.flush()
      yield z.compress('%s %d\0' % (type, len(content)))
      yield z.compress(content)
      yield z.flush()
@@ -128,28 +287,24 @@ def _decode_packobj(buf):
  
  
  class PackIdx:
  
  
  class PackIdx:
-    """Object representation of a Git pack index file."""
-    def __init__(self, filename):
-        self.name = filename
-        self.map = mmap_read(open(filename))
-        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
-        self.fanout = list(struct.unpack('!256I',
-                                         str(buffer(self.map, 8, 256*4))))
-        self.fanout.append(0)  # entry "-1"
-        nsha = self.fanout[255]
-        self.ofstable = buffer(self.map,
-                               8 + 256*4 + nsha*20 + nsha*4,
-                               nsha*4)
-        self.ofs64table = buffer(self.map,
-                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
+    def __init__(self):
+        assert(0)
  
  
-    def _ofs_from_idx(self, idx):
-        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
-        if ofs & 0x80000000:
-            idx64 = ofs & 0x7fffffff
-            ofs = struct.unpack('!I',
-                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
-        return ofs
+    def find_offset(self, hash):
+        """Get the offset of an object inside the index file."""
+        idx = self._idx_from_hash(hash)
+        if idx != None:
+            return self._ofs_from_idx(idx)
+        return None
+
+    def exists(self, hash, want_source=False):
+        """Return nonempty if the object exists in this index."""
+        if hash and (self._idx_from_hash(hash) != None):
+            return want_source and os.path.basename(self.name) or True
+        return None
+
+    def __len__(self):
+        return int(self.fanout[255])
  
      def _idx_from_hash(self, hash):
          global _total_searches, _total_steps
  
      def _idx_from_hash(self, hash):
          global _total_searches, _total_steps
@@ -158,13 +313,12 @@ class PackIdx:
          b1 = ord(hash[0])
          start = self.fanout[b1-1] # range -1..254
          end = self.fanout[b1] # range 0..255
          b1 = ord(hash[0])
          start = self.fanout[b1-1] # range -1..254
          end = self.fanout[b1] # range 0..255
-        buf = buffer(self.map, 8 + 256*4, end*20)
          want = str(hash)
          _total_steps += 1  # lookup table is a step
          while start < end:
              _total_steps += 1
              mid = start + (end-start)/2
          want = str(hash)
          _total_steps += 1  # lookup table is a step
          while start < end:
              _total_steps += 1
              mid = start + (end-start)/2
-            v = str(buf[mid*20:(mid+1)*20])
+            v = self._idx_to_hash(mid)
              if v < want:
                  start = mid+1
              elif v > want:
              if v < want:
                  start = mid+1
              elif v > want:
@@ -173,108 +327,64 @@ class PackIdx:
                  return mid
          return None
  
                  return mid
          return None
  
-    def find_offset(self, hash):
-        """Get the offset of an object inside the index file."""
-        idx = self._idx_from_hash(hash)
-        if idx != None:
-            return self._ofs_from_idx(idx)
-        return None
  
  
-    def exists(self, hash):
-        """Return nonempty if the object exists in this index."""
-        return hash and (self._idx_from_hash(hash) != None) and True or None
+class PackIdxV1(PackIdx):
+    """Object representation of a Git pack index (version 1) file."""
+    def __init__(self, filename, f):
+        self.name = filename
+        self.idxnames = [self.name]
+        self.map = mmap_read(f)
+        self.fanout = list(struct.unpack('!256I',
+                                         str(buffer(self.map, 0, 256*4))))
+        self.fanout.append(0)  # entry "-1"
+        nsha = self.fanout[255]
+        self.sha_ofs = 256*4
+        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
+
+    def _ofs_from_idx(self, idx):
+        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
+
+    def _idx_to_hash(self, idx):
+        return str(self.shatable[idx*24+4 : idx*24+24])
  
      def __iter__(self):
          for i in xrange(self.fanout[255]):
  
      def __iter__(self):
          for i in xrange(self.fanout[255]):
-            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
-
-    def __len__(self):
-        return int(self.fanout[255])
+            yield buffer(self.map, 256*4 + 24*i + 4, 20)
  
  
  
  
-def extract_bits(buf, nbits):
-    """Take the first 'nbits' bits from 'buf' and return them as an integer."""
-    mask = (1<<nbits) - 1
-    v = _helpers.firstword(buf)
-    v = (v >> (32-nbits)) & mask
-    return v
+class PackIdxV2(PackIdx):
+    """Object representation of a Git pack index (version 2) file."""
+    def __init__(self, filename, f):
+        self.name = filename
+        self.idxnames = [self.name]
+        self.map = mmap_read(f)
+        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
+        self.fanout = list(struct.unpack('!256I',
+                                         str(buffer(self.map, 8, 256*4))))
+        self.fanout.append(0)  # entry "-1"
+        nsha = self.fanout[255]
+        self.sha_ofs = 8 + 256*4
+        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
+        self.ofstable = buffer(self.map,
+                               self.sha_ofs + nsha*20 + nsha*4,
+                               nsha*4)
+        self.ofs64table = buffer(self.map,
+                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
  
  
+    def _ofs_from_idx(self, idx):
+        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
+        if ofs & 0x80000000:
+            idx64 = ofs & 0x7fffffff
+            ofs = struct.unpack('!Q',
+                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
+        return ofs
  
  
-class PackMidx:
-    """Wrapper which contains data from multiple index files.
-    Multiple index (.midx) files constitute a wrapper around index (.idx) files
-    and make it possible for bup to expand Git's indexing capabilities to vast
-    amounts of files.
-    """
-    def __init__(self, filename):
-        self.name = filename
-        assert(filename.endswith('.midx'))
-        self.map = mmap_read(open(filename))
-        if str(self.map[0:8]) == 'MIDX\0\0\0\1':
-            log('Warning: ignoring old-style midx %r\n' % filename)
-            self.bits = 0
-            self.entries = 1
-            self.fanout = buffer('\0\0\0\0')
-            self.shalist = buffer('\0'*20)
-            self.idxnames = []
-        else:
-            assert(str(self.map[0:8]) == 'MIDX\0\0\0\2')
-            self.bits = _helpers.firstword(self.map[8:12])
-            self.entries = 2**self.bits
-            self.fanout = buffer(self.map, 12, self.entries*4)
-            shaofs = 12 + self.entries*4
-            nsha = self._fanget(self.entries-1)
-            self.shalist = buffer(self.map, shaofs, nsha*20)
-            self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
-
-    def _fanget(self, i):
-        start = i*4
-        s = self.fanout[start:start+4]
-        return _helpers.firstword(s)
-
-    def _get(self, i):
-        return str(self.shalist[i*20:(i+1)*20])
-
-    def exists(self, hash):
-        """Return nonempty if the object exists in the index files."""
-        global _total_searches, _total_steps
-        _total_searches += 1
-        want = str(hash)
-        el = extract_bits(want, self.bits)
-        if el:
-            start = self._fanget(el-1)
-            startv = el << (32-self.bits)
-        else:
-            start = 0
-            startv = 0
-        end = self._fanget(el)
-        endv = (el+1) << (32-self.bits)
-        _total_steps += 1   # lookup table is a step
-        hashv = _helpers.firstword(hash)
-        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
-        while start < end:
-            _total_steps += 1
-            #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
-            mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
-            #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
-            v = self._get(mid)
-            #print '    %08x' % self._num(v)
-            if v < want:
-                start = mid+1
-                startv = _helpers.firstword(v)
-            elif v > want:
-                end = mid
-                endv = _helpers.firstword(v)
-            else: # got it!
-                return True
-        return None
+    def _idx_to_hash(self, idx):
+        return str(self.shatable[idx*20:(idx+1)*20])
  
      def __iter__(self):
  
      def __iter__(self):
-        for i in xrange(self._fanget(self.entries-1)):
-            yield buffer(self.shalist, i*20, 20)
-
-    def __len__(self):
-        return int(self._fanget(self.entries-1))
+        for i in xrange(self.fanout[255]):
+            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
  
  
  _mpi_count = 0
  
  
  _mpi_count = 0
@@ -284,8 +394,10 @@ class PackIdxList:
          assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
          _mpi_count += 1
          self.dir = dir
          assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
          _mpi_count += 1
          self.dir = dir
-        self.also = {}
+        self.also = set()
          self.packs = []
          self.packs = []
+        self.do_bloom = False
+        self.bloom = None
          self.refresh()
  
      def __del__(self):
          self.refresh()
  
      def __del__(self):
@@ -299,19 +411,27 @@ class PackIdxList:
      def __len__(self):
          return sum(len(pack) for pack in self.packs)
  
      def __len__(self):
          return sum(len(pack) for pack in self.packs)
  
-    def exists(self, hash):
+    def exists(self, hash, want_source=False):
          """Return nonempty if the object exists in the index files."""
          global _total_searches
          _total_searches += 1
          if hash in self.also:
              return True
          """Return nonempty if the object exists in the index files."""
          global _total_searches
          _total_searches += 1
          if hash in self.also:
              return True
-        for i in range(len(self.packs)):
+        if self.do_bloom and self.bloom:
+            if self.bloom.exists(hash):
+                self.do_bloom = False
+            else:
+                _total_searches -= 1  # was counted by bloom
+                return None
+        for i in xrange(len(self.packs)):
              p = self.packs[i]
              _total_searches -= 1  # will be incremented by sub-pack
              p = self.packs[i]
              _total_searches -= 1  # will be incremented by sub-pack
-            if p.exists(hash):
+            ix = p.exists(hash, want_source=want_source)
+            if ix:
                  # reorder so most recently used packs are searched first
                  self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                  # reorder so most recently used packs are searched first
                  self.packs = [p] + self.packs[:i] + self.packs[i+1:]
-                return p.name
+                return ix
+        self.do_bloom = True
          return None
  
      def refresh(self, skip_midx = False):
          return None
  
      def refresh(self, skip_midx = False):
@@ -326,134 +446,139 @@ class PackIdxList:
          The module-global variable 'ignore_midx' can force this function to
          always act as if skip_midx was True.
          """
          The module-global variable 'ignore_midx' can force this function to
          always act as if skip_midx was True.
          """
+        self.bloom = None # Always reopen the bloom as it may have been replaced
+        self.do_bloom = False
          skip_midx = skip_midx or ignore_midx
          d = dict((p.name, p) for p in self.packs
          skip_midx = skip_midx or ignore_midx
          d = dict((p.name, p) for p in self.packs
-                 if not skip_midx or not isinstance(p, PackMidx))
+                 if not skip_midx or not isinstance(p, midx.PackMidx))
          if os.path.exists(self.dir):
              if not skip_midx:
                  midxl = []
                  for ix in self.packs:
          if os.path.exists(self.dir):
              if not skip_midx:
                  midxl = []
                  for ix in self.packs:
-                    if isinstance(ix, PackMidx):
+                    if isinstance(ix, midx.PackMidx):
                          for name in ix.idxnames:
                              d[os.path.join(self.dir, name)] = ix
                          for name in ix.idxnames:
                              d[os.path.join(self.dir, name)] = ix
-                for f in os.listdir(self.dir):
-                    full = os.path.join(self.dir, f)
-                    if f.endswith('.midx') and not d.get(full):
-                        mx = PackMidx(full)
+                for full in glob.glob(os.path.join(self.dir,'*.midx')):
+                    if not d.get(full):
+                        mx = midx.PackMidx(full)
                          (mxd, mxf) = os.path.split(mx.name)
                          (mxd, mxf) = os.path.split(mx.name)
-                        broken = 0
+                        broken = False
                          for n in mx.idxnames:
                              if not os.path.exists(os.path.join(mxd, n)):
                                  log(('warning: index %s missing\n' +
                                      '  used by %s\n') % (n, mxf))
                          for n in mx.idxnames:
                              if not os.path.exists(os.path.join(mxd, n)):
                                  log(('warning: index %s missing\n' +
                                      '  used by %s\n') % (n, mxf))
-                                broken += 1
-                        if not broken:
+                                broken = True
+                        if broken:
+                            mx.close()
+                            del mx
+                            unlink(full)
+                        else:
                              midxl.append(mx)
                              midxl.append(mx)
-                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
+                midxl.sort(key=lambda ix:
+                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                  for ix in midxl:
                  for ix in midxl:
-                    any = 0
+                    any_needed = False
                      for sub in ix.idxnames:
                          found = d.get(os.path.join(self.dir, sub))
                          if not found or isinstance(found, PackIdx):
                              # doesn't exist, or exists but not in a midx
                      for sub in ix.idxnames:
                          found = d.get(os.path.join(self.dir, sub))
                          if not found or isinstance(found, PackIdx):
                              # doesn't exist, or exists but not in a midx
-                            d[ix.name] = ix
-                            for name in ix.idxnames:
-                                d[os.path.join(self.dir, name)] = ix
-                            any += 1
+                            any_needed = True
                              break
                              break
-                    if not any:
-                        log('midx: removing redundant: %s\n'
-                            % os.path.basename(ix.name))
+                    if any_needed:
+                        d[ix.name] = ix
+                        for name in ix.idxnames:
+                            d[os.path.join(self.dir, name)] = ix
+                    elif not ix.force_keep:
+                        debug1('midx: removing redundant: %s\n'
+                               % os.path.basename(ix.name))
+                        ix.close()
                          unlink(ix.name)
                          unlink(ix.name)
-            for f in os.listdir(self.dir):
-                full = os.path.join(self.dir, f)
-                if f.endswith('.idx') and not d.get(full):
-                    ix = PackIdx(full)
+            for full in glob.glob(os.path.join(self.dir,'*.idx')):
+                if not d.get(full):
+                    try:
+                        ix = open_idx(full)
+                    except GitError, e:
+                        add_error(e)
+                        continue
                      d[full] = ix
                      d[full] = ix
+            bfull = os.path.join(self.dir, 'bup.bloom')
+            if self.bloom is None and os.path.exists(bfull):
+                self.bloom = bloom.ShaBloom(bfull)
              self.packs = list(set(d.values()))
              self.packs = list(set(d.values()))
-        log('PackIdxList: using %d index%s.\n'
+            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
+            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
+                self.do_bloom = True
+            else:
+                self.bloom = None
+        debug1('PackIdxList: using %d index%s.\n'
              % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
  
      def add(self, hash):
          """Insert an additional object in the list."""
              % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
  
      def add(self, hash):
          """Insert an additional object in the list."""
-        self.also[hash] = 1
-
-    def zap_also(self):
-        """Remove all additional objects from the list."""
-        self.also = {}
+        self.also.add(hash)
  
  
  
  
-def calc_hash(type, content):
-    """Calculate some content's hash in the Git fashion."""
-    header = '%s %d\0' % (type, len(content))
-    sum = Sha1(header)
-    sum.update(content)
-    return sum.digest()
-
-
-def _shalist_sort_key(ent):
-    (mode, name, id) = ent
-    if stat.S_ISDIR(int(mode, 8)):
-        return name + '/'
+def open_idx(filename):
+    if filename.endswith('.idx'):
+        f = open(filename, 'rb')
+        header = f.read(8)
+        if header[0:4] == '\377tOc':
+            version = struct.unpack('!I', header[4:8])[0]
+            if version == 2:
+                return PackIdxV2(filename, f)
+            else:
+                raise GitError('%s: expected idx file version 2, got %d'
+                               % (filename, version))
+        elif len(header) == 8 and header[0:4] < '\377tOc':
+            return PackIdxV1(filename, f)
+        else:
+            raise GitError('%s: unrecognized idx file header' % filename)
+    elif filename.endswith('.midx'):
+        return midx.PackMidx(filename)
      else:
      else:
-        return name
+        raise GitError('idx filenames must end with .idx or .midx')
  
  
  
  
-def idxmerge(idxlist):
+def idxmerge(idxlist, final_progress=True):
      """Generate a list of all the objects reachable in a PackIdxList."""
      """Generate a list of all the objects reachable in a PackIdxList."""
-    total = sum(len(i) for i in idxlist)
-    iters = (iter(i) for i in idxlist)
-    heap = [(next(it), it) for it in iters]
-    heapq.heapify(heap)
-    count = 0
-    last = None
-    while heap:
-        if (count % 10024) == 0:
-            progress('Reading indexes: %.2f%% (%d/%d)\r'
-                     % (count*100.0/total, count, total))
-        (e, it) = heap[0]
-        if e != last:
-            yield e
-            last = e
-        count += 1
-        e = next(it)
-        if e:
-            heapq.heapreplace(heap, (e, it))
-        else:
-            heapq.heappop(heap)
-    log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
+    def pfunc(count, total):
+        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
+                  % (count*100.0/total, count, total))
+    def pfinal(count, total):
+        if final_progress:
+            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
+                     % (100, total, total))
+    return merge_iter(idxlist, 10024, pfunc, pfinal)
+
  
  
+def _make_objcache():
+    return PackIdxList(repo('objects/pack'))
  
  class PackWriter:
  
  class PackWriter:
-    """Writes Git objects insid a pack file."""
-    def __init__(self, objcache_maker=None):
+    """Writes Git objects inside a pack file."""
+    def __init__(self, objcache_maker=_make_objcache, compression_level=1):
          self.count = 0
          self.outbytes = 0
          self.filename = None
          self.file = None
          self.count = 0
          self.outbytes = 0
          self.filename = None
          self.file = None
+        self.idx = None
          self.objcache_maker = objcache_maker
          self.objcache = None
          self.objcache_maker = objcache_maker
          self.objcache = None
+        self.compression_level = compression_level
  
      def __del__(self):
          self.close()
  
  
      def __del__(self):
          self.close()
  
-    def _make_objcache(self):
-        if self.objcache == None:
-            if self.objcache_maker:
-                self.objcache = self.objcache_maker()
-            else:
-                self.objcache = PackIdxList(repo('objects/pack'))
-
      def _open(self):
          if not self.file:
      def _open(self):
          if not self.file:
-            self._make_objcache()
              (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
              self.file = os.fdopen(fd, 'w+b')
              assert(name.endswith('.pack'))
              self.filename = name[:-5]
              self.file.write('PACK\0\0\0\2\0\0\0\0')
              (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
              self.file = os.fdopen(fd, 'w+b')
              assert(name.endswith('.pack'))
              self.filename = name[:-5]
              self.file.write('PACK\0\0\0\2\0\0\0\0')
+            self.idx = list(list() for i in xrange(256))
  
  
-    def _raw_write(self, datalist):
+    def _raw_write(self, datalist, sha):
          self._open()
          f = self.file
          # in case we get interrupted (eg. KeyboardInterrupt), it's best if
          self._open()
          f = self.file
          # in case we get interrupted (eg. KeyboardInterrupt), it's best if
@@ -462,15 +587,33 @@ class PackWriter:
          # to our hashsplit algorithm.)  f.write() does its own buffering,
          # but that's okay because we'll flush it in _end().
          oneblob = ''.join(datalist)
          # to our hashsplit algorithm.)  f.write() does its own buffering,
          # but that's okay because we'll flush it in _end().
          oneblob = ''.join(datalist)
-        f.write(oneblob)
-        self.outbytes += len(oneblob)
+        try:
+            f.write(oneblob)
+        except IOError, e:
+            raise GitError, e, sys.exc_info()[2]
+        nw = len(oneblob)
+        crc = zlib.crc32(oneblob) & 0xffffffff
+        self._update_idx(sha, crc, nw)
+        self.outbytes += nw
          self.count += 1
          self.count += 1
+        return nw, crc
  
  
-    def _write(self, bin, type, content):
+    def _update_idx(self, sha, crc, size):
+        assert(sha)
+        if self.idx:
+            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
+
+    def _write(self, sha, type, content):
          if verbose:
              log('>')
          if verbose:
              log('>')
-        self._raw_write(_encode_packobj(type, content))
-        return bin
+        if not sha:
+            sha = calc_hash(type, content)
+        size, crc = self._raw_write(_encode_packobj(type, content,
+                                                    self.compression_level),
+                                    sha=sha)
+        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
+            self.breakpoint()
+        return sha
  
      def breakpoint(self):
          """Clear byte and object counts and return the last processed id."""
  
      def breakpoint(self):
          """Clear byte and object counts and return the last processed id."""
@@ -478,23 +621,26 @@ class PackWriter:
          self.outbytes = self.count = 0
          return id
  
          self.outbytes = self.count = 0
          return id
  
-    def write(self, type, content):
-        """Write an object in this pack file."""
-        return self._write(calc_hash(type, content), type, content)
+    def _require_objcache(self):
+        if self.objcache is None and self.objcache_maker:
+            self.objcache = self.objcache_maker()
+        if self.objcache is None:
+            raise GitError(
+                    "PackWriter not opened or can't check exists w/o objcache")
  
  
-    def exists(self, id):
+    def exists(self, id, want_source=False):
          """Return non-empty if an object is found in the object cache."""
          """Return non-empty if an object is found in the object cache."""
-        if not self.objcache:
-            self._make_objcache()
-        return self.objcache.exists(id)
+        self._require_objcache()
+        return self.objcache.exists(id, want_source=want_source)
  
      def maybe_write(self, type, content):
          """Write an object to the pack file if not present and return its id."""
  
      def maybe_write(self, type, content):
          """Write an object to the pack file if not present and return its id."""
-        bin = calc_hash(type, content)
-        if not self.exists(bin):
-            self._write(bin, type, content)
-            self.objcache.add(bin)
-        return bin
+        sha = calc_hash(type, content)
+        if not self.exists(sha):
+            self._write(sha, type, content)
+            self._require_objcache()
+            self.objcache.add(sha)
+        return sha
  
      def new_blob(self, blob):
          """Create a blob object in the pack with the supplied content."""
  
      def new_blob(self, blob):
          """Create a blob object in the pack with the supplied content."""
@@ -502,16 +648,8 @@ class PackWriter:
  
      def new_tree(self, shalist):
          """Create a tree object in the pack."""
  
      def new_tree(self, shalist):
          """Create a tree object in the pack."""
-        shalist = sorted(shalist, key = _shalist_sort_key)
-        l = []
-        for (mode,name,bin) in shalist:
-            assert(mode)
-            assert(mode != '0')
-            assert(mode[0] != '0')
-            assert(name)
-            assert(len(bin) == 20)
-            l.append('%s %s\0%s' % (mode,name,bin))
-        return self.maybe_write('tree', ''.join(l))
+        content = tree_encode(shalist)
+        return self.maybe_write('tree', content)
  
      def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
          l = []
  
      def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
          l = []
@@ -523,12 +661,11 @@ class PackWriter:
          l.append(msg)
          return self.maybe_write('commit', '\n'.join(l))
  
          l.append(msg)
          return self.maybe_write('commit', '\n'.join(l))
  
-    def new_commit(self, parent, tree, msg):
+    def new_commit(self, parent, tree, date, msg):
          """Create a commit object in the pack."""
          """Create a commit object in the pack."""
-        now = time.time()
          userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
          commit = self._new_commit(tree, parent,
          userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
          commit = self._new_commit(tree, parent,
-                                  userline, now, userline, now,
+                                  userline, date, userline, date,
                                    msg)
          return commit
  
                                    msg)
          return commit
  
@@ -536,15 +673,18 @@ class PackWriter:
          """Remove the pack file from disk."""
          f = self.file
          if f:
          """Remove the pack file from disk."""
          f = self.file
          if f:
+            self.idx = None
              self.file = None
              f.close()
              os.unlink(self.filename + '.pack')
  
              self.file = None
              f.close()
              os.unlink(self.filename + '.pack')
  
-    def _end(self):
+    def _end(self, run_midx=True):
          f = self.file
          if not f: return None
          self.file = None
          self.objcache = None
          f = self.file
          if not f: return None
          self.file = None
          self.objcache = None
+        idx = self.idx
+        self.idx = None
  
          # update object count
          f.seek(8)
  
          # update object count
          f.seek(8)
@@ -555,41 +695,80 @@ class PackWriter:
          # calculate the pack sha1sum
          f.seek(0)
          sum = Sha1()
          # calculate the pack sha1sum
          f.seek(0)
          sum = Sha1()
-        while 1:
-            b = f.read(65536)
+        for b in chunkyreader(f):
              sum.update(b)
              sum.update(b)
-            if not b: break
-        f.write(sum.digest())
-
+        packbin = sum.digest()
+        f.write(packbin)
          f.close()
  
          f.close()
  
-        p = subprocess.Popen(['git', 'index-pack', '-v',
-                              '--index-version=2',
-                              self.filename + '.pack'],
-                             preexec_fn = _gitenv,
-                             stdout = subprocess.PIPE)
-        out = p.stdout.read().strip()
-        _git_wait('git index-pack', p)
-        if not out:
-            raise GitError('git index-pack produced no output')
-        nameprefix = repo('objects/pack/%s' % out)
+        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
+
+        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
          if os.path.exists(self.filename + '.map'):
              os.unlink(self.filename + '.map')
          os.rename(self.filename + '.pack', nameprefix + '.pack')
          os.rename(self.filename + '.idx', nameprefix + '.idx')
          if os.path.exists(self.filename + '.map'):
              os.unlink(self.filename + '.map')
          os.rename(self.filename + '.pack', nameprefix + '.pack')
          os.rename(self.filename + '.idx', nameprefix + '.idx')
+
+        if run_midx:
+            auto_midx(repo('objects/pack'))
          return nameprefix
  
          return nameprefix
  
-    def close(self):
+    def close(self, run_midx=True):
          """Close the pack file and move it to its definitive path."""
          """Close the pack file and move it to its definitive path."""
-        return self._end()
+        return self._end(run_midx=run_midx)
+
+    def _write_pack_idx_v2(self, filename, idx, packbin):
+        ofs64_count = 0
+        for section in idx:
+            for entry in section:
+                if entry[2] >= 2**31:
+                    ofs64_count += 1
+
+        # Length: header + fan-out + shas-and-crcs + overflow-offsets
+        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
+        idx_map = None
+        idx_f = open(filename, 'w+b')
+        try:
+            idx_f.truncate(index_len)
+            idx_map = mmap_readwrite(idx_f, close=False)
+            count = _helpers.write_idx(filename, idx_map, idx, self.count)
+            assert(count == self.count)
+        finally:
+            if idx_map: idx_map.close()
+            idx_f.close()
+
+        idx_f = open(filename, 'a+b')
+        try:
+            idx_f.write(packbin)
+            idx_f.seek(0)
+            idx_sum = Sha1()
+            b = idx_f.read(8 + 4*256)
+            idx_sum.update(b)
+
+            obj_list_sum = Sha1()
+            for b in chunkyreader(idx_f, 20*self.count):
+                idx_sum.update(b)
+                obj_list_sum.update(b)
+            namebase = obj_list_sum.hexdigest()
+
+            for b in chunkyreader(idx_f):
+                idx_sum.update(b)
+            idx_f.write(idx_sum.digest())
+            return namebase
+        finally:
+            idx_f.close()
  
  
  def _git_date(date):
      return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
  
  
  
  
  def _git_date(date):
      return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
  
  
-def _gitenv():
-    os.environ['GIT_DIR'] = os.path.abspath(repo())
+def _gitenv(repo_dir = None):
+    if not repo_dir:
+        repo_dir = repo()
+    def env():
+        os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
+    return env
  
  
  def list_refs(refname = None):
  
  
  def list_refs(refname = None):
@@ -599,7 +778,7 @@ def list_refs(refname = None):
      argv = ['git', 'show-ref', '--']
      if refname:
          argv += [refname]
      argv = ['git', 'show-ref', '--']
      if refname:
          argv += [refname]
-    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
+    p = subprocess.Popen(argv, preexec_fn = _gitenv(), stdout = subprocess.PIPE)
      out = p.stdout.read().strip()
      rv = p.wait()  # not fatal
      if rv:
      out = p.stdout.read().strip()
      rv = p.wait()  # not fatal
      if rv:
@@ -634,8 +813,8 @@ def rev_list(ref, count=None):
      opts = []
      if count:
          opts += ['-n', str(atoi(count))]
      opts = []
      if count:
          opts += ['-n', str(atoi(count))]
-    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
-    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
+    argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
+    p = subprocess.Popen(argv, preexec_fn = _gitenv(), stdout = subprocess.PIPE)
      commit = None
      for row in p.stdout:
          s = row.strip()
      commit = None
      for row in p.stdout:
          s = row.strip()
@@ -649,11 +828,42 @@ def rev_list(ref, count=None):
          raise GitError, 'git rev-list returned error %d' % rv
  
  
          raise GitError, 'git rev-list returned error %d' % rv
  
  
-def rev_get_date(ref):
-    """Get the date of the latest commit on the specified ref."""
-    for (date, commit) in rev_list(ref, count=1):
-        return date
-    raise GitError, 'no such commit %r' % ref
+def get_commit_dates(refs):
+    """Get the dates for the specified commit refs.  For now, every unique
+       string in refs must resolve to a different commit or this
+       function will fail."""
+    result = []
+    for ref in refs:
+        commit = get_commit_items(ref, cp())
+        result.append(commit.author_sec)
+    return result
+
+
+def rev_parse(committish):
+    """Resolve the full hash for 'committish', if it exists.
+
+    Should be roughly equivalent to 'git rev-parse'.
+
+    Returns the hex value of the hash if it is found, None if 'committish' does
+    not correspond to anything.
+    """
+    head = read_ref(committish)
+    if head:
+        debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
+        return head
+
+    pL = PackIdxList(repo('objects/pack'))
+
+    if len(committish) == 40:
+        try:
+            hash = committish.decode('hex')
+        except TypeError:
+            return None
+
+        if pL.exists(hash):
+            return hash
+
+    return None
  
  
  def update_ref(refname, newval, oldval):
  
  
  def update_ref(refname, newval, oldval):
@@ -663,7 +873,7 @@ def update_ref(refname, newval, oldval):
      assert(refname.startswith('refs/heads/'))
      p = subprocess.Popen(['git', 'update-ref', refname,
                            newval.encode('hex'), oldval.encode('hex')],
      assert(refname.startswith('refs/heads/'))
      p = subprocess.Popen(['git', 'update-ref', refname,
                            newval.encode('hex'), oldval.encode('hex')],
-                         preexec_fn = _gitenv)
+                         preexec_fn = _gitenv())
      _git_wait('git update-ref', p)
  
  
      _git_wait('git update-ref', p)
  
  
@@ -686,16 +896,23 @@ def guess_repo(path=None):
  def init_repo(path=None):
      """Create the Git bare repository for bup in a given path."""
      guess_repo(path)
  def init_repo(path=None):
      """Create the Git bare repository for bup in a given path."""
      guess_repo(path)
-    d = repo()
+    d = repo()  # appends a / to the path
+    parent = os.path.dirname(os.path.dirname(d))
+    if parent and not os.path.exists(parent):
+        raise GitError('parent directory "%s" does not exist\n' % parent)
      if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
      if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
-        raise GitError('"%d" exists but is not a directory\n' % d)
+        raise GitError('"%s" exists but is not a directory\n' % d)
      p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
      p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
-                         preexec_fn = _gitenv)
+                         preexec_fn = _gitenv())
      _git_wait('git init', p)
      # Force the index version configuration in order to ensure bup works
      # regardless of the version of the installed Git binary.
      p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
      _git_wait('git init', p)
      # Force the index version configuration in order to ensure bup works
      # regardless of the version of the installed Git binary.
      p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
-                         stdout=sys.stderr, preexec_fn = _gitenv)
+                         stdout=sys.stderr, preexec_fn = _gitenv())
+    _git_wait('git config', p)
+    # Enable the reflog
+    p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
+                         stdout=sys.stderr, preexec_fn = _gitenv())
      _git_wait('git config', p)
  
  
      _git_wait('git config', p)
  
  
@@ -705,25 +922,16 @@ def check_repo_or_die(path=None):
      initializes the default repository automatically.
      """
      guess_repo(path)
      initializes the default repository automatically.
      """
      guess_repo(path)
-    if not os.path.isdir(repo('objects/pack/.')):
-        if repodir == home_repodir:
-            init_repo()
-        else:
-            log('error: %r is not a bup/git repository\n' % repo())
+    try:
+        os.stat(repo('objects/pack/.'))
+    except OSError, e:
+        if e.errno == errno.ENOENT:
+            log('error: %r is not a bup repository; run "bup init"\n'
+                % repo())
              sys.exit(15)
              sys.exit(15)
-
-
-def treeparse(buf):
-    """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
-    ofs = 0
-    while ofs < len(buf):
-        z = buf[ofs:].find('\0')
-        assert(z > 0)
-        spl = buf[ofs:ofs+z].split(' ', 1)
-        assert(len(spl) == 2)
-        sha = buf[ofs+z+1:ofs+z+1+20]
-        ofs += z+1+20
-        yield (spl[0], spl[1], sha)
+        else:
+            log('error: %s\n' % e)
+            sys.exit(14)
  
  
  _ver = None
  
  
  _ver = None
@@ -760,7 +968,7 @@ def _git_wait(cmd, p):
  
  
  def _git_capture(argv):
  
  
  def _git_capture(argv):
-    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
+    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
      r = p.stdout.read()
      _git_wait(repr(argv), p)
      return r
      r = p.stdout.read()
      _git_wait(repr(argv), p)
      return r
@@ -799,8 +1007,9 @@ class _AbortableIter:
  _ver_warned = 0
  class CatPipe:
      """Link to 'git cat-file' that is used to retrieve blob data."""
  _ver_warned = 0
  class CatPipe:
      """Link to 'git cat-file' that is used to retrieve blob data."""
-    def __init__(self):
+    def __init__(self, repo_dir = None):
          global _ver_warned
          global _ver_warned
+        self.repo_dir = repo_dir
          wanted = ('1','5','6')
          if ver() < wanted:
              if not _ver_warned:
          wanted = ('1','5','6')
          if ver() < wanted:
              if not _ver_warned:
@@ -825,24 +1034,28 @@ class CatPipe:
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    close_fds = True,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    close_fds = True,
-                                  preexec_fn = _gitenv)
+                                  bufsize = 4096,
+                                  preexec_fn = _gitenv(self.repo_dir))
  
      def _fast_get(self, id):
          if not self.p or self.p.poll() != None:
              self._restart()
          assert(self.p)
  
      def _fast_get(self, id):
          if not self.p or self.p.poll() != None:
              self._restart()
          assert(self.p)
-        assert(self.p.poll() == None)
+        poll_result = self.p.poll()
+        assert(poll_result == None)
          if self.inprogress:
          if self.inprogress:
-            log('_fast_get: opening %r while %r is open'
+            log('_fast_get: opening %r while %r is open\n'
                  % (id, self.inprogress))
          assert(not self.inprogress)
          assert(id.find('\n') < 0)
          assert(id.find('\r') < 0)
                  % (id, self.inprogress))
          assert(not self.inprogress)
          assert(id.find('\n') < 0)
          assert(id.find('\r') < 0)
-        assert(id[0] != '-')
+        assert(not id.startswith('-'))
          self.inprogress = id
          self.p.stdin.write('%s\n' % id)
          self.inprogress = id
          self.p.stdin.write('%s\n' % id)
+        self.p.stdin.flush()
          hdr = self.p.stdout.readline()
          if hdr.endswith(' missing\n'):
          hdr = self.p.stdout.readline()
          if hdr.endswith(' missing\n'):
+            self.inprogress = None
              raise KeyError('blob %r is missing' % id)
          spl = hdr.split(' ')
          if len(spl) != 3 or len(spl[0]) != 40:
              raise KeyError('blob %r is missing' % id)
          spl = hdr.split(' ')
          if len(spl) != 3 or len(spl[0]) != 40:
@@ -855,7 +1068,8 @@ class CatPipe:
              yield type
              for blob in it:
                  yield blob
              yield type
              for blob in it:
                  yield blob
-            assert(self.p.stdout.readline() == '\n')
+            readline_result = self.p.stdout.readline()
+            assert(readline_result == '\n')
              self.inprogress = None
          except Exception, e:
              it.abort()
              self.inprogress = None
          except Exception, e:
              it.abort()
@@ -870,7 +1084,7 @@ class CatPipe:
  
          p = subprocess.Popen(['git', 'cat-file', type, id],
                               stdout=subprocess.PIPE,
  
          p = subprocess.Popen(['git', 'cat-file', type, id],
                               stdout=subprocess.PIPE,
-                             preexec_fn = _gitenv)
+                             preexec_fn = _gitenv(self.repo_dir))
          for blob in chunkyreader(p.stdout):
              yield blob
          _git_wait('git cat-file', p)
          for blob in chunkyreader(p.stdout):
              yield blob
          _git_wait('git cat-file', p)
@@ -882,7 +1096,7 @@ class CatPipe:
                  yield blob
          elif type == 'tree':
              treefile = ''.join(it)
                  yield blob
          elif type == 'tree':
              treefile = ''.join(it)
-            for (mode, name, sha) in treeparse(treefile):
+            for (mode, name, sha) in tree_decode(treefile):
                  for blob in self.join(sha.encode('hex')):
                      yield blob
          elif type == 'commit':
                  for blob in self.join(sha.encode('hex')):
                      yield blob
          elif type == 'commit':
@@ -905,3 +1119,30 @@ class CatPipe:
                  yield d
          except StopIteration:
              log('booger!\n')
                  yield d
          except StopIteration:
              log('booger!\n')
+
+
+_cp = (None, None)
+
+def cp():
+    """Create a CatPipe object or reuse an already existing one."""
+    global _cp
+    cp_dir, cp = _cp
+    cur_dir = os.path.realpath(repo())
+    if cur_dir != cp_dir:
+        cp = CatPipe()
+        _cp = (cur_dir, cp)
+    return cp
+
+
+def tags():
+    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
+    tags = {}
+    for (n,c) in list_refs():
+        if n.startswith('refs/tags/'):
+            name = n[10:]
+            if not c in tags:
+                tags[c] = []
+
+            tags[c].append(name)  # more than one tag can point at 'c'
+
+    return tags