lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12
  13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  14 from bup.compat import (buffer,
  15                         byte_int, bytes_from_byte, bytes_from_uint,
  16                         environ,
  17                         items,
  18                         pending_raise,
  19                         range,
  20                         reraise)
  21 from bup.io import path_msg
  22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  23                          exo,
  24                          fdatasync,
  25                          finalized,
  26                          log,
  27                          merge_dict,
  28                          merge_iter,
  29                          mmap_read, mmap_readwrite,
  30                          progress, qprogress, stat_if_exists,
  31                          unlink,
  32                          utc_offset_str)
  33
  34
  35 verbose = 0
  36 repodir = None  # The default repository, once initialized
  37
  38 _typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
  39 _typermap = {v: k for k, v in items(_typemap)}
  40
  41
  42 _total_searches = 0
  43 _total_steps = 0
  44
  45
  46 class GitError(Exception):
  47     pass
  48
  49
  50 def _gitenv(repo_dir=None):
  51     if not repo_dir:
  52         repo_dir = repo()
  53     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  54
  55 def _git_wait(cmd, p):
  56     rv = p.wait()
  57     if rv != 0:
  58         raise GitError('%r returned %d' % (cmd, rv))
  59
  60 def _git_exo(cmd, **kwargs):
  61     kwargs['check'] = False
  62     result = exo(cmd, **kwargs)
  63     _, _, proc = result
  64     if proc.returncode != 0:
  65         raise GitError('%r returned %d' % (cmd, proc.returncode))
  66     return result
  67
  68 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  69     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  70     cmd = [b'git', b'config', b'--null']
  71     if cfg_file:
  72         cmd.extend([b'--file', cfg_file])
  73     if opttype == 'int':
  74         cmd.extend([b'--int'])
  75     elif opttype == 'bool':
  76         cmd.extend([b'--bool'])
  77     else:
  78         assert opttype is None
  79     cmd.extend([b'--get', option])
  80     env=None
  81     if repo_dir:
  82         env = _gitenv(repo_dir=repo_dir)
  83     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  84                          close_fds=True)
  85     # with --null, git writes out a trailing \0 after the value
  86     r = p.stdout.read()[:-1]
  87     rc = p.wait()
  88     if rc == 0:
  89         if opttype == 'int':
  90             return int(r)
  91         elif opttype == 'bool':
  92             # git converts to 'true' or 'false'
  93             return r == b'true'
  94         return r
  95     if rc != 1:
  96         raise GitError('%r returned %d' % (cmd, rc))
  97     return None
  98
  99
 100 def parse_tz_offset(s):
 101     """UTC offset in seconds."""
 102     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 103     if bytes_from_byte(s[0]) == b'-':
 104         return - tz_off
 105     return tz_off
 106
 107 def parse_commit_gpgsig(sig):
 108     """Return the original signature bytes.
 109
 110     i.e. with the "gpgsig " header and the leading space character on
 111     each continuation line removed.
 112
 113     """
 114     if not sig:
 115         return None
 116     assert sig.startswith(b'gpgsig ')
 117     sig = sig[7:]
 118     return sig.replace(b'\n ', b'\n')
 119
 120 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
 121 # Make sure that's authoritative.
 122
 123 # See also
 124 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
 125 # The continuation lines have only one leading space.
 126
# Byte allowed as the first or last character of a name or email:
# anything except space, the listed punctuation, angle brackets,
# quotes, NUL, or newline.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# Byte allowed anywhere inside a name or email.
_content_char = br'[^\0\n<>]'
# Either one or two start/end characters, or a start/end character
# followed by any run of content characters and a final start/end
# character.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone: a sign, two digits, then two more digits of which the
# first must be 0-5.
_tz_rx = br'[-+]\d\d[0-5]\d'
# A single "parent <40 hex digits>" header line.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Pattern for an entire commit object: tree, parents, author,
# committer, optional mergetag, optional gpgsig, then the message.
# The named groups are what parse_commit() reads.
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts the hash from each parent line of the "parents" group.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 146
 147 # Note that the author_sec and committer_sec values are (UTC) epoch
 148 # seconds, and for now the mergetag is not included.
 149 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 150                                        'author_name', 'author_mail',
 151                                        'author_sec', 'author_offset',
 152                                        'committer_name', 'committer_mail',
 153                                        'committer_sec', 'committer_offset',
 154                                        'gpgsig',
 155                                        'message'])
 156
 157 def parse_commit(content):
 158     commit_match = re.match(_commit_rx, content)
 159     if not commit_match:
 160         raise Exception('cannot parse commit %r' % content)
 161     matches = commit_match.groupdict()
 162     return CommitInfo(tree=matches['tree'],
 163                       parents=re.findall(_parent_hash_rx, matches['parents']),
 164                       author_name=matches['author_name'],
 165                       author_mail=matches['author_mail'],
 166                       author_sec=int(matches['asec']),
 167                       author_offset=parse_tz_offset(matches['atz']),
 168                       committer_name=matches['committer_name'],
 169                       committer_mail=matches['committer_mail'],
 170                       committer_sec=int(matches['csec']),
 171                       committer_offset=parse_tz_offset(matches['ctz']),
 172                       gpgsig=parse_commit_gpgsig(matches['gpgsig']),
 173                       message=matches['message'])
 174
 175
 176 def get_cat_data(cat_iterator, expected_type):
 177     _, kind, _ = next(cat_iterator)
 178     if kind != expected_type:
 179         raise Exception('expected %r, saw %r' % (expected_type, kind))
 180     return b''.join(cat_iterator)
 181
 182 def get_commit_items(id, cp):
 183     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 184
 185 def _local_git_date_str(epoch_sec):
 186     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 187
 188
 189 def _git_date_str(epoch_sec, tz_offset_sec):
 190     offs =  tz_offset_sec // 60
 191     return b'%d %s%02d%02d' \
 192         % (epoch_sec,
 193            b'+' if offs >= 0 else b'-',
 194            abs(offs) // 60,
 195            abs(offs) % 60)
 196
 197
 198 def repo(sub = b'', repo_dir=None):
 199     """Get the path to the git repository or one of its subdirectories."""
 200     repo_dir = repo_dir or repodir
 201     if not repo_dir:
 202         raise GitError('You should call check_repo_or_die()')
 203
 204     # If there's a .git subdirectory, then the actual repo is in there.
 205     gd = os.path.join(repo_dir, b'.git')
 206     if os.path.exists(gd):
 207         repo_dir = gd
 208
 209     return os.path.join(repo_dir, sub)
 210
 211
 212 _shorten_hash_rx = \
 213     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 214
 215 def shorten_hash(s):
 216     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 217
 218
 219 def repo_rel(path):
 220     full = os.path.abspath(path)
 221     fullrepo = os.path.abspath(repo(b''))
 222     if not fullrepo.endswith(b'/'):
 223         fullrepo += b'/'
 224     if full.startswith(fullrepo):
 225         path = full[len(fullrepo):]
 226     if path.startswith(b'index-cache/'):
 227         path = path[len(b'index-cache/'):]
 228     return shorten_hash(path)
 229
 230
 231 def auto_midx(objdir):
 232     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 233     try:
 234         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 235     except OSError as e:
 236         # make sure 'args' gets printed to help with debugging
 237         add_error('%r: exception: %s' % (args, e))
 238         raise
 239     if rv:
 240         add_error('%r: returned %d' % (args, rv))
 241
 242     args = [path.exe(), b'bloom', b'--dir', objdir]
 243     try:
 244         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 245     except OSError as e:
 246         # make sure 'args' gets printed to help with debugging
 247         add_error('%r: exception: %s' % (args, e))
 248         raise
 249     if rv:
 250         add_error('%r: returned %d' % (args, rv))
 251
 252
 253 def mangle_name(name, mode, gitmode):
 254     """Mangle a file name to present an abstract name for segmented files.
 255     Mangled file names will have the ".bup" extension added to them. If a
 256     file's name already ends with ".bup", a ".bupl" extension is added to
 257     disambiguate normal files from segmented ones.
 258     """
 259     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 260         assert(stat.S_ISDIR(gitmode))
 261         return name + b'.bup'
 262     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 263         return name + b'.bupl'
 264     else:
 265         return name
 266
 267
 268 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 269 def demangle_name(name, mode):
 270     """Remove name mangling from a file name, if necessary.
 271
 272     The return value is a tuple (demangled_filename,mode), where mode is one of
 273     the following:
 274
 275     * BUP_NORMAL  : files that should be read as-is from the repository
 276     * BUP_CHUNKED : files that were chunked and need to be reassembled
 277
 278     For more information on the name mangling algorithm, see mangle_name()
 279     """
 280     if name.endswith(b'.bupl'):
 281         return (name[:-5], BUP_NORMAL)
 282     elif name.endswith(b'.bup'):
 283         return (name[:-4], BUP_CHUNKED)
 284     elif name.endswith(b'.bupm'):
 285         return (name[:-5],
 286                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 287     return (name, BUP_NORMAL)
 288
 289
 290 def calc_hash(type, content):
 291     """Calculate some content's hash in the Git fashion."""
 292     header = b'%s %d\0' % (type, len(content))
 293     sum = Sha1(header)
 294     sum.update(content)
 295     return sum.digest()
 296
 297
 298 def shalist_item_sort_key(ent):
 299     (mode, name, id) = ent
 300     assert(mode+0 == mode)
 301     if stat.S_ISDIR(mode):
 302         return name + b'/'
 303     else:
 304         return name
 305
 306
 307 def tree_encode(shalist):
 308     """Generate a git tree object from (mode,name,hash) tuples."""
 309     shalist = sorted(shalist, key = shalist_item_sort_key)
 310     l = []
 311     for (mode,name,bin) in shalist:
 312         assert(mode)
 313         assert(mode+0 == mode)
 314         assert(name)
 315         assert(len(bin) == 20)
 316         s = b'%o %s\0%s' % (mode,name,bin)
 317         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 318         l.append(s)
 319     return b''.join(l)
 320
 321
 322 def tree_decode(buf):
 323     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 324     ofs = 0
 325     while ofs < len(buf):
 326         z = buf.find(b'\0', ofs)
 327         assert(z > ofs)
 328         spl = buf[ofs:z].split(b' ', 1)
 329         assert(len(spl) == 2)
 330         mode,name = spl
 331         sha = buf[z+1:z+1+20]
 332         ofs = z+1+20
 333         yield (int(mode, 8), name, sha)
 334
 335
 336 def _encode_packobj(type, content, compression_level=1):
 337     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 338         raise ValueError('invalid compression level %s' % compression_level)
 339     szout = b''
 340     sz = len(content)
 341     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 342     sz >>= 4
 343     while 1:
 344         if sz: szbits |= 0x80
 345         szout += bytes_from_uint(szbits)
 346         if not sz:
 347             break
 348         szbits = sz & 0x7f
 349         sz >>= 7
 350     z = zlib.compressobj(compression_level)
 351     yield szout
 352     yield z.compress(content)
 353     yield z.flush()
 354
 355
 356 def _decode_packobj(buf):
 357     assert(buf)
 358     c = byte_int(buf[0])
 359     type = _typermap[(c & 0x70) >> 4]
 360     sz = c & 0x0f
 361     shift = 4
 362     i = 0
 363     while c & 0x80:
 364         i += 1
 365         c = byte_int(buf[i])
 366         sz |= (c & 0x7f) << shift
 367         shift += 7
 368         if not (c & 0x80):
 369             break
 370     return (type, zlib.decompress(buf[i+1:]))
 371
 372
 373 class PackIdx:
 374     def __init__(self):
 375         assert(0)
 376
 377     def find_offset(self, hash):
 378         """Get the offset of an object inside the index file."""
 379         idx = self._idx_from_hash(hash)
 380         if idx != None:
 381             return self._ofs_from_idx(idx)
 382         return None
 383
 384     def exists(self, hash, want_source=False):
 385         """Return nonempty if the object exists in this index."""
 386         if hash and (self._idx_from_hash(hash) != None):
 387             return want_source and os.path.basename(self.name) or True
 388         return None
 389
 390     def _idx_from_hash(self, hash):
 391         global _total_searches, _total_steps
 392         _total_searches += 1
 393         assert(len(hash) == 20)
 394         b1 = byte_int(hash[0])
 395         start = self.fanout[b1-1] # range -1..254
 396         end = self.fanout[b1] # range 0..255
 397         want = hash
 398         _total_steps += 1  # lookup table is a step
 399         while start < end:
 400             _total_steps += 1
 401             mid = start + (end - start) // 2
 402             v = self._idx_to_hash(mid)
 403             if v < want:
 404                 start = mid+1
 405             elif v > want:
 406                 end = mid
 407             else: # got it!
 408                 return mid
 409         return None
 410
 411
 412 class PackIdxV1(PackIdx):
 413     """Object representation of a Git pack index (version 1) file."""
 414     def __init__(self, filename, f):
 415         self.name = filename
 416         self.idxnames = [self.name]
 417         self.map = mmap_read(f)
 418         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 419         self.fanout = array('L', struct.unpack('!256I', self.map))
 420         self.fanout.append(0)  # entry "-1"
 421         self.nsha = self.fanout[255]
 422         self.sha_ofs = 256 * 4
 423         # Avoid slicing shatable for individual hashes (very high overhead)
 424         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 425
 426     def __enter__(self):
 427         return self
 428
 429     def __exit__(self, type, value, traceback):
 430         with pending_raise(value, rethrow=False):
 431             self.close()
 432
 433     def __len__(self):
 434         return int(self.nsha)  # int() from long for python 2
 435
 436     def _ofs_from_idx(self, idx):
 437         if idx >= self.nsha or idx < 0:
 438             raise IndexError('invalid pack index index %d' % idx)
 439         ofs = self.sha_ofs + idx * 24
 440         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 441
 442     def _idx_to_hash(self, idx):
 443         if idx >= self.nsha or idx < 0:
 444             raise IndexError('invalid pack index index %d' % idx)
 445         ofs = self.sha_ofs + idx * 24 + 4
 446         return self.map[ofs : ofs + 20]
 447
 448     def __iter__(self):
 449         start = self.sha_ofs + 4
 450         for ofs in range(start, start + 24 * self.nsha, 24):
 451             yield self.map[ofs : ofs + 20]
 452
 453     def close(self):
 454         if self.map is not None:
 455             self.shatable = None
 456             self.map.close()
 457             self.map = None
 458
 459
 460 class PackIdxV2(PackIdx):
 461     """Object representation of a Git pack index (version 2) file."""
 462     def __init__(self, filename, f):
 463         self.name = filename
 464         self.idxnames = [self.name]
 465         self.map = mmap_read(f)
 466         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 467         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 468         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 469         self.fanout.append(0)
 470         self.nsha = self.fanout[255]
 471         self.sha_ofs = 8 + 256*4
 472         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 473         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 474         # Avoid slicing this for individual hashes (very high overhead)
 475         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 476
 477     def __enter__(self):
 478         return self
 479
 480     def __exit__(self, type, value, traceback):
 481         with pending_raise(value, rethrow=False):
 482             self.close()
 483
 484     def __len__(self):
 485         return int(self.nsha)  # int() from long for python 2
 486
 487     def _ofs_from_idx(self, idx):
 488         if idx >= self.nsha or idx < 0:
 489             raise IndexError('invalid pack index index %d' % idx)
 490         ofs_ofs = self.ofstable_ofs + idx * 4
 491         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 492         if ofs & 0x80000000:
 493             idx64 = ofs & 0x7fffffff
 494             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 495             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 496         return ofs
 497
 498     def _idx_to_hash(self, idx):
 499         if idx >= self.nsha or idx < 0:
 500             raise IndexError('invalid pack index index %d' % idx)
 501         ofs = self.sha_ofs + idx * 20
 502         return self.map[ofs : ofs + 20]
 503
 504     def __iter__(self):
 505         start = self.sha_ofs
 506         for ofs in range(start, start + 20 * self.nsha, 20):
 507             yield self.map[ofs : ofs + 20]
 508
 509     def close(self):
 510         if self.map is not None:
 511             self.shatable = None
 512             self.map.close()
 513             self.map = None
 514
 515
# Number of live PackIdxList instances; the constructor and __del__
# assert that there is never more than one.
_mpi_count = 0
class PackIdxList:
    """A searchable collection of the pack indexes found in one directory.

    self.packs holds the open index objects (idx and midx), self.also
    holds extra hashes registered via add(), and self.bloom, when
    present, is consulted by exists() before the indexes are searched.
    """
    def __init__(self, dir, ignore_midx=False):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Bloom hit: skip the bloom check until a full search
                # below misses again.
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # Not found in any index: re-enable the bloom check.
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        # d maps index file names to the open object that covers them;
        # open midx objects are left out when midx files are skipped.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                # Record every idx named by a still-open midx as being
                # covered by that midx.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open the midx files we do not have yet; one that names
                # a missing idx is closed and deleted instead.
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the midxes with the most entries first, then
                # the most recently modified.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            # Open each idx file that is not already covered; ones that
            # fail with GitError are reported and skipped.
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            # Several names in d can refer to the same object, hence
            # the set(); the largest indexes are searched first.
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            try:
                # Only use the bloom if it is valid and has at least as
                # many entries as the indexes; otherwise close and drop it.
                if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                    self.do_bloom = True
                else:
                    if self.bloom:
                        self.bloom, bloom_tmp = None, self.bloom
                        bloom_tmp.close()
            except BaseException as ex:
                with pending_raise(ex):
                    if self.bloom:
                        self.bloom.close()

        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 667
 668
 669 def open_idx(filename):
 670     if filename.endswith(b'.idx'):
 671         f = open(filename, 'rb')
 672         header = f.read(8)
 673         if header[0:4] == b'\377tOc':
 674             version = struct.unpack('!I', header[4:8])[0]
 675             if version == 2:
 676                 return PackIdxV2(filename, f)
 677             else:
 678                 raise GitError('%s: expected idx file version 2, got %d'
 679                                % (path_msg(filename), version))
 680         elif len(header) == 8 and header[0:4] < b'\377tOc':
 681             return PackIdxV1(filename, f)
 682         else:
 683             raise GitError('%s: unrecognized idx file header'
 684                            % path_msg(filename))
 685     elif filename.endswith(b'.midx'):
 686         return midx.PackMidx(filename)
 687     else:
 688         raise GitError('idx filenames must end with .idx or .midx')
 689
 690
 691 def idxmerge(idxlist, final_progress=True):
 692     """Generate a list of all the objects reachable in a PackIdxList."""
 693     def pfunc(count, total):
 694         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 695                   % (count*100.0/total, count, total))
 696     def pfinal(count, total):
 697         if final_progress:
 698             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 699                      % (100, total, total))
 700     return merge_iter(idxlist, 10024, pfunc, pfinal)
 701
 702
 703 def create_commit_blob(tree, parent,
 704                        author, adate_sec, adate_tz,
 705                        committer, cdate_sec, cdate_tz,
 706                        msg):
 707     if adate_tz is not None:
 708         adate_str = _git_date_str(adate_sec, adate_tz)
 709     else:
 710         adate_str = _local_git_date_str(adate_sec)
 711     if cdate_tz is not None:
 712         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 713     else:
 714         cdate_str = _local_git_date_str(cdate_sec)
 715     l = []
 716     if tree: l.append(b'tree %s' % hexlify(tree))
 717     if parent: l.append(b'parent %s' % hexlify(parent))
 718     if author: l.append(b'author %s %s' % (author, adate_str))
 719     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 720     l.append(b'')
 721     l.append(msg)
 722     return b'\n'.join(l)
 723
 724
 725 def _make_objcache():
 726     return PackIdxList(repo(b'objects/pack'))
 727
 728 # bup-gc assumes that it can disable all PackWriter activities
 729 # (bloom/midx/cache) via the constructor and close() arguments.
 730
 731 class PackWriter:
 732     """Writes Git objects inside a pack file."""
 733     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 734                  run_midx=True, on_pack_finish=None,
 735                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 736         self.repo_dir = repo_dir or repo()
 737         self.file = None
 738         self.parentfd = None
 739         self.count = 0
 740         self.outbytes = 0
 741         self.filename = None
 742         self.idx = None
 743         self.objcache_maker = objcache_maker
 744         self.objcache = None
 745         self.compression_level = compression_level
 746         self.run_midx=run_midx
 747         self.on_pack_finish = on_pack_finish
 748         if not max_pack_size:
 749             max_pack_size = git_config_get(b'pack.packSizeLimit',
 750                                            repo_dir=self.repo_dir,
 751                                            opttype='int')
 752             if not max_pack_size:
 753                 # larger packs slow down pruning
 754                 max_pack_size = 1000 * 1000 * 1000
 755         self.max_pack_size = max_pack_size
 756         # cache memory usage is about 83 bytes per object
 757         self.max_pack_objects = max_pack_objects if max_pack_objects \
 758                                 else max(1, self.max_pack_size // 5000)
 759
 760     def __enter__(self):
 761         return self
 762
 763     def __exit__(self, type, value, traceback):
 764         with pending_raise(value, rethrow=False):
 765             self.close()
 766
 767     def _open(self):
 768         if not self.file:
 769             objdir = dir = os.path.join(self.repo_dir, b'objects')
 770             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 771             try:
 772                 self.file = os.fdopen(fd, 'w+b')
 773             except:
 774                 os.close(fd)
 775                 raise
 776             try:
 777                 self.parentfd = os.open(objdir, os.O_RDONLY)
 778             except:
 779                 f = self.file
 780                 self.file = None
 781                 f.close()
 782                 raise
 783             assert name.endswith(b'.pack')
 784             self.filename = name[:-5]
 785             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 786             self.idx = PackIdxV2Writer()
 787
 788     def _raw_write(self, datalist, sha):
 789         self._open()
 790         f = self.file
 791         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 792         # the file never has a *partial* blob.  So let's make sure it's
 793         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 794         # to our hashsplit algorithm.)  f.write() does its own buffering,
 795         # but that's okay because we'll flush it in _end().
 796         oneblob = b''.join(datalist)
 797         try:
 798             f.write(oneblob)
 799         except IOError as e:
 800             reraise(GitError(e))
 801         nw = len(oneblob)
 802         crc = zlib.crc32(oneblob) & 0xffffffff
 803         self._update_idx(sha, crc, nw)
 804         self.outbytes += nw
 805         self.count += 1
 806         return nw, crc
 807
 808     def _update_idx(self, sha, crc, size):
 809         assert(sha)
 810         if self.idx:
 811             self.idx.add(sha, crc, self.file.tell() - size)
 812
 813     def _write(self, sha, type, content):
 814         if verbose:
 815             log('>')
 816         if not sha:
 817             sha = calc_hash(type, content)
 818         size, crc = self._raw_write(_encode_packobj(type, content,
 819                                                     self.compression_level),
 820                                     sha=sha)
 821         if self.outbytes >= self.max_pack_size \
 822            or self.count >= self.max_pack_objects:
 823             self.breakpoint()
 824         return sha
 825
 826     def _require_objcache(self):
 827         if self.objcache is None and self.objcache_maker:
 828             self.objcache = self.objcache_maker()
 829         if self.objcache is None:
 830             raise GitError(
 831                     "PackWriter not opened or can't check exists w/o objcache")
 832
 833     def exists(self, id, want_source=False):
 834         """Return non-empty if an object is found in the object cache."""
 835         self._require_objcache()
 836         return self.objcache.exists(id, want_source=want_source)
 837
 838     def just_write(self, sha, type, content):
 839         """Write an object to the pack file without checking for duplication."""
 840         self._write(sha, type, content)
 841         # If nothing else, gc doesn't have/want an objcache
 842         if self.objcache is not None:
 843             self.objcache.add(sha)
 844
 845     def maybe_write(self, type, content):
 846         """Write an object to the pack file if not present and return its id."""
 847         sha = calc_hash(type, content)
 848         if not self.exists(sha):
 849             self._require_objcache()
 850             self.just_write(sha, type, content)
 851         return sha
 852
 853     def new_blob(self, blob):
 854         """Create a blob object in the pack with the supplied content."""
 855         return self.maybe_write(b'blob', blob)
 856
 857     def new_tree(self, shalist):
 858         """Create a tree object in the pack."""
 859         content = tree_encode(shalist)
 860         return self.maybe_write(b'tree', content)
 861
 862     def new_commit(self, tree, parent,
 863                    author, adate_sec, adate_tz,
 864                    committer, cdate_sec, cdate_tz,
 865                    msg):
 866         """Create a commit object in the pack.  The date_sec values must be
 867         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 868         content = create_commit_blob(tree, parent,
 869                                      author, adate_sec, adate_tz,
 870                                      committer, cdate_sec, cdate_tz,
 871                                      msg)
 872         return self.maybe_write(b'commit', content)
 873
    def _end(self, run_midx=True, abort=False):
        """Finish (or discard) the pack that is currently being written.

        Returns None when no pack file is open or when abort is true;
        otherwise returns the final path prefix of the pack, i.e. the
        path under objects/pack without the .pack/.idx extension.

        """
        # Ignores run_midx during abort
        if not self.file:
            return None
        # Detach all per-pack state from self before doing any work, so
        # the writer no longer refers to this pack whatever happens below.
        self.file, f = None, self.file
        self.idx, idx = None, self.idx
        self.parentfd, pfd, = None, self.parentfd
        self.objcache = None

        # f is closed when the with block exits; pfd is handed to
        # finalized() together with a callback that os.close()s it
        # (presumably invoked on exit -- see bup.helpers.finalized).
        with finalized(pfd, lambda x: x is not None and os.close(x)), \
             f:

            if abort:
                # Discard the temporary pack; no index was written yet.
                os.unlink(self.filename + b'.pack')
                return None

            # update object count (bytes 8..11 of the header written by
            # _open(), as a 4-byte network-order integer)
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert len(cp) == 4
            f.write(cp)

            # calculate the pack sha1sum over everything written so far
            # and append it as the pack trailer
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            f.flush()
            fdatasync(f.fileno())
            f.close()

            # Write the index next to the temporary pack, then move both
            # to their final names, which are derived from the pack checksum.
            idx.write(self.filename + b'.idx', packbin)
            nameprefix = os.path.join(self.repo_dir,
                                      b'objects/pack/pack-' +  hexlify(packbin))
            if os.path.exists(self.filename + b'.map'):
                os.unlink(self.filename + b'.map')
            os.rename(self.filename + b'.pack', nameprefix + b'.pack')
            os.rename(self.filename + b'.idx', nameprefix + b'.idx')
            # NOTE(review): pfd was opened on the "objects" directory in
            # _open(), while the renames above land in "objects/pack" --
            # confirm this fsync covers the directory that is intended.
            os.fsync(pfd)
            if run_midx:
                auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
            if self.on_pack_finish:
                self.on_pack_finish(nameprefix)
            return nameprefix
 920
 921     def abort(self):
 922         """Remove the pack file from disk."""
 923         self._end(abort=True)
 924
 925     def breakpoint(self):
 926         """Clear byte and object counts and return the last processed id."""
 927         id = self._end(self.run_midx)
 928         self.outbytes = self.count = 0
 929         return id
 930
 931     def close(self, run_midx=True):
 932         """Close the pack file and move it to its definitive path."""
 933         return self._end(run_midx=run_midx)
 934
 935
 936 class PackIdxV2Writer:
 937     def __init__(self):
 938         self.idx = list(list() for i in range(256))
 939         self.count = 0
 940
 941     def add(self, sha, crc, offs):
 942         assert(sha)
 943         self.count += 1
 944         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 945
 946     def write(self, filename, packbin):
 947         ofs64_count = 0
 948         for section in self.idx:
 949             for entry in section:
 950                 if entry[2] >= 2**31:
 951                     ofs64_count += 1
 952
 953         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 954         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 955         idx_map = None
 956         idx_f = open(filename, 'w+b')
 957         try:
 958             idx_f.truncate(index_len)
 959             fdatasync(idx_f.fileno())
 960             idx_map = mmap_readwrite(idx_f, close=False)
 961             try:
 962                 count = _helpers.write_idx(filename, idx_map, self.idx,
 963                                            self.count)
 964                 assert(count == self.count)
 965                 idx_map.flush()
 966             finally:
 967                 idx_map.close()
 968         finally:
 969             idx_f.close()
 970
 971         idx_f = open(filename, 'a+b')
 972         try:
 973             idx_f.write(packbin)
 974             idx_f.seek(0)
 975             idx_sum = Sha1()
 976             b = idx_f.read(8 + 4*256)
 977             idx_sum.update(b)
 978
 979             for b in chunkyreader(idx_f, 20 * self.count):
 980                 idx_sum.update(b)
 981
 982             for b in chunkyreader(idx_f):
 983                 idx_sum.update(b)
 984             idx_f.write(idx_sum.digest())
 985             fdatasync(idx_f.fileno())
 986         finally:
 987             idx_f.close()
 988
 989
 990 def list_refs(patterns=None, repo_dir=None,
 991               limit_to_heads=False, limit_to_tags=False):
 992     """Yield (refname, hash) tuples for all repository refs unless
 993     patterns are specified.  In that case, only include tuples for
 994     refs matching those patterns (cf. git-show-ref(1)).  The limits
 995     restrict the result items to refs/heads or refs/tags.  If both
 996     limits are specified, items from both sources will be included.
 997
 998     """
 999     argv = [b'git', b'show-ref']
1000     if limit_to_heads:
1001         argv.append(b'--heads')
1002     if limit_to_tags:
1003         argv.append(b'--tags')
1004     argv.append(b'--')
1005     if patterns:
1006         argv.extend(patterns)
1007     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
1008                          close_fds=True)
1009     out = p.stdout.read().strip()
1010     rv = p.wait()  # not fatal
1011     if rv:
1012         assert(not out)
1013     if out:
1014         for d in out.split(b'\n'):
1015             sha, name = d.split(b' ', 1)
1016             yield name, unhexlify(sha)
1017
1018
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref.

    Only refs/heads is searched (list_refs is called with
    limit_to_heads=True).  Returns the binary id, or None when nothing
    matches; more than one match trips an assertion.
    """
    found = tuple(islice(list_refs(patterns=[refname], repo_dir=repo_dir,
                                   limit_to_heads=True),
                         2))
    if not found:
        return None
    assert(len(found) == 1)
    _, oid = found[0]
    return oid
1028
1029
def rev_list_invocation(ref_or_refs, format=None):
    """Return the argv (a list of bytes) for a "git rev-list" call.

    ref_or_refs is either a single ref (bytes) or an iterable of refs.
    When format is given, --pretty=format:<format> is placed before the
    refs.  The argv always ends with b'--'.
    """
    refs = (ref_or_refs,) if isinstance(ref_or_refs, bytes) else ref_or_refs
    argv = [b'git', b'rev-list']
    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        # A ref must never be mistaken for an option
        assert not ref.startswith(b'-')
        argv.append(ref)
    argv.append(b'--')
    return argv
1044
1045
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".  If a format
    is not provided, yield one hex hash at a time.  If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line.  When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.

    Raises GitError if git rev-list exits with a non-zero status once
    its output has been consumed.

    """
    assert bool(parse) == bool(format)
    argv = rev_list_invocation(ref_or_refs, format=format)
    p = subprocess.Popen(argv,
                         env=_gitenv(repo_dir),
                         stdout=subprocess.PIPE,
                         close_fds=True)
    out = p.stdout
    if format:
        # readline() returns b'' at EOF, which ends the iteration
        for line in iter(out.readline, b''):
            header = line.strip()
            if not header.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(header))
            oidx = header[7:]
            assert len(oidx) == 40
            yield oidx, parse(out)
    else:
        for line in out:
            yield line.strip()

    rv = p.wait()
    if rv:
        raise GitError('git rev-list returned error %d' % rv)
1078
1079
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    'committish' is first looked up as a ref (via read_ref); failing
    that, a 40-character value is decoded as a hex object id and looked
    up in the repository's pack indexes.

    Returns the hash in binary (not hex) form if it is found, None if
    'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    if len(committish) != 40:
        return None

    try:
        oid = unhexlify(committish)
    except (TypeError, ValueError):
        # Not hex after all.  Python 2 signals that with TypeError,
        # Python 3 with binascii.Error, which is a ValueError.
        return None

    # Only build the pack index list once there's an id to look up
    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))
    if pL.exists(oid):
        return oid

    return None
1105
1106
def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference.

    refname must live under refs/heads/ or refs/tags/.  newval and
    oldval are binary ids and are passed to "git update-ref" in hex; a
    falsy oldval is passed as an empty string.  A non-zero exit from
    git surfaces through _git_wait().
    """
    expected_old = oldval or b''
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    argv = [b'git', b'update-ref', refname,
            hexlify(newval), hexlify(expected_old)]
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', p)
1118
1119
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1)).

    refname must start with refs/.  When oldvalue is given it is
    appended to the "git update-ref -d" command line as-is.
    """
    assert refname.startswith(b'refs/')
    argv = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    p = subprocess.Popen(argv, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', p)
1128
1129
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().

    Precedence: an explicit path, then an already-set repodir, then the
    BUP_DIR environment variable, then ~/.bup.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1144
1145
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the parent of the repository directory does not
    exist, if the repository path exists but is not a directory, or
    (through _git_wait) if one of the git commands fails.  Output of
    the git commands is sent to stderr.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    steps = (
        ('git init', [b'git', b'--bare', b'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        # (The value is bytes, like every other argv element here.)
        ('git config', [b'git', b'config', b'pack.indexVersion', b'2']),
        # Enable the reflog
        ('git config', [b'git', b'config', b'core.logAllRefUpdates',
                        b'true']),
    )
    for what, argv in steps:
        p = subprocess.Popen(argv, stdout=sys.stderr, env=_gitenv(),
                             close_fds=True)
        _git_wait(what, p)
1169
1170
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not.

    Returns normally when <repo>/objects/pack is a directory.
    Otherwise exits with status 15 if neither objects/pack nor the
    repository path itself exists, and with status 14 in every other
    case.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1186
1187
def is_suitable_git(ver_str):
    """Classify "git --version" output (bytes).

    Returns 'unrecognized' when the text doesn't start with
    b'git version ', 'insufficient' for versions the patterns below
    place before 1.5.6, and 'suitable' otherwise.  A version that
    starts with neither "0.", "1." nor a digit exits the process with
    status 13.
    """
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    version = ver_str[len(prefix):]
    if version.startswith(b'0.'):
        return 'insufficient'
    if version.startswith(b'1.'):
        too_old = (br'1\.[012345]rc',
                   br'1\.[01234]\.',
                   br'1\.5\.[012345]($|\.)',
                   br'1\.5\.6-rc')
        for pattern in too_old:
            if re.match(pattern, version):
                return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', version):
        return 'suitable'
    sys.exit(13)
1207
# None until a version check has succeeded; True afterwards
_git_great = None

def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.

    A successful check is remembered in _git_great, so later calls
    return immediately.  Setting BUP_GIT_VERSION_IS_FINE to yes, true
    or 1 (any case) skips the check.  A version that is too old is
    logged and exits the process with status 1; GitError is raised only
    for unrecognized version output.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1236
1237
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        # p is the "git cat-file --batch" process (started lazily by
        # get()); inprogress is the ref currently being streamed, if any.
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the cat-file process, if there is one.

        With wait set, also wait for the process and return its exit
        status.  Returns None when wait is false, or when no process
        was running (which makes closing an unused pipe safe).
        """
        self.p, p = None, self.p
        self.inprogress = None
        if not p:
            # Never started, or already closed: nothing to close or
            # wait for.
            return None
        try:
            p.stdout.close()
        finally:
            # This will handle pending exceptions correctly once
            # we drop py2
            p.stdin.close()
        if wait:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Drop any current process and start a fresh cat-file --batch."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Only one request may be in flight at a time; the returned
        iterator has to be exhausted before get() is called again.
        Raises GitError on an unexpected EOF or a malformed header.

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # The request is a single line on cat-file's stdin, so the ref
        # must not contain line breaks or look like an option.
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if not hdr:
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        try:
            yield oidx, typ, size
            for blob in chunkyreader(self.p.stdout, size):
                yield blob
            # The object's data is followed by a single newline
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception as ex:
            # The stream position is unknown now, so drop the process
            with pending_raise(ex):
                self.close()

    def _join(self, it):
        """Yield blob contents for the object whose get() iterator is it,
        descending through commits and trees."""
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of a commit names its tree
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1340
1341
# Cache of CatPipe objects, keyed by absolute repository path
_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one.

    Without a repo_dir, the global repodir (or else repo()) is used.
    There is one cached CatPipe per absolute repository path.
    """
    global _cp, repodir
    key = os.path.abspath(repo_dir or repodir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1355
1356
def close_catpipes():
    """Close every cached CatPipe, waiting for each one, and empty the
    cache."""
    # FIXME: chain exceptions
    while _cp:
        pipe = _cp.popitem()[1]
        pipe.close(wait=True)
1362
1363
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Hashes are in the form list_refs() yields them, and the names have
    their refs/tags/ prefix removed.
    """
    prefix = b'refs/tags/'
    by_oid = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(prefix)
        # more than one tag can point at the same object
        by_oid.setdefault(oid, []).append(refname[len(prefix):])
    return by_oid
1374
1375
class MissingObject(KeyError):
    """KeyError for an object id that could not be found.

    The binary id is kept in the oid attribute; the message shows it
    in hex.
    """
    def __init__(self, oid):
        self.oid = oid
        message = 'object %r is missing' % hexlify(oid)
        KeyError.__init__(self, message)
1380
1381
# One item produced by walk_object().
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, e.g. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
#
# (walk_object() builds these lists from bytes values, so the strings
# above stand for bytes in practice.)
1394
1395
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    oidx is a hex object id.  Commits contribute their parents and
    their tree; trees contribute their entries.  The data field of
    every yielded item is None unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (hex oid, path, chunk_path, mode).
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        # The first thing get_ref yields is the (oidx, type, size)
        # header; a missing object is reported with a None oidx.
        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full, because their
            # content is needed below to find what they refer to.
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            # Parents inherit this commit's path, chunk_path and mode;
            # the tree is pushed last, so it is popped (visited) first.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                # Only bup_type is used here; the demangled name is not.
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the file path stays
                    # put and the chunk path grows instead.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))