lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12
  13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  14 from bup.compat import (buffer,
  15                         byte_int, bytes_from_byte, bytes_from_uint,
  16                         environ,
  17                         items,
  18                         pending_raise,
  19                         range,
  20                         reraise)
  21 from bup.io import path_msg
  22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  23                          exo,
  24                          fdatasync,
  25                          finalized,
  26                          log,
  27                          merge_dict,
  28                          merge_iter,
  29                          mmap_read, mmap_readwrite,
  30                          progress, qprogress, stat_if_exists,
  31                          unlink,
  32                          utc_offset_str)
  33
  34
# Module-wide verbosity level; PackWriter._write() logs a '>' marker
# for every object written when this is nonzero.
verbose = 0
repodir = None  # The default repository, once initialized

# Numeric object type codes stored in pack object headers (see
# _encode_packobj() and _decode_packobj()), and the reverse mapping.
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}


# Lookup statistics, updated by PackIdx._idx_from_hash() and
# PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  44
  45
  46 class GitError(Exception):
  47     pass
  48
  49
  50 def _gitenv(repo_dir=None):
  51     if not repo_dir:
  52         repo_dir = repo()
  53     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  54
  55 def _git_wait(cmd, p):
  56     rv = p.wait()
  57     if rv != 0:
  58         raise GitError('%r returned %d' % (cmd, rv))
  59
  60 def _git_exo(cmd, **kwargs):
  61     kwargs['check'] = False
  62     result = exo(cmd, **kwargs)
  63     _, _, proc = result
  64     if proc.returncode != 0:
  65         raise GitError('%r returned %d' % (cmd, proc.returncode))
  66     return result
  67
  68 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  69     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  70     cmd = [b'git', b'config', b'--null']
  71     if cfg_file:
  72         cmd.extend([b'--file', cfg_file])
  73     if opttype == 'int':
  74         cmd.extend([b'--int'])
  75     elif opttype == 'bool':
  76         cmd.extend([b'--bool'])
  77     else:
  78         assert opttype is None
  79     cmd.extend([b'--get', option])
  80     env=None
  81     if repo_dir:
  82         env = _gitenv(repo_dir=repo_dir)
  83     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  84                          close_fds=True)
  85     # with --null, git writes out a trailing \0 after the value
  86     r = p.stdout.read()[:-1]
  87     rc = p.wait()
  88     if rc == 0:
  89         if opttype == 'int':
  90             return int(r)
  91         elif opttype == 'bool':
  92             # git converts to 'true' or 'false'
  93             return r == b'true'
  94         return r
  95     if rc != 1:
  96         raise GitError('%r returned %d' % (cmd, rc))
  97     return None
  98
  99
 100 def parse_tz_offset(s):
 101     """UTC offset in seconds."""
 102     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 103     if bytes_from_byte(s[0]) == b'-':
 104         return - tz_off
 105     return tz_off
 106
 107 def parse_commit_gpgsig(sig):
 108     """Return the original signature bytes.
 109
 110     i.e. with the "gpgsig " header and the leading space character on
 111     each continuation line removed.
 112
 113     """
 114     if not sig:
 115         return None
 116     assert sig.startswith(b'gpgsig ')
 117     sig = sig[7:]
 118     return sig.replace(b'\n ', b'\n')
 119
 120 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
 121 # Make sure that's authoritative.
 122
 123 # See also
 124 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
 125 # The continuation lines have only one leading space.
 126
# Characters allowed at the start/end of a name or mail address, and
# characters allowed in its interior.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
# A name or mail address: either one or two start/end characters, or a
# run of content characters bracketed by start/end characters.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone offset: a sign, two hour digits, then two minute digits of
# which the first is 0-5.
_tz_rx = br'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Matches a whole commit object: tree, parents, author, committer, an
# optional mergetag, an optional gpgsig, and finally the message.  The
# named groups are consumed by parse_commit().
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts the hex hash from each line of the 'parents' group above.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 146
# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
# As filled in by parse_commit(): tree, names, mails and message are
# the bytes matched by _commit_rx, parents is a list of hex hashes,
# the *_offset fields are UTC offsets in seconds (parse_tz_offset()),
# and gpgsig is the result of parse_commit_gpgsig().
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'gpgsig',
                                       'message'])
 156
 157 def parse_commit(content):
 158     commit_match = re.match(_commit_rx, content)
 159     if not commit_match:
 160         raise Exception('cannot parse commit %r' % content)
 161     matches = commit_match.groupdict()
 162     return CommitInfo(tree=matches['tree'],
 163                       parents=re.findall(_parent_hash_rx, matches['parents']),
 164                       author_name=matches['author_name'],
 165                       author_mail=matches['author_mail'],
 166                       author_sec=int(matches['asec']),
 167                       author_offset=parse_tz_offset(matches['atz']),
 168                       committer_name=matches['committer_name'],
 169                       committer_mail=matches['committer_mail'],
 170                       committer_sec=int(matches['csec']),
 171                       committer_offset=parse_tz_offset(matches['ctz']),
 172                       gpgsig=parse_commit_gpgsig(matches['gpgsig']),
 173                       message=matches['message'])
 174
 175
 176 def get_cat_data(cat_iterator, expected_type):
 177     _, kind, _ = next(cat_iterator)
 178     if kind != expected_type:
 179         raise Exception('expected %r, saw %r' % (expected_type, kind))
 180     return b''.join(cat_iterator)
 181
 182 def get_commit_items(id, cp):
 183     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 184
 185 def _local_git_date_str(epoch_sec):
 186     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 187
 188
 189 def _git_date_str(epoch_sec, tz_offset_sec):
 190     offs =  tz_offset_sec // 60
 191     return b'%d %s%02d%02d' \
 192         % (epoch_sec,
 193            b'+' if offs >= 0 else b'-',
 194            abs(offs) // 60,
 195            abs(offs) % 60)
 196
 197
 198 def repo(sub = b'', repo_dir=None):
 199     """Get the path to the git repository or one of its subdirectories."""
 200     repo_dir = repo_dir or repodir
 201     if not repo_dir:
 202         raise GitError('You should call check_repo_or_die()')
 203
 204     # If there's a .git subdirectory, then the actual repo is in there.
 205     gd = os.path.join(repo_dir, b'.git')
 206     if os.path.exists(gd):
 207         repo_dir = gd
 208
 209     return os.path.join(repo_dir, sub)
 210
 211
 212 _shorten_hash_rx = \
 213     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 214
 215 def shorten_hash(s):
 216     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 217
 218
 219 def repo_rel(path):
 220     full = os.path.abspath(path)
 221     fullrepo = os.path.abspath(repo(b''))
 222     if not fullrepo.endswith(b'/'):
 223         fullrepo += b'/'
 224     if full.startswith(fullrepo):
 225         path = full[len(fullrepo):]
 226     if path.startswith(b'index-cache/'):
 227         path = path[len(b'index-cache/'):]
 228     return shorten_hash(path)
 229
 230
 231 def auto_midx(objdir):
 232     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 233     try:
 234         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 235     except OSError as e:
 236         # make sure 'args' gets printed to help with debugging
 237         add_error('%r: exception: %s' % (args, e))
 238         raise
 239     if rv:
 240         add_error('%r: returned %d' % (args, rv))
 241
 242     args = [path.exe(), b'bloom', b'--dir', objdir]
 243     try:
 244         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 245     except OSError as e:
 246         # make sure 'args' gets printed to help with debugging
 247         add_error('%r: exception: %s' % (args, e))
 248         raise
 249     if rv:
 250         add_error('%r: returned %d' % (args, rv))
 251
 252
 253 def mangle_name(name, mode, gitmode):
 254     """Mangle a file name to present an abstract name for segmented files.
 255     Mangled file names will have the ".bup" extension added to them. If a
 256     file's name already ends with ".bup", a ".bupl" extension is added to
 257     disambiguate normal files from segmented ones.
 258     """
 259     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 260         assert(stat.S_ISDIR(gitmode))
 261         return name + b'.bup'
 262     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 263         return name + b'.bupl'
 264     else:
 265         return name
 266
 267
 268 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 269 def demangle_name(name, mode):
 270     """Remove name mangling from a file name, if necessary.
 271
 272     The return value is a tuple (demangled_filename,mode), where mode is one of
 273     the following:
 274
 275     * BUP_NORMAL  : files that should be read as-is from the repository
 276     * BUP_CHUNKED : files that were chunked and need to be reassembled
 277
 278     For more information on the name mangling algorithm, see mangle_name()
 279     """
 280     if name.endswith(b'.bupl'):
 281         return (name[:-5], BUP_NORMAL)
 282     elif name.endswith(b'.bup'):
 283         return (name[:-4], BUP_CHUNKED)
 284     elif name.endswith(b'.bupm'):
 285         return (name[:-5],
 286                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 287     return (name, BUP_NORMAL)
 288
 289
 290 def calc_hash(type, content):
 291     """Calculate some content's hash in the Git fashion."""
 292     header = b'%s %d\0' % (type, len(content))
 293     sum = Sha1(header)
 294     sum.update(content)
 295     return sum.digest()
 296
 297
 298 def shalist_item_sort_key(ent):
 299     (mode, name, id) = ent
 300     assert(mode+0 == mode)
 301     if stat.S_ISDIR(mode):
 302         return name + b'/'
 303     else:
 304         return name
 305
 306
 307 def tree_encode(shalist):
 308     """Generate a git tree object from (mode,name,hash) tuples."""
 309     shalist = sorted(shalist, key = shalist_item_sort_key)
 310     l = []
 311     for (mode,name,bin) in shalist:
 312         assert(mode)
 313         assert(mode+0 == mode)
 314         assert(name)
 315         assert(len(bin) == 20)
 316         s = b'%o %s\0%s' % (mode,name,bin)
 317         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 318         l.append(s)
 319     return b''.join(l)
 320
 321
 322 def tree_decode(buf):
 323     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 324     ofs = 0
 325     while ofs < len(buf):
 326         z = buf.find(b'\0', ofs)
 327         assert(z > ofs)
 328         spl = buf[ofs:z].split(b' ', 1)
 329         assert(len(spl) == 2)
 330         mode,name = spl
 331         sha = buf[z+1:z+1+20]
 332         ofs = z+1+20
 333         yield (int(mode, 8), name, sha)
 334
 335
 336 def _encode_packobj(type, content, compression_level=1):
 337     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 338         raise ValueError('invalid compression level %s' % compression_level)
 339     szout = b''
 340     sz = len(content)
 341     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 342     sz >>= 4
 343     while 1:
 344         if sz: szbits |= 0x80
 345         szout += bytes_from_uint(szbits)
 346         if not sz:
 347             break
 348         szbits = sz & 0x7f
 349         sz >>= 7
 350     z = zlib.compressobj(compression_level)
 351     yield szout
 352     yield z.compress(content)
 353     yield z.flush()
 354
 355
 356 def _decode_packobj(buf):
 357     assert(buf)
 358     c = byte_int(buf[0])
 359     type = _typermap[(c & 0x70) >> 4]
 360     sz = c & 0x0f
 361     shift = 4
 362     i = 0
 363     while c & 0x80:
 364         i += 1
 365         c = byte_int(buf[i])
 366         sz |= (c & 0x7f) << shift
 367         shift += 7
 368         if not (c & 0x80):
 369             break
 370     return (type, zlib.decompress(buf[i+1:]))
 371
 372
 373 class PackIdx:
 374     def __init__(self):
 375         assert(0)
 376
 377     def find_offset(self, hash):
 378         """Get the offset of an object inside the index file."""
 379         idx = self._idx_from_hash(hash)
 380         if idx != None:
 381             return self._ofs_from_idx(idx)
 382         return None
 383
 384     def exists(self, hash, want_source=False):
 385         """Return nonempty if the object exists in this index."""
 386         if hash and (self._idx_from_hash(hash) != None):
 387             return want_source and os.path.basename(self.name) or True
 388         return None
 389
 390     def _idx_from_hash(self, hash):
 391         global _total_searches, _total_steps
 392         _total_searches += 1
 393         assert(len(hash) == 20)
 394         b1 = byte_int(hash[0])
 395         start = self.fanout[b1-1] # range -1..254
 396         end = self.fanout[b1] # range 0..255
 397         want = hash
 398         _total_steps += 1  # lookup table is a step
 399         while start < end:
 400             _total_steps += 1
 401             mid = start + (end - start) // 2
 402             v = self._idx_to_hash(mid)
 403             if v < want:
 404                 start = mid+1
 405             elif v > want:
 406                 end = mid
 407             else: # got it!
 408                 return mid
 409         return None
 410
 411
 412 class PackIdxV1(PackIdx):
 413     """Object representation of a Git pack index (version 1) file."""
 414     def __init__(self, filename, f):
 415         self.name = filename
 416         self.idxnames = [self.name]
 417         self.map = mmap_read(f)
 418         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 419         self.fanout = array('L', struct.unpack('!256I', self.map))
 420         self.fanout.append(0)  # entry "-1"
 421         self.nsha = self.fanout[255]
 422         self.sha_ofs = 256 * 4
 423         # Avoid slicing shatable for individual hashes (very high overhead)
 424         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 425
 426     def __enter__(self):
 427         return self
 428
 429     def __exit__(self, type, value, traceback):
 430         with pending_raise(value, rethrow=False):
 431             self.close()
 432
 433     def __len__(self):
 434         return int(self.nsha)  # int() from long for python 2
 435
 436     def _ofs_from_idx(self, idx):
 437         if idx >= self.nsha or idx < 0:
 438             raise IndexError('invalid pack index index %d' % idx)
 439         ofs = self.sha_ofs + idx * 24
 440         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 441
 442     def _idx_to_hash(self, idx):
 443         if idx >= self.nsha or idx < 0:
 444             raise IndexError('invalid pack index index %d' % idx)
 445         ofs = self.sha_ofs + idx * 24 + 4
 446         return self.map[ofs : ofs + 20]
 447
 448     def __iter__(self):
 449         start = self.sha_ofs + 4
 450         for ofs in range(start, start + 24 * self.nsha, 24):
 451             yield self.map[ofs : ofs + 20]
 452
 453     def close(self):
 454         if self.map is not None:
 455             self.shatable = None
 456             self.map.close()
 457             self.map = None
 458
 459
 460 class PackIdxV2(PackIdx):
 461     """Object representation of a Git pack index (version 2) file."""
 462     def __init__(self, filename, f):
 463         self.name = filename
 464         self.idxnames = [self.name]
 465         self.map = mmap_read(f)
 466         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 467         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 468         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 469         self.fanout.append(0)
 470         self.nsha = self.fanout[255]
 471         self.sha_ofs = 8 + 256*4
 472         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 473         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 474         # Avoid slicing this for individual hashes (very high overhead)
 475         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 476
 477     def __enter__(self):
 478         return self
 479
 480     def __exit__(self, type, value, traceback):
 481         with pending_raise(value, rethrow=False):
 482             self.close()
 483
 484     def __len__(self):
 485         return int(self.nsha)  # int() from long for python 2
 486
 487     def _ofs_from_idx(self, idx):
 488         if idx >= self.nsha or idx < 0:
 489             raise IndexError('invalid pack index index %d' % idx)
 490         ofs_ofs = self.ofstable_ofs + idx * 4
 491         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 492         if ofs & 0x80000000:
 493             idx64 = ofs & 0x7fffffff
 494             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 495             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 496         return ofs
 497
 498     def _idx_to_hash(self, idx):
 499         if idx >= self.nsha or idx < 0:
 500             raise IndexError('invalid pack index index %d' % idx)
 501         ofs = self.sha_ofs + idx * 20
 502         return self.map[ofs : ofs + 20]
 503
 504     def __iter__(self):
 505         start = self.sha_ofs
 506         for ofs in range(start, start + 20 * self.nsha, 20):
 507             yield self.map[ofs : ofs + 20]
 508
 509     def close(self):
 510         if self.map is not None:
 511             self.shatable = None
 512             self.map.close()
 513             self.map = None
 514
 515
 516 _mpi_count = 0
 517 class PackIdxList:
 518     def __init__(self, dir, ignore_midx=False):
 519         global _mpi_count
 520         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 521         _mpi_count += 1
 522         self.dir = dir
 523         self.also = set()
 524         self.packs = []
 525         self.do_bloom = False
 526         self.bloom = None
 527         self.ignore_midx = ignore_midx
 528         self.refresh()
 529
 530     def __del__(self):
 531         global _mpi_count
 532         _mpi_count -= 1
 533         assert(_mpi_count == 0)
 534
 535     def __iter__(self):
 536         return iter(idxmerge(self.packs))
 537
 538     def __len__(self):
 539         return sum(len(pack) for pack in self.packs)
 540
 541     def exists(self, hash, want_source=False):
 542         """Return nonempty if the object exists in the index files."""
 543         global _total_searches
 544         _total_searches += 1
 545         if hash in self.also:
 546             return True
 547         if self.do_bloom and self.bloom:
 548             if self.bloom.exists(hash):
 549                 self.do_bloom = False
 550             else:
 551                 _total_searches -= 1  # was counted by bloom
 552                 return None
 553         for i in range(len(self.packs)):
 554             p = self.packs[i]
 555             _total_searches -= 1  # will be incremented by sub-pack
 556             ix = p.exists(hash, want_source=want_source)
 557             if ix:
 558                 # reorder so most recently used packs are searched first
 559                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 560                 return ix
 561         self.do_bloom = True
 562         return None
 563
 564     def refresh(self, skip_midx = False):
 565         """Refresh the index list.
 566         This method verifies if .midx files were superseded (e.g. all of its
 567         contents are in another, bigger .midx file) and removes the superseded
 568         files.
 569
 570         If skip_midx is True, all work on .midx files will be skipped and .midx
 571         files will be removed from the list.
 572
 573         The instance variable 'ignore_midx' can force this function to
 574         always act as if skip_midx was True.
 575         """
 576         if self.bloom is not None:
 577             self.bloom.close()
 578         self.bloom = None # Always reopen the bloom as it may have been relaced
 579         self.do_bloom = False
 580         skip_midx = skip_midx or self.ignore_midx
 581         d = dict((p.name, p) for p in self.packs
 582                  if not skip_midx or not isinstance(p, midx.PackMidx))
 583         if os.path.exists(self.dir):
 584             if not skip_midx:
 585                 midxl = []
 586                 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
 587                 # remove any *.midx files from our list that no longer exist
 588                 for ix in list(d.values()):
 589                     if not isinstance(ix, midx.PackMidx):
 590                         continue
 591                     if ix.name in midxes:
 592                         continue
 593                     # remove the midx
 594                     del d[ix.name]
 595                     ix.close()
 596                     self.packs.remove(ix)
 597                 for ix in self.packs:
 598                     if isinstance(ix, midx.PackMidx):
 599                         for name in ix.idxnames:
 600                             d[os.path.join(self.dir, name)] = ix
 601                 for full in midxes:
 602                     if not d.get(full):
 603                         mx = midx.PackMidx(full)
 604                         (mxd, mxf) = os.path.split(mx.name)
 605                         broken = False
 606                         for n in mx.idxnames:
 607                             if not os.path.exists(os.path.join(mxd, n)):
 608                                 log(('warning: index %s missing\n'
 609                                      '  used by %s\n')
 610                                     % (path_msg(n), path_msg(mxf)))
 611                                 broken = True
 612                         if broken:
 613                             mx.close()
 614                             del mx
 615                             unlink(full)
 616                         else:
 617                             midxl.append(mx)
 618                 midxl.sort(key=lambda ix:
 619                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 620                 for ix in midxl:
 621                     any_needed = False
 622                     for sub in ix.idxnames:
 623                         found = d.get(os.path.join(self.dir, sub))
 624                         if not found or isinstance(found, PackIdx):
 625                             # doesn't exist, or exists but not in a midx
 626                             any_needed = True
 627                             break
 628                     if any_needed:
 629                         d[ix.name] = ix
 630                         for name in ix.idxnames:
 631                             d[os.path.join(self.dir, name)] = ix
 632                     elif not ix.force_keep:
 633                         debug1('midx: removing redundant: %s\n'
 634                                % path_msg(os.path.basename(ix.name)))
 635                         ix.close()
 636                         unlink(ix.name)
 637             for full in glob.glob(os.path.join(self.dir, b'*.idx')):
 638                 if not d.get(full):
 639                     try:
 640                         ix = open_idx(full)
 641                     except GitError as e:
 642                         add_error(e)
 643                         continue
 644                     d[full] = ix
 645             bfull = os.path.join(self.dir, b'bup.bloom')
 646             if self.bloom is None and os.path.exists(bfull):
 647                 self.bloom = bloom.ShaBloom(bfull)
 648             self.packs = list(set(d.values()))
 649             self.packs.sort(reverse=True, key=lambda x: len(x))
 650             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 651                 self.do_bloom = True
 652             else:
 653                 self.bloom = None
 654         debug1('PackIdxList: using %d index%s.\n'
 655             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 656
 657     def add(self, hash):
 658         """Insert an additional object in the list."""
 659         self.also.add(hash)
 660
 661
 662 def open_idx(filename):
 663     if filename.endswith(b'.idx'):
 664         f = open(filename, 'rb')
 665         header = f.read(8)
 666         if header[0:4] == b'\377tOc':
 667             version = struct.unpack('!I', header[4:8])[0]
 668             if version == 2:
 669                 return PackIdxV2(filename, f)
 670             else:
 671                 raise GitError('%s: expected idx file version 2, got %d'
 672                                % (path_msg(filename), version))
 673         elif len(header) == 8 and header[0:4] < b'\377tOc':
 674             return PackIdxV1(filename, f)
 675         else:
 676             raise GitError('%s: unrecognized idx file header'
 677                            % path_msg(filename))
 678     elif filename.endswith(b'.midx'):
 679         return midx.PackMidx(filename)
 680     else:
 681         raise GitError('idx filenames must end with .idx or .midx')
 682
 683
 684 def idxmerge(idxlist, final_progress=True):
 685     """Generate a list of all the objects reachable in a PackIdxList."""
 686     def pfunc(count, total):
 687         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 688                   % (count*100.0/total, count, total))
 689     def pfinal(count, total):
 690         if final_progress:
 691             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 692                      % (100, total, total))
 693     return merge_iter(idxlist, 10024, pfunc, pfinal)
 694
 695
 696 def create_commit_blob(tree, parent,
 697                        author, adate_sec, adate_tz,
 698                        committer, cdate_sec, cdate_tz,
 699                        msg):
 700     if adate_tz is not None:
 701         adate_str = _git_date_str(adate_sec, adate_tz)
 702     else:
 703         adate_str = _local_git_date_str(adate_sec)
 704     if cdate_tz is not None:
 705         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 706     else:
 707         cdate_str = _local_git_date_str(cdate_sec)
 708     l = []
 709     if tree: l.append(b'tree %s' % hexlify(tree))
 710     if parent: l.append(b'parent %s' % hexlify(parent))
 711     if author: l.append(b'author %s %s' % (author, adate_str))
 712     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 713     l.append(b'')
 714     l.append(msg)
 715     return b'\n'.join(l)
 716
 717
 718 def _make_objcache():
 719     return PackIdxList(repo(b'objects/pack'))
 720
 721 # bup-gc assumes that it can disable all PackWriter activities
 722 # (bloom/midx/cache) via the constructor and close() arguments.
 723
 724 class PackWriter:
 725     """Writes Git objects inside a pack file."""
 726     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 727                  run_midx=True, on_pack_finish=None,
 728                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 729         self.repo_dir = repo_dir or repo()
 730         self.file = None
 731         self.parentfd = None
 732         self.count = 0
 733         self.outbytes = 0
 734         self.filename = None
 735         self.idx = None
 736         self.objcache_maker = objcache_maker
 737         self.objcache = None
 738         self.compression_level = compression_level
 739         self.run_midx=run_midx
 740         self.on_pack_finish = on_pack_finish
 741         if not max_pack_size:
 742             max_pack_size = git_config_get(b'pack.packSizeLimit',
 743                                            repo_dir=self.repo_dir,
 744                                            opttype='int')
 745             if not max_pack_size:
 746                 # larger packs slow down pruning
 747                 max_pack_size = 1000 * 1000 * 1000
 748         self.max_pack_size = max_pack_size
 749         # cache memory usage is about 83 bytes per object
 750         self.max_pack_objects = max_pack_objects if max_pack_objects \
 751                                 else max(1, self.max_pack_size // 5000)
 752
 753     def __del__(self):
 754         self.close()
 755
 756     def __enter__(self):
 757         return self
 758
 759     def __exit__(self, type, value, traceback):
 760         with pending_raise(value, rethrow=False):
 761             self.close()
 762
 763     def _open(self):
 764         if not self.file:
 765             objdir = dir = os.path.join(self.repo_dir, b'objects')
 766             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 767             try:
 768                 self.file = os.fdopen(fd, 'w+b')
 769             except:
 770                 os.close(fd)
 771                 raise
 772             try:
 773                 self.parentfd = os.open(objdir, os.O_RDONLY)
 774             except:
 775                 f = self.file
 776                 self.file = None
 777                 f.close()
 778                 raise
 779             assert name.endswith(b'.pack')
 780             self.filename = name[:-5]
 781             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 782             self.idx = PackIdxV2Writer()
 783
 784     def _raw_write(self, datalist, sha):
 785         self._open()
 786         f = self.file
 787         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 788         # the file never has a *partial* blob.  So let's make sure it's
 789         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 790         # to our hashsplit algorithm.)  f.write() does its own buffering,
 791         # but that's okay because we'll flush it in _end().
 792         oneblob = b''.join(datalist)
 793         try:
 794             f.write(oneblob)
 795         except IOError as e:
 796             reraise(GitError(e))
 797         nw = len(oneblob)
 798         crc = zlib.crc32(oneblob) & 0xffffffff
 799         self._update_idx(sha, crc, nw)
 800         self.outbytes += nw
 801         self.count += 1
 802         return nw, crc
 803
 804     def _update_idx(self, sha, crc, size):
 805         assert(sha)
 806         if self.idx:
 807             self.idx.add(sha, crc, self.file.tell() - size)
 808
 809     def _write(self, sha, type, content):
 810         if verbose:
 811             log('>')
 812         if not sha:
 813             sha = calc_hash(type, content)
 814         size, crc = self._raw_write(_encode_packobj(type, content,
 815                                                     self.compression_level),
 816                                     sha=sha)
 817         if self.outbytes >= self.max_pack_size \
 818            or self.count >= self.max_pack_objects:
 819             self.breakpoint()
 820         return sha
 821
 822     def _require_objcache(self):
 823         if self.objcache is None and self.objcache_maker:
 824             self.objcache = self.objcache_maker()
 825         if self.objcache is None:
 826             raise GitError(
 827                     "PackWriter not opened or can't check exists w/o objcache")
 828
 829     def exists(self, id, want_source=False):
 830         """Return non-empty if an object is found in the object cache."""
 831         self._require_objcache()
 832         return self.objcache.exists(id, want_source=want_source)
 833
 834     def just_write(self, sha, type, content):
 835         """Write an object to the pack file without checking for duplication."""
 836         self._write(sha, type, content)
 837         # If nothing else, gc doesn't have/want an objcache
 838         if self.objcache is not None:
 839             self.objcache.add(sha)
 840
 841     def maybe_write(self, type, content):
 842         """Write an object to the pack file if not present and return its id."""
 843         sha = calc_hash(type, content)
 844         if not self.exists(sha):
 845             self._require_objcache()
 846             self.just_write(sha, type, content)
 847         return sha
 848
    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content.

        Goes through maybe_write(), so nothing is written if the object
        cache already has the blob.  Returns the blob's id.
        """
        return self.maybe_write(b'blob', blob)
 852
 853     def new_tree(self, shalist):
 854         """Create a tree object in the pack."""
 855         content = tree_encode(shalist)
 856         return self.maybe_write(b'tree', content)
 857
 858     def new_commit(self, tree, parent,
 859                    author, adate_sec, adate_tz,
 860                    committer, cdate_sec, cdate_tz,
 861                    msg):
 862         """Create a commit object in the pack.  The date_sec values must be
 863         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 864         content = create_commit_blob(tree, parent,
 865                                      author, adate_sec, adate_tz,
 866                                      committer, cdate_sec, cdate_tz,
 867                                      msg)
 868         return self.maybe_write(b'commit', content)
 869
    def _end(self, run_midx=True, abort=False):
        """Finish (or, with abort, discard) the pack currently being written.

        Returns None when no pack is open or when aborting; otherwise
        returns the final path prefix of the pack, i.e. the ".pack" and
        ".idx" files are at that prefix plus the suffix.

        On a normal finish the object count is patched into the header,
        the SHA-1 of the whole file is appended, the index is written,
        both files are renamed into objects/pack, and then (optionally)
        auto_midx() and the on_pack_finish callback are run.
        """
        # Ignores run_midx during abort
        if not self.file:
            return None
        # Detach all per-pack state first, so the writer already looks
        # closed even if something below raises.
        self.file, f = None, self.file
        self.idx, idx = None, self.idx
        self.parentfd, pfd, = None, self.parentfd
        self.objcache = None

        # NOTE(review): finalized() presumably runs the lambda (closing
        # the directory fd) when the block exits -- confirm in
        # bup.helpers.  The file object itself is closed by the "with".
        with finalized(pfd, lambda x: x is not None and os.close(x)), \
             f:

            if abort:
                os.unlink(self.filename + b'.pack')
                return None

            # update object count
            # (the last 4 bytes of the 12 byte header, big-endian)
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert len(cp) == 4
            f.write(cp)

            # calculate the pack sha1sum
            # (over everything written so far; reading leaves the
            # position at EOF, so the digest is appended as a trailer)
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            f.flush()
            fdatasync(f.fileno())
            f.close()

            idx.write(self.filename + b'.idx', packbin)
            # The final name is derived from the pack's own checksum.
            nameprefix = os.path.join(self.repo_dir,
                                      b'objects/pack/pack-' +  hexlify(packbin))
            if os.path.exists(self.filename + b'.map'):
                os.unlink(self.filename + b'.map')
            os.rename(self.filename + b'.pack', nameprefix + b'.pack')
            os.rename(self.filename + b'.idx', nameprefix + b'.idx')
            # pfd is the objects directory opened by _open().
            # NOTE(review): presumably this is meant to make the
            # renames durable -- confirm, since the targets live in
            # objects/pack.
            os.fsync(pfd)
            if run_midx:
                auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
            if self.on_pack_finish:
                self.on_pack_finish(nameprefix)
            return nameprefix
 916
    def abort(self):
        """Remove the pack file from disk.

        The pack in progress is closed and its temporary ".pack" file
        is unlinked without being finished; nothing happens if no pack
        is open.
        """
        self._end(abort=True)
 920
 921     def breakpoint(self):
 922         """Clear byte and object counts and return the last processed id."""
 923         id = self._end(self.run_midx)
 924         self.outbytes = self.count = 0
 925         return id
 926
    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path.

        Returns the path prefix of the finished pack, or None if no
        pack was open.  run_midx is passed through to _end(), where it
        controls whether auto_midx() is run afterwards.
        """
        return self._end(run_midx=run_midx)
 930
 931
 932 class PackIdxV2Writer:
 933     def __init__(self):
 934         self.idx = list(list() for i in range(256))
 935         self.count = 0
 936
 937     def add(self, sha, crc, offs):
 938         assert(sha)
 939         self.count += 1
 940         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 941
 942     def write(self, filename, packbin):
 943         ofs64_count = 0
 944         for section in self.idx:
 945             for entry in section:
 946                 if entry[2] >= 2**31:
 947                     ofs64_count += 1
 948
 949         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 950         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 951         idx_map = None
 952         idx_f = open(filename, 'w+b')
 953         try:
 954             idx_f.truncate(index_len)
 955             fdatasync(idx_f.fileno())
 956             idx_map = mmap_readwrite(idx_f, close=False)
 957             try:
 958                 count = _helpers.write_idx(filename, idx_map, self.idx,
 959                                            self.count)
 960                 assert(count == self.count)
 961                 idx_map.flush()
 962             finally:
 963                 idx_map.close()
 964         finally:
 965             idx_f.close()
 966
 967         idx_f = open(filename, 'a+b')
 968         try:
 969             idx_f.write(packbin)
 970             idx_f.seek(0)
 971             idx_sum = Sha1()
 972             b = idx_f.read(8 + 4*256)
 973             idx_sum.update(b)
 974
 975             for b in chunkyreader(idx_f, 20 * self.count):
 976                 idx_sum.update(b)
 977
 978             for b in chunkyreader(idx_f):
 979                 idx_sum.update(b)
 980             idx_f.write(idx_sum.digest())
 981             fdatasync(idx_f.fileno())
 982         finally:
 983             idx_f.close()
 984
 985
 986 def list_refs(patterns=None, repo_dir=None,
 987               limit_to_heads=False, limit_to_tags=False):
 988     """Yield (refname, hash) tuples for all repository refs unless
 989     patterns are specified.  In that case, only include tuples for
 990     refs matching those patterns (cf. git-show-ref(1)).  The limits
 991     restrict the result items to refs/heads or refs/tags.  If both
 992     limits are specified, items from both sources will be included.
 993
 994     """
 995     argv = [b'git', b'show-ref']
 996     if limit_to_heads:
 997         argv.append(b'--heads')
 998     if limit_to_tags:
 999         argv.append(b'--tags')
1000     argv.append(b'--')
1001     if patterns:
1002         argv.extend(patterns)
1003     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
1004                          close_fds=True)
1005     out = p.stdout.read().strip()
1006     rv = p.wait()  # not fatal
1007     if rv:
1008         assert(not out)
1009     if out:
1010         for d in out.split(b'\n'):
1011             sha, name = d.split(b' ', 1)
1012             yield name, unhexlify(sha)
1013
1014
def read_ref(refname, repo_dir = None):
    """Return the binary id that the head matching refname points to.

    Only refs/heads are considered.  Returns None when nothing
    matches; more than one match is treated as a bug (assertion).
    """
    # Pull at most two results: enough to notice an ambiguous name.
    matches = tuple(islice(list_refs(patterns=[refname],
                                     repo_dir=repo_dir,
                                     limit_to_heads=True),
                           2))
    if not matches:
        return None
    assert(len(matches) == 1)
    _name, oid = matches[0]
    return oid
1024
1025
def rev_list_invocation(ref_or_refs, format=None):
    """Return the "git rev-list" argv for one ref (bytes) or several.

    If format is given it is passed as --pretty=format:FORMAT.  Refs
    may not start with "-" (assertion), and the argv always ends with
    "--".
    """
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    cmd = [b'git', b'rev-list']
    if format:
        cmd.append(b'--pretty=format:' + format)
    for ref in refs:
        # Never let a ref be mistaken for an option.
        assert not ref.startswith(b'-')
        cmd.append(ref)
    cmd.append(b'--')
    return cmd
1040
1041
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".

    Without a format, yield one hex hash at a time.  With a format
    (which requires parse, and vice versa), pass it to rev-list and
    yield (oidx, parse(git_stdout)) for each commit, calling parse
    with the stream positioned just after the rev-list "commit HASH"
    header line.  Raises GitError if git exits with a nonzero status
    once its output has been consumed.

    """
    assert bool(parse) == bool(format)
    proc = subprocess.Popen(rev_list_invocation(ref_or_refs, format=format),
                            env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE,
                            close_fds=True)
    out = proc.stdout
    if format:
        # parse() consumes the formatted text after each header, so the
        # next readline() is expected to be the next "commit" line.
        for header in iter(out.readline, b''):
            oidx = header.strip()
            if not oidx.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(oidx))
            oidx = oidx[7:]
            assert len(oidx) == 40
            yield oidx, parse(out)
    else:
        for line in out:
            yield line.strip()

    status = proc.wait()
    if status:
        raise GitError('git rev-list returned error %d' % status)
1074
1075
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    'committish' is first looked up as a head (via read_ref); failing
    that, a 40 character value is treated as a hex object id and
    looked up in the pack indexes.

    Returns the hash in binary (not hex) form if it is found, None if
    'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Not hex after all.  Python 2 raises TypeError here, but
            # Python 3 raises binascii.Error, which is a ValueError.
            return None

        if pL.exists(oid):
            return oid

    return None
1101
1102
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname at newval using "git update-ref".

    refname must be under refs/heads/ or refs/tags/.  newval and
    oldval are binary ids and are passed to git in hex; an empty
    oldval is passed as an empty argument.  Raises GitError if git
    fails.
    """
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    argv = [b'git', b'update-ref', refname,
            hexlify(newval), hexlify(oldval or b'')]
    proc = subprocess.Popen(argv, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', proc)
1114
1115
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1)).

    refname must start with refs/.  If oldvalue is given it is passed
    along to "git update-ref -d" as the final argument.  This always
    operates on the default repository.  Raises GitError if git fails.
    """
    assert refname.startswith(b'refs/')
    argv = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', proc)
1124
1125
def guess_repo(path=None):
    """Set the global "repodir", without requiring that it exist.

    An explicit path wins; otherwise an already set repodir is kept;
    otherwise the BUP_DIR environment variable is used, falling back
    to ~/.bup.  Code that needs an existing repository would usually
    call check_repo_or_die() rather than this function.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1140
1141
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The repository location is chosen by guess_repo(path).  Raises
    GitError if the parent directory doesn't exist, if the location
    exists but isn't a directory, or if any of the git commands fail.
    Output from git is sent to stderr.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    steps = (
        ('git init', [b'git', b'--bare', b'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        ('git config', [b'git', b'config', b'pack.indexVersion', b'2']),
        # Enable the reflog
        ('git config', [b'git', b'config', b'core.logAllRefUpdates',
                        b'true']),
    )
    for label, argv in steps:
        p = subprocess.Popen(argv, stdout=sys.stderr, env=_gitenv(),
                             close_fds=True)
        _git_wait(label, p)
1165
1166
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not.

    A repository "probably exists" when its objects/pack is a
    directory.  Otherwise an error is logged and the process exits:
    with status 15 when the repository path doesn't exist at all, and
    with status 14 in every other case.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st and stat.S_ISDIR(pack_st.st_mode):
        return
    if not pack_st and not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1182
1183
def is_suitable_git(ver_str):
    """Classify "git --version" output (bytes).

    Returns 'unrecognized' if ver_str doesn't start with
    "git version ", 'insufficient' for versions before 1.5.6
    (including 1.5.6 release candidates), and 'suitable' otherwise.
    Exits with status 13 if the version doesn't start with a digit.
    """
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    version = ver_str[len(prefix):]
    if version.startswith(b'0.'):
        return 'insufficient'
    if version.startswith(b'1.'):
        too_old = (br'1\.[012345]rc',
                   br'1\.[01234]\.',
                   br'1\.5\.[012345]($|\.)',
                   br'1\.5\.6-rc')
        if any(re.match(pattern, version) for pattern in too_old):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', version):
        return 'suitable'
    sys.exit(13)
1203
# Set to True once git has been found acceptable, so that the check
# is only made once per process.
_git_great = None

def require_suitable_git(ver_str=None):
    """Make sure the version of git is suitable, checking at most once.

    Rely on ver_str when provided, rather than invoking the git in the
    path.  Setting BUP_GIT_VERSION_IS_FINE to yes, true, or 1 (in any
    case) in the environment skips the check entirely.

    Raises GitError if the version output isn't recognized, and logs
    an error and exits with status 1 if the version is too old.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1232
1233
1234 class _AbortableIter:
1235     def __init__(self, it, onabort = None):
1236         self.it = it
1237         self.onabort = onabort
1238         self.done = None
1239
1240     def __iter__(self):
1241         return self
1242
1243     def __next__(self):
1244         try:
1245             return next(self.it)
1246         except StopIteration as e:
1247             self.done = True
1248             raise
1249         except:
1250             self.abort()
1251             raise
1252
1253     next = __next__
1254
1255     def abort(self):
1256         """Abort iteration and call the abortion callback, if needed."""
1257         if not self.done:
1258             self.done = True
1259             if self.onabort:
1260                 self.onabort()
1261
1262     def __del__(self):
1263         self.abort()
1264
1265
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    A single "git cat-file --batch" subprocess is started on demand
    and reused across requests; only one request may be in progress at
    a time.
    """
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        # p is the cat-file subprocess (if any), and inprogress is the
        # ref whose data is still being read (if any).
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the subprocess, if there is one.

        With wait, also wait for the subprocess and return its exit
        status.  Returns None otherwise, including when there was no
        subprocess to wait for.
        """
        p = self.p
        if p:
            p.stdout.close()
            p.stdin.close()
        self.p = None
        self.inprogress = None
        # A pipe that never handled a request has no subprocess, so
        # there's nothing to wait for.
        if wait and p:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Close any existing subprocess and start a new one."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        ref must not contain newlines or carriage returns, or start
        with "-".  Raises GitError if cat-file's output ends
        unexpectedly or its header can't be understood.  If reading
        the data fails, the subprocess is closed so that the next
        request starts a fresh one.

        """
        # Start the subprocess if it isn't running (anymore).
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if not hdr:
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        # If the data isn't read to the end, the stream is out of sync
        # with cat-file, so aborting closes the subprocess.
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self.close)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object's data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _join(self, it):
        """Yield the blob data reachable from the object that it describes.

        it must be an iterator as returned by get().  Trees are
        descended into recursively, and commits are followed to their
        tree.  Raises GitError for any other object type.
        """
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of a commit is expected to name its tree.
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1366
1367
# CatPipe instances that cp() has created, keyed by the absolute path
# of their repository.
_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one.

    Without a repo_dir, the default repository is used.
    """
    if not repo_dir:
        repo_dir = repodir or repo()
    key = os.path.abspath(repo_dir)
    pipe = _cp.get(key)
    if not pipe:
        pipe = CatPipe(key)
        _cp[key] = pipe
    return pipe
1381
1382
def close_catpipes():
    """Close, and wait for, every CatPipe that cp() has created.

    The pipes are removed from the cache as they're closed.
    """
    # FIXME: chain exceptions
    while _cp:
        _repo_dir, pipe = _cp.popitem()
        pipe.close(wait=True)
1388
1389
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    The hashes are in binary form, and the names have the leading
    "refs/tags/" removed.
    """
    by_oid = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        # more than one tag can point at the same object
        by_oid.setdefault(oid, []).append(refname[10:])
    return by_oid
1400
1401
class MissingObject(KeyError):
    """KeyError for an object that wasn't found; .oid is its binary id."""
    def __init__(self, oid):
        self.oid = oid
        message = 'object %r is missing' % hexlify(oid)
        super(MissingObject, self).__init__(message)
1406
1407
# One object visited by walk_object().
#
# The path is the mangled path.  When an item is a fragment of a
# chunked file, chunk_path is the path of that fragment within the
# chunked subtree, i.e. ['', '2d3115e', ...], and the top level of a
# chunked file has a chunk_path of [''].  So some chunk subtree of the
# file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
WalkItem = namedtuple('WalkItem',
                      ('oid', 'type', 'mode', 'path', 'chunk_path', 'data'))
1420
1421
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    Objects for which stop_at is true are neither yielded nor
    descended into.  Each item's oid is binary, while stop_at and
    get_ref are given hex ids.  Raises an Exception if an object is
    not a blob, commit, or tree.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (oidx, path, chunk_path, mode).
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            # (That also means get_ref is never consulted for it, so
            # a missing blob goes unnoticed here.)
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        # The first item is the (oidx, type, size) header.
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full, since their
            # content is needed below to find what they refer to.
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            # Parents keep the commit's own path and mode; the tree is
            # pushed last, so it is the next thing popped.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the path stays
                    # put and only the chunk_path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    # Note that the path gets the mangled name.
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))