lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12
  13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  14 from bup.compat import (buffer,
  15                         byte_int, bytes_from_byte, bytes_from_uint,
  16                         environ,
  17                         ExitStack,
  18                         pending_raise,
  19                         reraise)
  20 from bup.io import path_msg
  21 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  22                          exo,
  23                          fdatasync,
  24                          finalized,
  25                          log,
  26                          merge_dict,
  27                          merge_iter,
  28                          mmap_read, mmap_readwrite,
  29                          nullcontext_if_not,
  30                          progress, qprogress, stat_if_exists,
  31                          unlink,
  32                          utc_offset_str)
  33
  34
verbose = 0
repodir = None  # The default repository, once initialized

# Git object type name -> numeric code packed into the first byte of a
# pack object header by _encode_packobj(); _typermap is the inverse
# mapping, used by _decode_packobj().
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in _typemap.items()}


# Lookup counters: incremented by PackIdx._idx_from_hash() and adjusted
# by PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  44
  45
  46 class GitError(Exception):
  47     pass
  48
  49
  50 def _gitenv(repo_dir=None):
  51     if not repo_dir:
  52         repo_dir = repo()
  53     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  54
  55 def _git_wait(cmd, p):
  56     rv = p.wait()
  57     if rv != 0:
  58         raise GitError('%r returned %d' % (cmd, rv))
  59
  60 def _git_exo(cmd, **kwargs):
  61     kwargs['check'] = False
  62     result = exo(cmd, **kwargs)
  63     _, _, proc = result
  64     if proc.returncode != 0:
  65         raise GitError('%r returned %d' % (cmd, proc.returncode))
  66     return result
  67
  68 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  69     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  70     cmd = [b'git', b'config', b'--null']
  71     if cfg_file:
  72         cmd.extend([b'--file', cfg_file])
  73     if opttype == 'int':
  74         cmd.extend([b'--int'])
  75     elif opttype == 'bool':
  76         cmd.extend([b'--bool'])
  77     else:
  78         assert opttype is None
  79     cmd.extend([b'--get', option])
  80     env=None
  81     if repo_dir:
  82         env = _gitenv(repo_dir=repo_dir)
  83     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  84                          close_fds=True)
  85     # with --null, git writes out a trailing \0 after the value
  86     r = p.stdout.read()[:-1]
  87     rc = p.wait()
  88     if rc == 0:
  89         if opttype == 'int':
  90             return int(r)
  91         elif opttype == 'bool':
  92             # git converts to 'true' or 'false'
  93             return r == b'true'
  94         return r
  95     if rc != 1:
  96         raise GitError('%r returned %d' % (cmd, rc))
  97     return None
  98
  99
 100 def parse_tz_offset(s):
 101     """UTC offset in seconds."""
 102     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 103     if bytes_from_byte(s[0]) == b'-':
 104         return - tz_off
 105     return tz_off
 106
 107 def parse_commit_gpgsig(sig):
 108     """Return the original signature bytes.
 109
 110     i.e. with the "gpgsig " header and the leading space character on
 111     each continuation line removed.
 112
 113     """
 114     if not sig:
 115         return None
 116     assert sig.startswith(b'gpgsig ')
 117     sig = sig[7:]
 118     return sig.replace(b'\n ', b'\n')
 119
 120 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
 121 # Make sure that's authoritative.
 122
 123 # See also
 124 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
 125 # The continuation lines have only one leading space.
 126
# A byte allowed at the start or end of a name/mail string: anything
# except space, . , : ; < > " ' NUL and newline.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# A byte allowed in the interior of such a string: anything except
# NUL, newline, < and >.
_content_char = br'[^\0\n<>]'
# Either one or two start/end bytes, or a start/end byte, any number
# of content bytes, and a closing start/end byte.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Sign, two hour digits, two minute digits (minutes 00-59).
_tz_rx = br'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" line.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Whole-commit pattern.  Named groups: tree, parents, author_name,
# author_mail, asec, atz, committer_name, committer_mail, csec, ctz,
# mergetag, gpgsig (optional) and message.  Consumed by parse_commit().
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Captures the hash of each parent line; parse_commit() applies it
# with findall() to the 'parents' group of _commit_rx.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 146
 147 # Note that the author_sec and committer_sec values are (UTC) epoch
 148 # seconds, and for now the mergetag is not included.
 149 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 150                                        'author_name', 'author_mail',
 151                                        'author_sec', 'author_offset',
 152                                        'committer_name', 'committer_mail',
 153                                        'committer_sec', 'committer_offset',
 154                                        'gpgsig',
 155                                        'message'])
 156
 157 def parse_commit(content):
 158     commit_match = re.match(_commit_rx, content)
 159     if not commit_match:
 160         raise Exception('cannot parse commit %r' % content)
 161     matches = commit_match.groupdict()
 162     return CommitInfo(tree=matches['tree'],
 163                       parents=re.findall(_parent_hash_rx, matches['parents']),
 164                       author_name=matches['author_name'],
 165                       author_mail=matches['author_mail'],
 166                       author_sec=int(matches['asec']),
 167                       author_offset=parse_tz_offset(matches['atz']),
 168                       committer_name=matches['committer_name'],
 169                       committer_mail=matches['committer_mail'],
 170                       committer_sec=int(matches['csec']),
 171                       committer_offset=parse_tz_offset(matches['ctz']),
 172                       gpgsig=parse_commit_gpgsig(matches['gpgsig']),
 173                       message=matches['message'])
 174
 175
 176 def get_cat_data(cat_iterator, expected_type):
 177     _, kind, _ = next(cat_iterator)
 178     if kind != expected_type:
 179         raise Exception('expected %r, saw %r' % (expected_type, kind))
 180     return b''.join(cat_iterator)
 181
 182 def get_commit_items(id, cp):
 183     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 184
 185 def _local_git_date_str(epoch_sec):
 186     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 187
 188
 189 def _git_date_str(epoch_sec, tz_offset_sec):
 190     offs =  tz_offset_sec // 60
 191     return b'%d %s%02d%02d' \
 192         % (epoch_sec,
 193            b'+' if offs >= 0 else b'-',
 194            abs(offs) // 60,
 195            abs(offs) % 60)
 196
 197
 198 def repo(sub = b'', repo_dir=None):
 199     """Get the path to the git repository or one of its subdirectories."""
 200     repo_dir = repo_dir or repodir
 201     if not repo_dir:
 202         raise GitError('You should call check_repo_or_die()')
 203
 204     # If there's a .git subdirectory, then the actual repo is in there.
 205     gd = os.path.join(repo_dir, b'.git')
 206     if os.path.exists(gd):
 207         repo_dir = gd
 208
 209     return os.path.join(repo_dir, sub)
 210
 211
 212 _shorten_hash_rx = \
 213     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 214
 215 def shorten_hash(s):
 216     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 217
 218
 219 def repo_rel(path):
 220     full = os.path.abspath(path)
 221     fullrepo = os.path.abspath(repo(b''))
 222     if not fullrepo.endswith(b'/'):
 223         fullrepo += b'/'
 224     if full.startswith(fullrepo):
 225         path = full[len(fullrepo):]
 226     if path.startswith(b'index-cache/'):
 227         path = path[len(b'index-cache/'):]
 228     return shorten_hash(path)
 229
 230
 231 def auto_midx(objdir):
 232     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 233     try:
 234         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 235     except OSError as e:
 236         # make sure 'args' gets printed to help with debugging
 237         add_error('%r: exception: %s' % (args, e))
 238         raise
 239     if rv:
 240         add_error('%r: returned %d' % (args, rv))
 241
 242     args = [path.exe(), b'bloom', b'--dir', objdir]
 243     try:
 244         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 245     except OSError as e:
 246         # make sure 'args' gets printed to help with debugging
 247         add_error('%r: exception: %s' % (args, e))
 248         raise
 249     if rv:
 250         add_error('%r: returned %d' % (args, rv))
 251
 252
 253 def mangle_name(name, mode, gitmode):
 254     """Mangle a file name to present an abstract name for segmented files.
 255     Mangled file names will have the ".bup" extension added to them. If a
 256     file's name already ends with ".bup", a ".bupl" extension is added to
 257     disambiguate normal files from segmented ones.
 258     """
 259     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 260         assert(stat.S_ISDIR(gitmode))
 261         return name + b'.bup'
 262     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 263         return name + b'.bupl'
 264     else:
 265         return name
 266
 267
 268 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 269 def demangle_name(name, mode):
 270     """Remove name mangling from a file name, if necessary.
 271
 272     The return value is a tuple (demangled_filename,mode), where mode is one of
 273     the following:
 274
 275     * BUP_NORMAL  : files that should be read as-is from the repository
 276     * BUP_CHUNKED : files that were chunked and need to be reassembled
 277
 278     For more information on the name mangling algorithm, see mangle_name()
 279     """
 280     if name.endswith(b'.bupl'):
 281         return (name[:-5], BUP_NORMAL)
 282     elif name.endswith(b'.bup'):
 283         return (name[:-4], BUP_CHUNKED)
 284     elif name.endswith(b'.bupm'):
 285         return (name[:-5],
 286                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 287     return (name, BUP_NORMAL)
 288
 289
 290 def calc_hash(type, content):
 291     """Calculate some content's hash in the Git fashion."""
 292     header = b'%s %d\0' % (type, len(content))
 293     sum = Sha1(header)
 294     sum.update(content)
 295     return sum.digest()
 296
 297
 298 def shalist_item_sort_key(ent):
 299     (mode, name, id) = ent
 300     assert(mode+0 == mode)
 301     if stat.S_ISDIR(mode):
 302         return name + b'/'
 303     else:
 304         return name
 305
 306
 307 def tree_encode(shalist):
 308     """Generate a git tree object from (mode,name,hash) tuples."""
 309     shalist = sorted(shalist, key = shalist_item_sort_key)
 310     l = []
 311     for (mode,name,bin) in shalist:
 312         assert(mode)
 313         assert(mode+0 == mode)
 314         assert(name)
 315         assert(len(bin) == 20)
 316         s = b'%o %s\0%s' % (mode,name,bin)
 317         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 318         l.append(s)
 319     return b''.join(l)
 320
 321
 322 def tree_decode(buf):
 323     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 324     ofs = 0
 325     while ofs < len(buf):
 326         z = buf.find(b'\0', ofs)
 327         assert(z > ofs)
 328         spl = buf[ofs:z].split(b' ', 1)
 329         assert(len(spl) == 2)
 330         mode,name = spl
 331         sha = buf[z+1:z+1+20]
 332         ofs = z+1+20
 333         yield (int(mode, 8), name, sha)
 334
 335
 336 def _encode_packobj(type, content, compression_level=1):
 337     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 338         raise ValueError('invalid compression level %s' % compression_level)
 339     szout = b''
 340     sz = len(content)
 341     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 342     sz >>= 4
 343     while 1:
 344         if sz: szbits |= 0x80
 345         szout += bytes_from_uint(szbits)
 346         if not sz:
 347             break
 348         szbits = sz & 0x7f
 349         sz >>= 7
 350     z = zlib.compressobj(compression_level)
 351     yield szout
 352     yield z.compress(content)
 353     yield z.flush()
 354
 355
 356 def _decode_packobj(buf):
 357     assert(buf)
 358     c = byte_int(buf[0])
 359     type = _typermap[(c & 0x70) >> 4]
 360     sz = c & 0x0f
 361     shift = 4
 362     i = 0
 363     while c & 0x80:
 364         i += 1
 365         c = byte_int(buf[i])
 366         sz |= (c & 0x7f) << shift
 367         shift += 7
 368         if not (c & 0x80):
 369             break
 370     return (type, zlib.decompress(buf[i+1:]))
 371
 372
 373 class PackIdx(object):
 374     def find_offset(self, hash):
 375         """Get the offset of an object inside the index file."""
 376         idx = self._idx_from_hash(hash)
 377         if idx != None:
 378             return self._ofs_from_idx(idx)
 379         return None
 380
 381     def exists(self, hash, want_source=False):
 382         """Return nonempty if the object exists in this index."""
 383         if hash and (self._idx_from_hash(hash) != None):
 384             return want_source and os.path.basename(self.name) or True
 385         return None
 386
 387     def _idx_from_hash(self, hash):
 388         global _total_searches, _total_steps
 389         _total_searches += 1
 390         assert(len(hash) == 20)
 391         b1 = byte_int(hash[0])
 392         start = self.fanout[b1-1] # range -1..254
 393         end = self.fanout[b1] # range 0..255
 394         want = hash
 395         _total_steps += 1  # lookup table is a step
 396         while start < end:
 397             _total_steps += 1
 398             mid = start + (end - start) // 2
 399             v = self._idx_to_hash(mid)
 400             if v < want:
 401                 start = mid+1
 402             elif v > want:
 403                 end = mid
 404             else: # got it!
 405                 return mid
 406         return None
 407
 408
 409 class PackIdxV1(PackIdx):
 410     """Object representation of a Git pack index (version 1) file."""
 411     def __init__(self, filename, f):
 412         super(PackIdxV1, self).__init__()
 413         self.closed = False
 414         self.name = filename
 415         self.idxnames = [self.name]
 416         self.map = mmap_read(f)
 417         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 418         self.fanout = array('L', struct.unpack('!256I', self.map))
 419         self.fanout.append(0)  # entry "-1"
 420         self.nsha = self.fanout[255]
 421         self.sha_ofs = 256 * 4
 422         # Avoid slicing shatable for individual hashes (very high overhead)
 423         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 424
 425     def __enter__(self):
 426         return self
 427
 428     def __exit__(self, type, value, traceback):
 429         with pending_raise(value, rethrow=False):
 430             self.close()
 431
 432     def __len__(self):
 433         return int(self.nsha)  # int() from long for python 2
 434
 435     def _ofs_from_idx(self, idx):
 436         if idx >= self.nsha or idx < 0:
 437             raise IndexError('invalid pack index index %d' % idx)
 438         ofs = self.sha_ofs + idx * 24
 439         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 440
 441     def _idx_to_hash(self, idx):
 442         if idx >= self.nsha or idx < 0:
 443             raise IndexError('invalid pack index index %d' % idx)
 444         ofs = self.sha_ofs + idx * 24 + 4
 445         return self.map[ofs : ofs + 20]
 446
 447     def __iter__(self):
 448         start = self.sha_ofs + 4
 449         for ofs in range(start, start + 24 * self.nsha, 24):
 450             yield self.map[ofs : ofs + 20]
 451
 452     def close(self):
 453         self.closed = True
 454         if self.map is not None:
 455             self.shatable = None
 456             self.map.close()
 457             self.map = None
 458
 459     def __del__(self):
 460         assert self.closed
 461
 462
 463 class PackIdxV2(PackIdx):
 464     """Object representation of a Git pack index (version 2) file."""
 465     def __init__(self, filename, f):
 466         super(PackIdxV2, self).__init__()
 467         self.closed = False
 468         self.name = filename
 469         self.idxnames = [self.name]
 470         self.map = mmap_read(f)
 471         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 472         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 473         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 474         self.fanout.append(0)
 475         self.nsha = self.fanout[255]
 476         self.sha_ofs = 8 + 256*4
 477         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 478         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 479         # Avoid slicing this for individual hashes (very high overhead)
 480         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 481
 482     def __enter__(self):
 483         return self
 484
 485     def __exit__(self, type, value, traceback):
 486         with pending_raise(value, rethrow=False):
 487             self.close()
 488
 489     def __len__(self):
 490         return int(self.nsha)  # int() from long for python 2
 491
 492     def _ofs_from_idx(self, idx):
 493         if idx >= self.nsha or idx < 0:
 494             raise IndexError('invalid pack index index %d' % idx)
 495         ofs_ofs = self.ofstable_ofs + idx * 4
 496         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 497         if ofs & 0x80000000:
 498             idx64 = ofs & 0x7fffffff
 499             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 500             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 501         return ofs
 502
 503     def _idx_to_hash(self, idx):
 504         if idx >= self.nsha or idx < 0:
 505             raise IndexError('invalid pack index index %d' % idx)
 506         ofs = self.sha_ofs + idx * 20
 507         return self.map[ofs : ofs + 20]
 508
 509     def __iter__(self):
 510         start = self.sha_ofs
 511         for ofs in range(start, start + 20 * self.nsha, 20):
 512             yield self.map[ofs : ofs + 20]
 513
 514     def close(self):
 515         self.closed = True
 516         if self.map is not None:
 517             self.shatable = None
 518             self.map.close()
 519             self.map = None
 520
 521     def __del__(self):
 522         assert self.closed
 523
 524
# Number of PackIdxList instances currently open; the constructor
# asserts that it is zero before incrementing it.
_mpi_count = 0
class PackIdxList:
    """Searchable collection of the pack indexes found in a directory.

    self.packs holds the open index objects, self.also holds extra
    hashes registered through add(), and self.bloom, when set, is
    consulted by exists() before the packs are searched.
    """
    def __init__(self, dir, ignore_midx=False):
        """Open the indexes in dir by calling refresh().

        If refresh() raises, the instance is closed again before the
        exception propagates.
        """
        global _mpi_count
        # Q: was this also intended to prevent opening multiple repos?
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.open = True
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        try:
            self.refresh()
        except BaseException as ex:
            with pending_raise(ex):
                self.close()

    def close(self):
        """Close the bloom filter and every pack; a no-op once closed."""
        global _mpi_count
        if not self.open:
            assert _mpi_count == 0
            return
        _mpi_count -= 1
        assert _mpi_count == 0
        self.also = None
        # Detach the resources from self before closing them.
        self.bloom, bloom = None, self.bloom
        self.packs, packs = None, self.packs
        self.open = False
        # Each pack is entered into the stack so that its __exit__ runs
        # when the stack unwinds, including when a later step raises.
        with ExitStack() as stack:
            for pack in packs:
                stack.enter_context(pack)
            if bloom:
                bloom.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
            self.close()

    def __del__(self):
        assert not self.open

    def __iter__(self):
        """Iterate over the merged contents of all packs (see idxmerge)."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all packs."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        # hashes registered through add() always count as present
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # possible hit: skip the bloom check until a pack
                # search misses again (see the end of this method)
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        # d maps a path to the index object that covers it, seeded from
        # the packs that are already open
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                # map every idx named by a still-open midx to that midx
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # open the midx files that are not in d yet; one that
                # names a missing idx is deleted instead
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                # largest first; ties broken by most recent mtime
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            # open every *.idx that is not already covered by an entry
            # in d
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            new_packs = set(d.values())
            # close the previously open packs that are no longer needed
            for p in self.packs:
                if not p in new_packs:
                    p.close()
            new_packs = list(new_packs)
            new_packs.sort(reverse=True, key=lambda x: len(x))
            self.packs = new_packs
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            try:
                # only use the bloom filter when it is valid and holds at
                # least as many entries as the packs do
                if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                    self.do_bloom = True
                else:
                    if self.bloom:
                        self.bloom, bloom_tmp = None, self.bloom
                        bloom_tmp.close()
            except BaseException as ex:
                with pending_raise(ex):
                    if self.bloom:
                        self.bloom.close()

        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 709
 710
 711 def open_idx(filename):
 712     if filename.endswith(b'.idx'):
 713         f = open(filename, 'rb')
 714         header = f.read(8)
 715         if header[0:4] == b'\377tOc':
 716             version = struct.unpack('!I', header[4:8])[0]
 717             if version == 2:
 718                 return PackIdxV2(filename, f)
 719             else:
 720                 raise GitError('%s: expected idx file version 2, got %d'
 721                                % (path_msg(filename), version))
 722         elif len(header) == 8 and header[0:4] < b'\377tOc':
 723             return PackIdxV1(filename, f)
 724         else:
 725             raise GitError('%s: unrecognized idx file header'
 726                            % path_msg(filename))
 727     elif filename.endswith(b'.midx'):
 728         return midx.PackMidx(filename)
 729     else:
 730         raise GitError('idx filenames must end with .idx or .midx')
 731
 732
 733 def idxmerge(idxlist, final_progress=True):
 734     """Generate a list of all the objects reachable in a PackIdxList."""
 735     def pfunc(count, total):
 736         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 737                   % (count*100.0/total, count, total))
 738     def pfinal(count, total):
 739         if final_progress:
 740             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 741                      % (100, total, total))
 742     return merge_iter(idxlist, 10024, pfunc, pfinal)
 743
 744
 745 def create_commit_blob(tree, parent,
 746                        author, adate_sec, adate_tz,
 747                        committer, cdate_sec, cdate_tz,
 748                        msg):
 749     if adate_tz is not None:
 750         adate_str = _git_date_str(adate_sec, adate_tz)
 751     else:
 752         adate_str = _local_git_date_str(adate_sec)
 753     if cdate_tz is not None:
 754         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 755     else:
 756         cdate_str = _local_git_date_str(cdate_sec)
 757     l = []
 758     if tree: l.append(b'tree %s' % hexlify(tree))
 759     if parent: l.append(b'parent %s' % hexlify(parent))
 760     if author: l.append(b'author %s %s' % (author, adate_str))
 761     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 762     l.append(b'')
 763     l.append(msg)
 764     return b'\n'.join(l)
 765
 766 def _make_objcache():
 767     return PackIdxList(repo(b'objects/pack'))
 768
 769 # bup-gc assumes that it can disable all PackWriter activities
 770 # (bloom/midx/cache) via the constructor and close() arguments.
 771
 772 class PackWriter(object):
 773     """Writes Git objects inside a pack file."""
 774     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 775                  run_midx=True, on_pack_finish=None,
 776                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 777         self.closed = False
 778         self.repo_dir = repo_dir or repo()
 779         self.file = None
 780         self.parentfd = None
 781         self.count = 0
 782         self.outbytes = 0
 783         self.filename = None
 784         self.idx = None
 785         self.objcache_maker = objcache_maker
 786         self.objcache = None
 787         self.compression_level = compression_level
 788         self.run_midx=run_midx
 789         self.on_pack_finish = on_pack_finish
 790         if not max_pack_size:
 791             max_pack_size = git_config_get(b'pack.packSizeLimit',
 792                                            repo_dir=self.repo_dir,
 793                                            opttype='int')
 794             if not max_pack_size:
 795                 # larger packs slow down pruning
 796                 max_pack_size = 1000 * 1000 * 1000
 797         self.max_pack_size = max_pack_size
 798         # cache memory usage is about 83 bytes per object
 799         self.max_pack_objects = max_pack_objects if max_pack_objects \
 800                                 else max(1, self.max_pack_size // 5000)
 801
 802     def __enter__(self):
 803         return self
 804
 805     def __exit__(self, type, value, traceback):
 806         with pending_raise(value, rethrow=False):
 807             self.close()
 808
 809     def _open(self):
 810         if not self.file:
 811             objdir = dir = os.path.join(self.repo_dir, b'objects')
 812             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 813             try:
 814                 self.file = os.fdopen(fd, 'w+b')
 815             except:
 816                 os.close(fd)
 817                 raise
 818             try:
 819                 self.parentfd = os.open(objdir, os.O_RDONLY)
 820             except:
 821                 f = self.file
 822                 self.file = None
 823                 f.close()
 824                 raise
 825             assert name.endswith(b'.pack')
 826             self.filename = name[:-5]
 827             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 828             self.idx = PackIdxV2Writer()
 829
 830     def _raw_write(self, datalist, sha):
 831         self._open()
 832         f = self.file
 833         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 834         # the file never has a *partial* blob.  So let's make sure it's
 835         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 836         # to our hashsplit algorithm.)  f.write() does its own buffering,
 837         # but that's okay because we'll flush it in _end().
 838         oneblob = b''.join(datalist)
 839         try:
 840             f.write(oneblob)
 841         except IOError as e:
 842             reraise(GitError(e))
 843         nw = len(oneblob)
 844         crc = zlib.crc32(oneblob) & 0xffffffff
 845         self._update_idx(sha, crc, nw)
 846         self.outbytes += nw
 847         self.count += 1
 848         return nw, crc
 849
 850     def _update_idx(self, sha, crc, size):
 851         assert(sha)
 852         if self.idx:
 853             self.idx.add(sha, crc, self.file.tell() - size)
 854
 855     def _write(self, sha, type, content):
 856         if verbose:
 857             log('>')
 858         assert sha
 859         size, crc = self._raw_write(_encode_packobj(type, content,
 860                                                     self.compression_level),
 861                                     sha=sha)
 862         if self.outbytes >= self.max_pack_size \
 863            or self.count >= self.max_pack_objects:
 864             self.breakpoint()
 865         return sha
 866
 867     def _require_objcache(self):
 868         if self.objcache is None and self.objcache_maker:
 869             self.objcache = self.objcache_maker()
 870         if self.objcache is None:
 871             raise GitError(
 872                     "PackWriter not opened or can't check exists w/o objcache")
 873
 874     def exists(self, id, want_source=False):
 875         """Return non-empty if an object is found in the object cache."""
 876         self._require_objcache()
 877         return self.objcache.exists(id, want_source=want_source)
 878
 879     def just_write(self, sha, type, content):
 880         """Write an object to the pack file without checking for duplication."""
 881         self._write(sha, type, content)
 882         # If nothing else, gc doesn't have/want an objcache
 883         if self.objcache is not None:
 884             self.objcache.add(sha)
 885
 886     def maybe_write(self, type, content):
 887         """Write an object to the pack file if not present and return its id."""
 888         sha = calc_hash(type, content)
 889         if not self.exists(sha):
 890             self._require_objcache()
 891             self.just_write(sha, type, content)
 892         return sha
 893
 894     def new_blob(self, blob):
 895         """Create a blob object in the pack with the supplied content."""
 896         return self.maybe_write(b'blob', blob)
 897
 898     def new_tree(self, shalist):
 899         """Create a tree object in the pack."""
 900         content = tree_encode(shalist)
 901         return self.maybe_write(b'tree', content)
 902
 903     def new_commit(self, tree, parent,
 904                    author, adate_sec, adate_tz,
 905                    committer, cdate_sec, cdate_tz,
 906                    msg):
 907         """Create a commit object in the pack.  The date_sec values must be
 908         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 909         content = create_commit_blob(tree, parent,
 910                                      author, adate_sec, adate_tz,
 911                                      committer, cdate_sec, cdate_tz,
 912                                      msg)
 913         return self.maybe_write(b'commit', content)
 914
 915     def _end(self, run_midx=True, abort=False):
 916         # Ignores run_midx during abort
 917         self.parentfd, pfd, = None, self.parentfd
 918         self.file, f = None, self.file
 919         self.idx, idx = None, self.idx
 920         try:
 921             with nullcontext_if_not(self.objcache), \
 922                  finalized(pfd, lambda x: x is not None and os.close(x)), \
 923                  nullcontext_if_not(f):
 924                 if not f:
 925                     return None
 926
 927                 if abort:
 928                     os.unlink(self.filename + b'.pack')
 929                     return None
 930
 931                 # update object count
 932                 f.seek(8)
 933                 cp = struct.pack('!i', self.count)
 934                 assert len(cp) == 4
 935                 f.write(cp)
 936
 937                 # calculate the pack sha1sum
 938                 f.seek(0)
 939                 sum = Sha1()
 940                 for b in chunkyreader(f):
 941                     sum.update(b)
 942                 packbin = sum.digest()
 943                 f.write(packbin)
 944                 f.flush()
 945                 fdatasync(f.fileno())
 946                 f.close()
 947
 948                 idx.write(self.filename + b'.idx', packbin)
 949                 nameprefix = os.path.join(self.repo_dir,
 950                                           b'objects/pack/pack-' +  hexlify(packbin))
 951                 if os.path.exists(self.filename + b'.map'):
 952                     os.unlink(self.filename + b'.map')
 953                 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
 954                 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
 955                 os.fsync(pfd)
 956                 if run_midx:
 957                     auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
 958                 if self.on_pack_finish:
 959                     self.on_pack_finish(nameprefix)
 960                 return nameprefix
 961         finally:
 962             # Must be last -- some of the code above depends on it
 963             self.objcache = None
 964
 965     def abort(self):
 966         """Remove the pack file from disk."""
 967         self.closed = True
 968         self._end(abort=True)
 969
 970     def breakpoint(self):
 971         """Clear byte and object counts and return the last processed id."""
 972         id = self._end(self.run_midx)
 973         self.outbytes = self.count = 0
 974         return id
 975
 976     def close(self, run_midx=True):
 977         """Close the pack file and move it to its definitive path."""
 978         self.closed = True
 979         return self._end(run_midx=run_midx)
 980
 981     def __del__(self):
 982         assert self.closed
 983
 984
 985 class PackIdxV2Writer:
 986     def __init__(self):
 987         self.idx = list(list() for i in range(256))
 988         self.count = 0
 989
 990     def add(self, sha, crc, offs):
 991         assert(sha)
 992         self.count += 1
 993         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 994
 995     def write(self, filename, packbin):
 996         ofs64_count = 0
 997         for section in self.idx:
 998             for entry in section:
 999                 if entry[2] >= 2**31:
1000                     ofs64_count += 1
1001
1002         # Length: header + fan-out + shas-and-crcs + overflow-offsets
1003         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
1004         idx_map = None
1005         idx_f = open(filename, 'w+b')
1006         try:
1007             idx_f.truncate(index_len)
1008             fdatasync(idx_f.fileno())
1009             idx_map = mmap_readwrite(idx_f, close=False)
1010             try:
1011                 count = _helpers.write_idx(filename, idx_map, self.idx,
1012                                            self.count)
1013                 assert(count == self.count)
1014                 idx_map.flush()
1015             finally:
1016                 idx_map.close()
1017         finally:
1018             idx_f.close()
1019
1020         idx_f = open(filename, 'a+b')
1021         try:
1022             idx_f.write(packbin)
1023             idx_f.seek(0)
1024             idx_sum = Sha1()
1025             b = idx_f.read(8 + 4*256)
1026             idx_sum.update(b)
1027
1028             for b in chunkyreader(idx_f, 20 * self.count):
1029                 idx_sum.update(b)
1030
1031             for b in chunkyreader(idx_f):
1032                 idx_sum.update(b)
1033             idx_f.write(idx_sum.digest())
1034             fdatasync(idx_f.fileno())
1035         finally:
1036             idx_f.close()
1037
1038
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield a (refname, hash) tuple for each ref reported by git
    show-ref.  When patterns are given, only refs matching them are
    included (cf. git-show-ref(1)).  limit_to_heads and limit_to_tags
    restrict the results to refs/heads and refs/tags respectively;
    setting both includes refs from both.  Each hash is yielded in
    binary (unhexlified) form.

    """
    cmd = [b'git', b'show-ref']
    if limit_to_heads:
        cmd.append(b'--heads')
    if limit_to_tags:
        cmd.append(b'--tags')
    cmd.append(b'--')
    if patterns:
        cmd.extend(patterns)
    proc = subprocess.Popen(cmd, env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE, close_fds=True)
    output = proc.stdout.read().strip()
    status = proc.wait()
    # A non-zero exit is tolerated, but must not come with output
    if status:
        assert(not output)
    if not output:
        return
    for entry in output.split(b'\n'):
        oidx, refname = entry.split(b' ', 1)
        yield refname, unhexlify(oidx)
1066
1067
def read_ref(refname, repo_dir = None):
    """Return the binary id that the head matching refname points to,
    or None when no head matches."""
    found = list_refs(patterns=[refname], repo_dir=repo_dir,
                      limit_to_heads=True)
    # Take up to two results so that an ambiguous match is detected
    matches = tuple(islice(found, 2))
    if not matches:
        return None
    assert(len(matches) == 1)
    _, oid = matches[0]
    return oid
1077
1078
def rev_list_invocation(ref_or_refs, format=None):
    """Return the git rev-list argv for a single ref (bytes) or an
    iterable of refs, adding --pretty=format:FORMAT when a format is
    given.  Refs starting with a dash are rejected by assertion."""
    refs = (ref_or_refs,) if isinstance(ref_or_refs, bytes) else ref_or_refs
    cmd = [b'git', b'rev-list']
    if format:
        cmd.append(b'--pretty=format:' + format)
    for candidate in refs:
        # A leading dash would be parsed by git as an option
        assert not candidate.startswith(b'-')
        cmd.append(candidate)
    cmd.append(b'--')
    return cmd
1093
1094
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".  Without
    a format, yield each output line stripped, i.e. one hex hash at a
    time.  With a format (which requires parse, and vice versa), pass
    it to rev-list and yield (oidx, parse(git_stdout)) for each
    commit, calling parse with the stream positioned just after the
    rev-list "commit HASH" header line.  Raise GitError once the output
    is exhausted if git rev-list exited with a non-zero status.

    """
    assert bool(parse) == bool(format)
    argv = rev_list_invocation(ref_or_refs, format=format)
    proc = subprocess.Popen(argv,
                            env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE,
                            close_fds=True)
    out = proc.stdout
    if format:
        # readline() returns b'' at EOF, which ends the iteration
        for header in iter(out.readline, b''):
            oidx = header.strip()
            if not oidx.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(oidx))
            oidx = oidx[7:]
            assert len(oidx) == 40
            yield oidx, parse(out)
    else:
        for line in out:
            yield line.strip()

    status = proc.wait()
    if status:
        raise GitError('git rev-list returned error %d' % status)
1127
1128
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    The committish is first looked up as a ref via read_ref(); failing
    that, a 40-character value is treated as a hex object id and looked
    up in the repository's pack indexes.

    Returns the binary (unhexlified) value of the hash if it is found,
    None if 'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Not valid hex.  Python 2 raises TypeError here, while
            # Python 3 raises binascii.Error, a ValueError subclass.
            return None

        with PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) as pL:
            if pL.exists(oid):
                return oid

    return None
1153
1154
def update_ref(refname, newval, oldval, repo_dir=None, force=False):
    """Point refname (under refs/heads/ or refs/tags/) at newval via
    git update-ref.

    With force=True the previous value is not checked, and oldval must
    be None.  With force=False, oldval must be either the expected
    current sha1 or None (for an entirely new branch).  newval and
    oldval are binary ids; they are hexlified for git.
    """
    if force:
        assert oldval is None
        old_args = []
    else:
        # An empty old value is passed when no previous id is expected
        old_args = [hexlify(oldval)] if oldval else [b'']
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    argv = [b'git', b'update-ref', refname, hexlify(newval)] + old_args
    proc = subprocess.Popen(argv,
                            env=_gitenv(repo_dir),
                            close_fds=True)
    _git_wait(b'git update-ref', proc)
1176
1177
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1)).  When
    oldvalue is given it is passed to git as the expected current
    value of the ref."""
    assert refname.startswith(b'refs/')
    argv = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv,
                            env=_gitenv(),
                            close_fds=True)
    _git_wait('git update-ref', proc)
1186
1187
def guess_repo():
    """Return the global repodir if set, else BUP_DIR from the
    environment if set, else ~/.bup.  Usually, if you are interacting
    with a bup repository, you would not be calling this function but
    using check_repo_or_die().

    """
    if repodir:
        return repodir
    return environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1200
1201
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Sets the global repodir to path (or to guess_repo() when path is
    not given), then runs git init and applies bup's required
    configuration.  Raises GitError if the parent directory does not
    exist, if the target exists but is not a directory, or if a git
    command fails.
    """
    global repodir
    repodir = path or guess_repo()
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    settings = (
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        (b'pack.indexVersion', b'2'),
        # Enable the reflog
        (b'core.logAllRefUpdates', b'true'),
    )
    for key, value in settings:
        # Every argv element is bytes, matching the other git calls
        p = subprocess.Popen([b'git', b'config', key, value],
                             stdout=sys.stderr, env=_gitenv(), close_fds=True)
        _git_wait('git config', p)
1226
1227
def check_repo_or_die(path=None):
    """Set the global repodir and return if it looks like a bup
    repository (objects/pack is a directory).  Otherwise log an error
    and exit: status 15 when the repository path does not exist at
    all, status 14 when it exists but is not a repository."""
    global repodir
    repodir = path or guess_repo()
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1244
1245
def is_suitable_git(ver_str):
    """Classify "git --version" output (bytes) as 'unrecognized',
    'insufficient', or 'suitable'.  Versions before 1.5.6, including
    the 1.5.6 release candidates, are insufficient.  Exits with status
    13 when the text after "git version " does not start with a
    digit."""
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    version = ver_str[len(prefix):]
    if version.startswith(b'0.'):
        return 'insufficient'
    if version.startswith(b'1.'):
        too_old = (br'1\.[012345]rc',
                   br'1\.[01234]\.',
                   br'1\.5\.[012345]($|\.)',
                   br'1\.5\.6-rc')
        if any(re.match(pattern, version) for pattern in too_old):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', version):
        return 'suitable'
    sys.exit(13)
1265
# Set to True once the git version has been accepted, so the check
# runs at most once.
_git_great = None

def require_suitable_git(ver_str=None):
    """Ensure that the version of git is suitable, remembering a
    positive result in _git_great.

    Rely on ver_str when provided, rather than invoking the git in the
    path.  The check is skipped entirely when BUP_GIT_VERSION_IS_FINE
    is set to yes, true, or 1 in the environment.  Raise GitError if
    the version output is unrecognized; log an error and exit with
    status 1 if the version is insufficient.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1294
1295
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    The 'git cat-file --batch' subprocess is started lazily by get()
    and restarted when it is found to have exited.  Only one get()
    may be in progress at a time.

    """
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        # p is the running subprocess (or None); inprogress is the ref
        # currently being read by get() (or None)
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the subprocess, if there is one.  With
        wait, also wait for it to exit and return its exit status.
        Return None otherwise, including when no subprocess had been
        started."""
        self.p, p = None, self.p
        self.inprogress = None
        if not p:
            # Nothing was ever started (or it was already closed), so
            # there is nothing to close or wait for.
            return None
        try:
            p.stdout.close()
        finally:
            # This will handle pending exceptions correctly once
            # we drop py2
            p.stdin.close()
        if wait:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Close any existing subprocess and start a new one."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Raise GitError if cat-file closes its output unexpectedly or
        sends a malformed header.  If an Exception occurs while the
        data is being read, the subprocess is closed.

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # The ref is sent as a single line on the batch stdin
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if not hdr:
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        try:
            yield oidx, typ, size
            for blob in chunkyreader(self.p.stdout, size):
                yield blob
            # The object data is followed by a single newline
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception as ex:
            with pending_raise(ex):
                self.close()

    def _join(self, it):
        """Yield the blob data reachable from the object whose get()
        iterator is it, recursing through trees and commits."""
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # Only the first line of the commit, its tree, is followed
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Yield the content of all blobs that can be reached from an
        object.  The hash given in 'id' must point to a blob, a tree
        or a commit.  The content of all blobs that can be seen from
        trees or commits is yielded in turn.
        """
        for d in self._join(self.get(id)):
            yield d
1398
1399
# CatPipe instances, keyed by absolute repository path
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir (the default repository when not
    given), creating and caching it on first use."""
    if not repo_dir:
        repo_dir = repodir or repo()
    key = os.path.abspath(repo_dir)
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1413
1414
def close_catpipes():
    """Remove every cached CatPipe and close it, waiting for its
    subprocess to exit."""
    # FIXME: chain exceptions
    while _cp:
        pipe = _cp.popitem()[1]
        pipe.close(wait=True)
1420
1421
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    prefix = b'refs/tags/'
    names_by_oid = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(prefix)
        # more than one tag can point at the same object
        names_by_oid.setdefault(oid, []).append(refname[len(prefix):])
    return names_by_oid
1432
1433
class MissingObject(KeyError):
    """Raised when an object id cannot be found; the binary id is kept
    in the oid attribute."""
    def __init__(self, oid):
        message = 'object %r is missing' % hexlify(oid)
        super(MissingObject, self).__init__(message)
        self.oid = oid
1438
1439
# One object encountered by walk_object().
#
# The path is the mangled path.  If an item represents a fragment of a
# chunked file, chunk_path is the chunked subtree path for that chunk,
# e.g. ['', '2d3115e', ...].  The top-level path for a chunked file
# has a chunk_path of [''].  So some chunk subtree of the file
# '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
WalkItem = namedtuple('WalkItem', 'oid type mode path chunk_path data')
1452
1453
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    Commits are followed to their parents and to their tree; trees are
    followed to each of their entries.  An object for which stop_at
    returns true is neither yielded nor followed.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (oidx, parent_path, chunk_path, mode)
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        # The first item is the (oidx, type, size) header
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commit and tree content is always read; it is needed
            # below to find what to visit next
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            # Parents inherit this commit's path, chunk_path and mode
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: only the
                    # chunk_path grows
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))
1520                                 mode))