lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import os, sys, zlib, subprocess, struct, stat, re, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from contextlib import ExitStack
  12 from itertools import islice
  13 from shutil import rmtree
  14
  15 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  16 from bup.compat import (buffer,
  17                         byte_int, bytes_from_byte, bytes_from_uint,
  18                         environ,
  19                         pending_raise,
  20                         reraise)
  21 from bup.io import path_msg
  22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  23                          exo,
  24                          fdatasync,
  25                          finalized,
  26                          log,
  27                          merge_dict,
  28                          merge_iter,
  29                          mmap_read, mmap_readwrite,
  30                          nullcontext_if_not,
  31                          progress, qprogress, stat_if_exists,
  32                          temp_dir,
  33                          unlink,
  34                          utc_offset_str)
  35
  36
# Module-level verbosity setting.
# NOTE(review): not read anywhere in the visible part of this file.
verbose = 0
repodir = None  # The default repository, once initialized

# Git object type name -> numeric type code written into the first
# byte of a pack object header by _encode_packobj(), and the reverse
# mapping used by _decode_packobj().
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in _typemap.items()}


# Running counters maintained by the index lookup code
# (PackIdx._idx_from_hash() and PackIdxList.exists()).
_total_searches = 0
_total_steps = 0
  46
  47
  48 class GitError(Exception):
  49     pass
  50
  51
  52 def _gitenv(repo_dir=None):
  53     if not repo_dir:
  54         repo_dir = repo()
  55     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  56
  57 def _git_wait(cmd, p):
  58     rv = p.wait()
  59     if rv != 0:
  60         raise GitError('%r returned %d' % (cmd, rv))
  61
  62 def _git_exo(cmd, **kwargs):
  63     kwargs['check'] = False
  64     result = exo(cmd, **kwargs)
  65     _, _, proc = result
  66     if proc.returncode != 0:
  67         raise GitError('%r returned %d' % (cmd, proc.returncode))
  68     return result
  69
  70 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  71     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  72     cmd = [b'git', b'config', b'--null']
  73     if cfg_file:
  74         cmd.extend([b'--file', cfg_file])
  75     if opttype == 'int':
  76         cmd.extend([b'--int'])
  77     elif opttype == 'bool':
  78         cmd.extend([b'--bool'])
  79     else:
  80         assert opttype is None
  81     cmd.extend([b'--get', option])
  82     env=None
  83     if repo_dir:
  84         env = _gitenv(repo_dir=repo_dir)
  85     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  86                          close_fds=True)
  87     # with --null, git writes out a trailing \0 after the value
  88     r = p.stdout.read()[:-1]
  89     rc = p.wait()
  90     if rc == 0:
  91         if opttype == 'int':
  92             return int(r)
  93         elif opttype == 'bool':
  94             # git converts to 'true' or 'false'
  95             return r == b'true'
  96         return r
  97     if rc != 1:
  98         raise GitError('%r returned %d' % (cmd, rc))
  99     return None
 100
 101
 102 def parse_tz_offset(s):
 103     """UTC offset in seconds."""
 104     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 105     if bytes_from_byte(s[0]) == b'-':
 106         return - tz_off
 107     return tz_off
 108
 109 def parse_commit_gpgsig(sig):
 110     """Return the original signature bytes.
 111
 112     i.e. with the "gpgsig " header and the leading space character on
 113     each continuation line removed.
 114
 115     """
 116     if not sig:
 117         return None
 118     assert sig.startswith(b'gpgsig ')
 119     sig = sig[7:]
 120     return sig.replace(b'\n ', b'\n')
 121
 122 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
 123 # Make sure that's authoritative.
 124
 125 # See also
 126 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
 127 # The continuation lines have only one leading space.
 128
# A character allowed at the start or end of a name/mail string:
# anything except space, the punctuation . , : ; < > " ', NUL and newline.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# A character allowed in the middle: anything except NUL, newline, < and >.
_content_char = br'[^\0\n<>]'
# Either one or two start/end characters, or a start/end character
# followed by any number of content characters and a final start/end
# character.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# A sign, two hour digits, and two minute digits (minutes 00-59).
_tz_rx = br'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" line, including its newline.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Whole-commit pattern.  The newlines inside the literal are part of
# the pattern: tree line, zero or more parent lines, author line,
# committer line (optionally followed by a mergetag), an optional
# gpgsig header with its continuation lines, then the message.
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts the hash from each parent line captured by the 'parents' group.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'gpgsig',
                                       'message'])
 158
 159 def parse_commit(content):
 160     commit_match = re.match(_commit_rx, content)
 161     if not commit_match:
 162         raise Exception('cannot parse commit %r' % content)
 163     matches = commit_match.groupdict()
 164     return CommitInfo(tree=matches['tree'],
 165                       parents=re.findall(_parent_hash_rx, matches['parents']),
 166                       author_name=matches['author_name'],
 167                       author_mail=matches['author_mail'],
 168                       author_sec=int(matches['asec']),
 169                       author_offset=parse_tz_offset(matches['atz']),
 170                       committer_name=matches['committer_name'],
 171                       committer_mail=matches['committer_mail'],
 172                       committer_sec=int(matches['csec']),
 173                       committer_offset=parse_tz_offset(matches['ctz']),
 174                       gpgsig=parse_commit_gpgsig(matches['gpgsig']),
 175                       message=matches['message'])
 176
 177
 178 def get_cat_data(cat_iterator, expected_type):
 179     _, kind, _ = next(cat_iterator)
 180     if kind != expected_type:
 181         raise Exception('expected %r, saw %r' % (expected_type, kind))
 182     return b''.join(cat_iterator)
 183
 184 def get_commit_items(id, cp):
 185     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 186
 187 def _local_git_date_str(epoch_sec):
 188     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 189
 190
 191 def _git_date_str(epoch_sec, tz_offset_sec):
 192     offs =  tz_offset_sec // 60
 193     return b'%d %s%02d%02d' \
 194         % (epoch_sec,
 195            b'+' if offs >= 0 else b'-',
 196            abs(offs) // 60,
 197            abs(offs) % 60)
 198
 199
 200 def repo(sub = b'', repo_dir=None):
 201     """Get the path to the git repository or one of its subdirectories."""
 202     repo_dir = repo_dir or repodir
 203     if not repo_dir:
 204         raise GitError('You should call check_repo_or_die()')
 205
 206     # If there's a .git subdirectory, then the actual repo is in there.
 207     gd = os.path.join(repo_dir, b'.git')
 208     if os.path.exists(gd):
 209         repo_dir = gd
 210
 211     return os.path.join(repo_dir, sub)
 212
 213
 214 _shorten_hash_rx = \
 215     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 216
 217 def shorten_hash(s):
 218     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 219
 220
 221 def repo_rel(path):
 222     full = os.path.abspath(path)
 223     fullrepo = os.path.abspath(repo(b''))
 224     if not fullrepo.endswith(b'/'):
 225         fullrepo += b'/'
 226     if full.startswith(fullrepo):
 227         path = full[len(fullrepo):]
 228     if path.startswith(b'index-cache/'):
 229         path = path[len(b'index-cache/'):]
 230     return shorten_hash(path)
 231
 232
 233 def auto_midx(objdir):
 234     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 235     try:
 236         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 237     except OSError as e:
 238         # make sure 'args' gets printed to help with debugging
 239         add_error('%r: exception: %s' % (args, e))
 240         raise
 241     if rv:
 242         add_error('%r: returned %d' % (args, rv))
 243
 244     args = [path.exe(), b'bloom', b'--dir', objdir]
 245     try:
 246         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 247     except OSError as e:
 248         # make sure 'args' gets printed to help with debugging
 249         add_error('%r: exception: %s' % (args, e))
 250         raise
 251     if rv:
 252         add_error('%r: returned %d' % (args, rv))
 253
 254
 255 def mangle_name(name, mode, gitmode):
 256     """Mangle a file name to present an abstract name for segmented files.
 257     Mangled file names will have the ".bup" extension added to them. If a
 258     file's name already ends with ".bup", a ".bupl" extension is added to
 259     disambiguate normal files from segmented ones.
 260     """
 261     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 262         assert(stat.S_ISDIR(gitmode))
 263         return name + b'.bup'
 264     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 265         return name + b'.bupl'
 266     else:
 267         return name
 268
 269
 270 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 271 def demangle_name(name, mode):
 272     """Remove name mangling from a file name, if necessary.
 273
 274     The return value is a tuple (demangled_filename,mode), where mode is one of
 275     the following:
 276
 277     * BUP_NORMAL  : files that should be read as-is from the repository
 278     * BUP_CHUNKED : files that were chunked and need to be reassembled
 279
 280     For more information on the name mangling algorithm, see mangle_name()
 281     """
 282     if name.endswith(b'.bupl'):
 283         return (name[:-5], BUP_NORMAL)
 284     elif name.endswith(b'.bup'):
 285         return (name[:-4], BUP_CHUNKED)
 286     elif name.endswith(b'.bupm'):
 287         return (name[:-5],
 288                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 289     return (name, BUP_NORMAL)
 290
 291
 292 def calc_hash(type, content):
 293     """Calculate some content's hash in the Git fashion."""
 294     header = b'%s %d\0' % (type, len(content))
 295     sum = Sha1(header)
 296     sum.update(content)
 297     return sum.digest()
 298
 299
 300 def shalist_item_sort_key(ent):
 301     (mode, name, id) = ent
 302     assert(mode+0 == mode)
 303     if stat.S_ISDIR(mode):
 304         return name + b'/'
 305     else:
 306         return name
 307
 308
 309 def tree_encode(shalist):
 310     """Generate a git tree object from (mode,name,hash) tuples."""
 311     shalist = sorted(shalist, key = shalist_item_sort_key)
 312     l = []
 313     for (mode,name,bin) in shalist:
 314         assert(mode)
 315         assert(mode+0 == mode)
 316         assert(name)
 317         assert(len(bin) == 20)
 318         s = b'%o %s\0%s' % (mode,name,bin)
 319         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 320         l.append(s)
 321     return b''.join(l)
 322
 323
 324 def tree_decode(buf):
 325     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 326     ofs = 0
 327     while ofs < len(buf):
 328         z = buf.find(b'\0', ofs)
 329         assert(z > ofs)
 330         spl = buf[ofs:z].split(b' ', 1)
 331         assert(len(spl) == 2)
 332         mode,name = spl
 333         sha = buf[z+1:z+1+20]
 334         ofs = z+1+20
 335         yield (int(mode, 8), name, sha)
 336
 337
 338 def _encode_packobj(type, content, compression_level=1):
 339     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 340         raise ValueError('invalid compression level %s' % compression_level)
 341     szout = b''
 342     sz = len(content)
 343     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 344     sz >>= 4
 345     while 1:
 346         if sz: szbits |= 0x80
 347         szout += bytes_from_uint(szbits)
 348         if not sz:
 349             break
 350         szbits = sz & 0x7f
 351         sz >>= 7
 352     z = zlib.compressobj(compression_level)
 353     yield szout
 354     yield z.compress(content)
 355     yield z.flush()
 356
 357
 358 def _decode_packobj(buf):
 359     assert(buf)
 360     c = byte_int(buf[0])
 361     type = _typermap[(c & 0x70) >> 4]
 362     sz = c & 0x0f
 363     shift = 4
 364     i = 0
 365     while c & 0x80:
 366         i += 1
 367         c = byte_int(buf[i])
 368         sz |= (c & 0x7f) << shift
 369         shift += 7
 370         if not (c & 0x80):
 371             break
 372     return (type, zlib.decompress(buf[i+1:]))
 373
 374
 375 class PackIdx(object):
 376     def find_offset(self, hash):
 377         """Get the offset of an object inside the index file."""
 378         idx = self._idx_from_hash(hash)
 379         if idx != None:
 380             return self._ofs_from_idx(idx)
 381         return None
 382
 383     def exists(self, hash, want_source=False):
 384         """Return nonempty if the object exists in this index."""
 385         if hash and (self._idx_from_hash(hash) != None):
 386             return want_source and os.path.basename(self.name) or True
 387         return None
 388
 389     def _idx_from_hash(self, hash):
 390         global _total_searches, _total_steps
 391         _total_searches += 1
 392         assert(len(hash) == 20)
 393         b1 = byte_int(hash[0])
 394         start = self.fanout[b1-1] # range -1..254
 395         end = self.fanout[b1] # range 0..255
 396         want = hash
 397         _total_steps += 1  # lookup table is a step
 398         while start < end:
 399             _total_steps += 1
 400             mid = start + (end - start) // 2
 401             v = self._idx_to_hash(mid)
 402             if v < want:
 403                 start = mid+1
 404             elif v > want:
 405                 end = mid
 406             else: # got it!
 407                 return mid
 408         return None
 409
 410
 411 class PackIdxV1(PackIdx):
 412     """Object representation of a Git pack index (version 1) file."""
 413     def __init__(self, filename, f):
 414         super(PackIdxV1, self).__init__()
 415         self.closed = False
 416         self.name = filename
 417         self.idxnames = [self.name]
 418         self.map = mmap_read(f)
 419         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 420         self.fanout = array('L', struct.unpack('!256I', self.map))
 421         self.fanout.append(0)  # entry "-1"
 422         self.nsha = self.fanout[255]
 423         self.sha_ofs = 256 * 4
 424         # Avoid slicing shatable for individual hashes (very high overhead)
 425         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 426
 427     def __enter__(self):
 428         return self
 429
 430     def __exit__(self, type, value, traceback):
 431         with pending_raise(value, rethrow=False):
 432             self.close()
 433
 434     def __len__(self):
 435         return int(self.nsha)  # int() from long for python 2
 436
 437     def _ofs_from_idx(self, idx):
 438         if idx >= self.nsha or idx < 0:
 439             raise IndexError('invalid pack index index %d' % idx)
 440         ofs = self.sha_ofs + idx * 24
 441         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 442
 443     def _idx_to_hash(self, idx):
 444         if idx >= self.nsha or idx < 0:
 445             raise IndexError('invalid pack index index %d' % idx)
 446         ofs = self.sha_ofs + idx * 24 + 4
 447         return self.map[ofs : ofs + 20]
 448
 449     def __iter__(self):
 450         start = self.sha_ofs + 4
 451         for ofs in range(start, start + 24 * self.nsha, 24):
 452             yield self.map[ofs : ofs + 20]
 453
 454     def close(self):
 455         self.closed = True
 456         if self.map is not None:
 457             self.shatable = None
 458             self.map.close()
 459             self.map = None
 460
 461     def __del__(self):
 462         assert self.closed
 463
 464
 465 class PackIdxV2(PackIdx):
 466     """Object representation of a Git pack index (version 2) file."""
 467     def __init__(self, filename, f):
 468         super(PackIdxV2, self).__init__()
 469         self.closed = False
 470         self.name = filename
 471         self.idxnames = [self.name]
 472         self.map = mmap_read(f)
 473         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 474         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 475         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 476         self.fanout.append(0)
 477         self.nsha = self.fanout[255]
 478         self.sha_ofs = 8 + 256*4
 479         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 480         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 481         # Avoid slicing this for individual hashes (very high overhead)
 482         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 483
 484     def __enter__(self):
 485         return self
 486
 487     def __exit__(self, type, value, traceback):
 488         with pending_raise(value, rethrow=False):
 489             self.close()
 490
 491     def __len__(self):
 492         return int(self.nsha)  # int() from long for python 2
 493
 494     def _ofs_from_idx(self, idx):
 495         if idx >= self.nsha or idx < 0:
 496             raise IndexError('invalid pack index index %d' % idx)
 497         ofs_ofs = self.ofstable_ofs + idx * 4
 498         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 499         if ofs & 0x80000000:
 500             idx64 = ofs & 0x7fffffff
 501             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 502             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 503         return ofs
 504
 505     def _idx_to_hash(self, idx):
 506         if idx >= self.nsha or idx < 0:
 507             raise IndexError('invalid pack index index %d' % idx)
 508         ofs = self.sha_ofs + idx * 20
 509         return self.map[ofs : ofs + 20]
 510
 511     def __iter__(self):
 512         start = self.sha_ofs
 513         for ofs in range(start, start + 20 * self.nsha, 20):
 514             yield self.map[ofs : ofs + 20]
 515
 516     def close(self):
 517         self.closed = True
 518         if self.map is not None:
 519             self.shatable = None
 520             self.map.close()
 521             self.map = None
 522
 523     def __del__(self):
 524         assert self.closed
 525
 526
# Number of PackIdxList instances currently open; the constructor
# asserts that it is zero, so only one may be open at a time.
_mpi_count = 0
class PackIdxList:
    """The set of pack indexes (.idx and .midx) found in a directory.

    Supports membership tests via exists(), optionally short-circuited
    by a bup.bloom filter found in the same directory.  Must be closed
    (explicitly or by use as a context manager) before it is garbage
    collected.
    """
    def __init__(self, dir, ignore_midx=False):
        """Open the indexes in dir; see refresh() for ignore_midx."""
        global _mpi_count
        # Q: was this also intended to prevent opening multiple repos?
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.open = True
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        try:
            self.refresh()
        except BaseException as ex:
            # Don't leave a half-initialized instance open.
            with pending_raise(ex):
                self.close()

    def close(self):
        """Close every pack and the bloom filter; a no-op if already closed."""
        global _mpi_count
        if not self.open:
            assert _mpi_count == 0
            return
        _mpi_count -= 1
        assert _mpi_count == 0
        self.also = None
        # Detach the resources from self before closing them.
        self.bloom, bloom = None, self.bloom
        self.packs, packs = None, self.packs
        self.open = False
        # Entering each pack as a context registers its __exit__, so
        # all of them are exited when the stack unwinds.
        with ExitStack() as stack:
            for pack in packs:
                stack.enter_context(pack)
            if bloom:
                bloom.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
            self.close()

    def __del__(self):
        assert not self.open

    def __iter__(self):
        """Iterate over idxmerge() of all the packs."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all the packs."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Stop consulting the bloom until a lookup misses again.
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        # d maps index names to the index object that covers them.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                # Record which names the surviving midxes already cover.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open midx files we don't know yet; delete any that
                # refer to an index that no longer exists.
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the largest midx first, then the newest.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            # Open every .idx that isn't already covered by an entry in d.
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            new_packs = set(d.values())
            # Close the packs that are no longer in use.
            for p in self.packs:
                if not p in new_packs:
                    p.close()
            new_packs = list(new_packs)
            new_packs.sort(reverse=True, key=lambda x: len(x))
            self.packs = new_packs
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            try:
                # Only use a bloom that is valid and has at least as
                # many entries as the indexes; otherwise drop it.
                if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                    self.do_bloom = True
                else:
                    if self.bloom:
                        self.bloom, bloom_tmp = None, self.bloom
                        bloom_tmp.close()
            except BaseException as ex:
                with pending_raise(ex):
                    if self.bloom:
                        self.bloom.close()

        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 711
 712
 713 def open_idx(filename):
 714     if filename.endswith(b'.idx'):
 715         f = open(filename, 'rb')
 716         header = f.read(8)
 717         if header[0:4] == b'\377tOc':
 718             version = struct.unpack('!I', header[4:8])[0]
 719             if version == 2:
 720                 return PackIdxV2(filename, f)
 721             else:
 722                 raise GitError('%s: expected idx file version 2, got %d'
 723                                % (path_msg(filename), version))
 724         elif len(header) == 8 and header[0:4] < b'\377tOc':
 725             return PackIdxV1(filename, f)
 726         else:
 727             raise GitError('%s: unrecognized idx file header'
 728                            % path_msg(filename))
 729     elif filename.endswith(b'.midx'):
 730         return midx.PackMidx(filename)
 731     else:
 732         raise GitError('idx filenames must end with .idx or .midx')
 733
 734
 735 def idxmerge(idxlist, final_progress=True):
 736     """Generate a list of all the objects reachable in a PackIdxList."""
 737     def pfunc(count, total):
 738         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 739                   % (count*100.0/total, count, total))
 740     def pfinal(count, total):
 741         if final_progress:
 742             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 743                      % (100, total, total))
 744     return merge_iter(idxlist, 10024, pfunc, pfinal)
 745
 746
 747 def create_commit_blob(tree, parent,
 748                        author, adate_sec, adate_tz,
 749                        committer, cdate_sec, cdate_tz,
 750                        msg):
 751     if adate_tz is not None:
 752         adate_str = _git_date_str(adate_sec, adate_tz)
 753     else:
 754         adate_str = _local_git_date_str(adate_sec)
 755     if cdate_tz is not None:
 756         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 757     else:
 758         cdate_str = _local_git_date_str(cdate_sec)
 759     l = []
 760     if tree: l.append(b'tree %s' % hexlify(tree))
 761     if parent: l.append(b'parent %s' % hexlify(parent))
 762     if author: l.append(b'author %s %s' % (author, adate_str))
 763     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 764     l.append(b'')
 765     l.append(msg)
 766     return b'\n'.join(l)
 767
 768 def _make_objcache():
 769     return PackIdxList(repo(b'objects/pack'))
 770
 771 # bup-gc assumes that it can disable all PackWriter activities
 772 # (bloom/midx/cache) via the constructor and close() arguments.
 773
class PackWriter(object):
    """Writes Git objects inside a pack file.

    Objects are appended to a temporary pack file (created lazily by
    _open()) while a PackIdxV2Writer collects the matching index
    entries.  _end() finalizes both and renames them into
    objects/pack.  Whenever max_pack_size bytes or max_pack_objects
    objects have been written, the current pack is finished via
    breakpoint() and the next write starts a new one.

    Usable as a context manager: __exit__() calls close().  __del__()
    asserts that close() or abort() has been called.
    """
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        """Record the configuration; nothing is opened until the first write.

        objcache_maker -- callable creating the object cache on demand
            (see _require_objcache()); may be falsy.
        compression_level -- passed through to _encode_packobj().
        run_midx -- value breakpoint() passes to _end() as run_midx.
        on_pack_finish -- if truthy, called with the finished pack's
            path prefix at the end of _end().
        max_pack_size -- byte threshold for starting a new pack; when
            falsy, the pack.packSizeLimit config value is used, and
            failing that 1000 * 1000 * 1000.
        max_pack_objects -- object count threshold for starting a new
            pack; when falsy, max(1, max_pack_size // 5000).
        repo_dir -- repository path; when falsy, repo() is used.
        """
        self.closed = False
        self.repo_dir = repo_dir or repo()
        # State of the pack currently being written; all are reset to
        # None by _end().
        self.file = None
        self.parentfd = None
        # Objects and bytes written since the last breakpoint().
        self.count = 0
        self.outbytes = 0
        self.tmpdir = None
        self.idx = None
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level
        self.run_midx=run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir,
                                           opttype='int')
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)

    def __enter__(self):
        """Return the writer itself."""
        return self

    def __exit__(self, type, value, traceback):
        """Close the writer, even when leaving the block via an exception."""
        # NOTE(review): pending_raise comes from bup.compat; presumably
        # it keeps `value` as the exception that propagates if close()
        # also fails -- confirm against its definition.
        with pending_raise(value, rethrow=False):
            self.close()

    def _open(self):
        """Create the temporary pack file and index writer if not yet open."""
        if not self.file:
            # Anything entered on err_stack is cleaned up if a later
            # step in this block raises.
            with ExitStack() as err_stack:
                objdir = dir = os.path.join(self.repo_dir, b'objects')
                self.tmpdir = err_stack.enter_context(temp_dir(dir=objdir, prefix=b'pack-tmp-'))
                self.file = err_stack.enter_context(open(self.tmpdir + b'/pack', 'w+b'))
                # Descriptor for the objects directory; _end() fsyncs it
                # after the renames.
                self.parentfd = err_stack.enter_context(finalized(os.open(objdir, os.O_RDONLY),
                                                                  lambda x: os.close(x)))
                # 12 header bytes; _end() overwrites bytes 8-11 with the
                # real object count.
                self.file.write(b'PACK\0\0\0\2\0\0\0\0')
                self.idx = PackIdxV2Writer()
                # Success: drop the cleanups so everything stays open
                # for _end() to deal with.
                err_stack.pop_all()

    def _raw_write(self, datalist, sha):
        """Append the joined datalist to the pack and index it under sha.

        Returns (bytes_written, crc32_of_those_bytes).  An IOError
        from the write is re-raised as a GitError.
        """
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            reraise(GitError(e))
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        """Add an index entry for the size bytes just written to the pack."""
        assert(sha)
        if self.idx:
            # Called right after the write, so tell() - size is the
            # offset at which the object starts.
            self.idx.add(sha, crc, self.file.tell() - size)

    def _write(self, sha, type, content):
        """Encode and append one object, then return sha.

        Finishes the current pack via breakpoint() once either size
        limit has been reached.
        """
        if verbose:
            log('>')
        assert sha
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
            self.breakpoint()
        return sha

    def _require_objcache(self):
        """Create the object cache if needed; raise GitError if unavailable."""
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def just_write(self, sha, type, content):
        """Write an object to the pack file without checking for duplication."""
        self._write(sha, type, content)
        # If nothing else, gc doesn't have/want an objcache
        if self.objcache is not None:
            self.objcache.add(sha)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write(b'blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write(b'tree', content)

    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
                   msg):
        """Create a commit object in the pack.  The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        content = create_commit_blob(tree, parent,
                                     author, adate_sec, adate_tz,
                                     committer, cdate_sec, cdate_tz,
                                     msg)
        return self.maybe_write(b'commit', content)

    def _end(self, run_midx=True, abort=False):
        """Finish, or with abort=True discard, the pack being written.

        Returns the new pack's path prefix (the path without the
        .pack/.idx suffix), or None when aborting or when no pack file
        was open.  In every case the temporary directory is removed
        and self.objcache is reset to None.
        """
        # Ignores run_midx during abort
        # Detach the per-pack state first, so the writer is already
        # "not open" whatever happens below.
        self.tmpdir, tmpdir = None, self.tmpdir
        self.parentfd, pfd, = None, self.parentfd
        self.file, f = None, self.file
        self.idx, idx = None, self.idx
        try:
            # NOTE(review): nullcontext_if_not and finalized are bup
            # helpers; presumably they close objcache/f and run the
            # lambda on pfd when the block exits -- confirm.
            with nullcontext_if_not(self.objcache), \
                 finalized(pfd, lambda x: x is not None and os.close(x)), \
                 nullcontext_if_not(f):
                if abort or not f:
                    return None

                # update object count
                f.seek(8)
                cp = struct.pack('!i', self.count)
                assert len(cp) == 4
                f.write(cp)

                # calculate the pack sha1sum
                f.seek(0)
                sum = Sha1()
                for b in chunkyreader(f):
                    sum.update(b)
                packbin = sum.digest()
                # The read loop left the position at EOF, so this
                # appends the checksum as the pack trailer.
                f.write(packbin)
                f.flush()
                fdatasync(f.fileno())
                f.close()

                idx.write(tmpdir + b'/idx', packbin)
                # The pack is named after its own checksum.
                nameprefix = os.path.join(self.repo_dir,
                                          b'objects/pack/pack-' +  hexlify(packbin))
                os.rename(tmpdir + b'/pack', nameprefix + b'.pack')
                os.rename(tmpdir + b'/idx', nameprefix + b'.idx')
                os.fsync(pfd)
                if run_midx:
                    auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
                if self.on_pack_finish:
                    self.on_pack_finish(nameprefix)
                return nameprefix
        finally:
            if tmpdir:
                rmtree(tmpdir)
            # Must be last -- some of the code above depends on it
            self.objcache = None

    def abort(self):
        """Remove the pack file from disk."""
        self.closed = True
        self._end(abort=True)

    def breakpoint(self):
        """Finish the current pack and clear the byte and object counts.

        Returns the result of _end(): the finished pack's path prefix,
        or None if no pack file was open.
        """
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        return id

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        self.closed = True
        return self._end(run_midx=run_midx)

    def __del__(self):
        # Catch writers that were dropped without close() or abort().
        assert self.closed
 973
 974
 975 class PackIdxV2Writer:
 976     def __init__(self):
 977         self.idx = list(list() for i in range(256))
 978         self.count = 0
 979
 980     def add(self, sha, crc, offs):
 981         assert(sha)
 982         self.count += 1
 983         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 984
 985     def write(self, filename, packbin):
 986         ofs64_count = 0
 987         for section in self.idx:
 988             for entry in section:
 989                 if entry[2] >= 2**31:
 990                     ofs64_count += 1
 991
 992         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 993         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 994         idx_map = None
 995         idx_f = open(filename, 'w+b')
 996         try:
 997             idx_f.truncate(index_len)
 998             fdatasync(idx_f.fileno())
 999             idx_map = mmap_readwrite(idx_f, close=False)
1000             try:
1001                 count = _helpers.write_idx(filename, idx_map, self.idx,
1002                                            self.count)
1003                 assert(count == self.count)
1004                 idx_map.flush()
1005             finally:
1006                 idx_map.close()
1007         finally:
1008             idx_f.close()
1009
1010         idx_f = open(filename, 'a+b')
1011         try:
1012             idx_f.write(packbin)
1013             idx_f.seek(0)
1014             idx_sum = Sha1()
1015             b = idx_f.read(8 + 4*256)
1016             idx_sum.update(b)
1017
1018             for b in chunkyreader(idx_f, 20 * self.count):
1019                 idx_sum.update(b)
1020
1021             for b in chunkyreader(idx_f):
1022                 idx_sum.update(b)
1023             idx_f.write(idx_sum.digest())
1024             fdatasync(idx_f.fileno())
1025         finally:
1026             idx_f.close()
1027
1028
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for the repository's refs, as
    reported by git show-ref.  The hash is yielded in binary
    (unhexlified) form.  When patterns are given, only refs matching
    them are included (cf. git-show-ref(1)).  limit_to_heads and
    limit_to_tags restrict the results to refs/heads and refs/tags
    respectively; with both set, refs from both are included.

    """
    cmd = [b'git', b'show-ref']
    if limit_to_heads:
        cmd.append(b'--heads')
    if limit_to_tags:
        cmd.append(b'--tags')
    cmd.append(b'--')
    if patterns:
        cmd.extend(patterns)
    proc = subprocess.Popen(cmd, env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE, close_fds=True)
    output = proc.stdout.read().strip()
    status = proc.wait()  # not fatal
    if status:
        # A failing show-ref must not have produced any refs.
        assert(not output)
    if not output:
        return
    for line in output.split(b'\n'):
        oidx, refname = line.split(b' ', 1)
        yield refname, unhexlify(oidx)
1056
1057
def read_ref(refname, repo_dir = None):
    """Return the binary hash that refname points to, or None.

    The lookup goes through list_refs() with limit_to_heads=True, and
    at most one matching ref is expected.
    """
    found = list_refs(patterns=[refname], repo_dir=repo_dir,
                      limit_to_heads=True)
    # Take up to two results so an ambiguous match can be detected.
    matches = tuple(islice(found, 2))
    if not matches:
        return None
    assert(len(matches) == 1)
    _, oid = matches[0]
    return oid
1067
1068
def rev_list_invocation(ref_or_refs, format=None):
    """Return the argv for a git rev-list over ref_or_refs.

    ref_or_refs is either a single ref (bytes) or an iterable of
    refs.  When format is given it is passed as --pretty=format:.
    Refs starting with b'-' are rejected by assertion so that they
    cannot be taken for options.
    """
    refs = (ref_or_refs,) if isinstance(ref_or_refs, bytes) else ref_or_refs
    cmd = [b'git', b'rev-list']
    if format:
        cmd.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        cmd.append(ref)
    cmd.append(b'--')
    return cmd
1083
1084
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".  Without
    a format, yield one hex hash at a time.  With a format, pass it
    to rev-list and, for each commit, call parse(git_stdout) with the
    stream positioned just after the rev-list "commit HASH" header
    line, yielding (oidx, parse(git_stdout)).  parse and format must
    be given together.  Raise GitError if rev-list exits non-zero.

    """
    assert bool(parse) == bool(format)
    argv = rev_list_invocation(ref_or_refs, format=format)
    proc = subprocess.Popen(argv,
                            env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE,
                            close_fds=True)
    out = proc.stdout
    if format:
        # readline() returns b'' at EOF, which ends the iteration.
        for header in iter(out.readline, b''):
            stripped = header.strip()
            if not stripped.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(stripped))
            oidx = stripped[7:]
            assert len(oidx) == 40
            yield oidx, parse(out)
    else:
        for line in out:
            yield line.strip()

    status = proc.wait()  # not fatal
    if status:
        raise GitError('git rev-list returned error %d' % status)
1117
1118
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    'committish' is first looked up as a ref via read_ref(); failing
    that, a 40 character value is treated as a hex object id and
    looked up in the repository's pack indexes.

    Returns the binary (unhexlified) hash if it is found, None if
    'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Not valid hex.  Python 3 reports this as binascii.Error,
            # a ValueError subclass; TypeError is kept for Python 2.
            return None

        with PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) as pL:
            if pL.exists(oid):
                return oid

    return None
1143
1144
def update_ref(refname, newval, oldval, repo_dir=None, force=False):
    """Point refname at newval via git update-ref.

    With force=True no previous value is passed to git, and oldval
    must be None.  With force=False, oldval is either the binary
    sha1 the ref must currently have, or falsy for an entirely new
    branch (an empty old value is passed to git).  refname must be
    under refs/heads/ or refs/tags/.
    """
    if force:
        assert oldval is None
        expected = []
    elif oldval:
        expected = [hexlify(oldval)]
    else:
        expected = [b'']
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    argv = [b'git', b'update-ref', refname, hexlify(newval)]
    argv.extend(expected)
    proc = subprocess.Popen(argv, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', proc)
1166
1167
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1)).

    When oldvalue is given it is passed to git update-ref -d as the
    value the ref is expected to have.
    """
    assert refname.startswith(b'refs/')
    argv = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', proc)
1176
1177
def guess_repo():
    """Return the repository path to use: the global repodir if set,
    otherwise BUP_DIR from the environment if set, otherwise ~/.bup.
    Usually, if you are interacting with a bup repository, you would
    not be calling this function but using check_repo_or_die().

    """
    if repodir:
        return repodir
    return environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1190
1191
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Sets the global repodir to path (or guess_repo() when path is
    falsy).  Raises GitError if the parent directory does not exist
    or the target exists but is not a directory.
    """
    global repodir
    repodir = path or guess_repo()
    top = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(top))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(top) and not os.path.isdir(os.path.join(top, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(top))
    steps = (
        ('git init', [b'git', b'--bare', b'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        ('git config', [b'git', b'config', b'pack.indexVersion', '2']),
        # Enable the reflog
        ('git config', [b'git', b'config', b'core.logAllRefUpdates', b'true']),
    )
    for label, argv in steps:
        proc = subprocess.Popen(argv, stdout=sys.stderr, env=_gitenv(),
                                close_fds=True)
        _git_wait(label, proc)
1216
1217
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not.

    Sets the global repodir.  Exits with status 15 when the
    repository path does not exist at all, and with status 14 when
    it exists but has no objects/pack directory.
    """
    global repodir
    repodir = path or guess_repo()
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1234
1235
def is_suitable_git(ver_str):
    """Classify "git --version" output (bytes).

    Returns 'unrecognized' when ver_str lacks the "git version "
    prefix, 'insufficient' for versions older than 1.5.6, and
    'suitable' otherwise.  Exits with status 13 when the text after
    the prefix does not start with a digit.
    """
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    version = ver_str[len(prefix):]
    if version.startswith(b'0.'):
        return 'insufficient'
    if version.startswith(b'1.'):
        too_old = (br'1\.[012345]rc',
                   br'1\.[01234]\.',
                   br'1\.5\.[012345]($|\.)',
                   br'1\.5\.6-rc')
        if any(re.match(pattern, version) for pattern in too_old):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', version):
        return 'suitable'
    sys.exit(13)
1255
# None until a check has succeeded (or been waived), then True.
_git_great = None

def require_suitable_git(ver_str=None):
    """Ensure the available git is new enough, checking at most once.

    Raise GitError if the version output isn't recognized, and exit
    with status 1 if the version is older than 1.5.6.  The check is
    skipped when BUP_GIT_VERSION_IS_FINE is yes, true or 1.

    Rely on ver_str when provided, rather than invoking the git in the
    path.

    """
    global _git_great
    if _git_great is not None:
        return
    waiver = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if waiver in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1284
1285
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    The 'git cat-file --batch' process is started on demand by get()
    and reused for later requests.  Only one request may be in
    progress at a time: the generator returned by get() has to be
    exhausted before get() is called again.
    """
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        # p is the cat-file Popen object (None while not running);
        # inprogress is the ref currently being read, if any.
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the cat-file process, if there is one.

        When wait is true and there was a process, wait for it and
        return its exit status.  Return None otherwise, including
        when no process had been started.
        """
        self.p, p = None, self.p
        self.inprogress = None
        if p is None:
            # Never started or already closed: nothing to close or
            # wait for.
            return None
        try:
            p.stdout.close()
        finally:
            # This will handle pending exceptions correctly once
            # we drop py2
            p.stdin.close()
        if wait:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Close any existing cat-file process and start a new one."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Raise GitError if cat-file reaches EOF or answers with an
        unexpected header line.  If an exception occurs while the
        data is being read, the cat-file process is closed.

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # The request is a single line, and must not look like an option.
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if not hdr:
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        try:
            yield oidx, typ, size
            for blob in chunkyreader(self.p.stdout, size):
                yield blob
            # The object data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception as ex:
            with pending_raise(ex):
                self.close()

    def _join(self, it):
        """Yield the blob contents reachable from the object read via it.

        it must behave like the generator returned by get().  Trees
        are descended into recursively, and for a commit its tree is
        followed.  Raise GitError for any other object type.
        """
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of the commit is taken to name its tree.
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1388
1389
# CatPipe instances, keyed by absolute repository path.
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating it on first use.

    When repo_dir is falsy, the global repodir (or else repo()) is
    used instead.
    """
    global _cp, repodir
    if not repo_dir:
        repo_dir = repodir or repo()
    key = os.path.abspath(repo_dir)
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1403
1404
def close_catpipes():
    """Remove every cached CatPipe and close it with wait=True."""
    # FIXME: chain exceptions
    while _cp:
        pipe = _cp.popitem()[1]
        pipe.close(wait=True)
1410
1411
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    The names have the leading refs/tags/ removed.
    """
    prefix = b'refs/tags/'
    by_oid = {}
    for refname, oid in list_refs(repo_dir=repo_dir, limit_to_tags=True):
        assert refname.startswith(prefix)
        # more than one tag can point at the same oid
        by_oid.setdefault(oid, []).append(refname[len(prefix):])
    return by_oid
1422
1423
class MissingObject(KeyError):
    """KeyError for an object id (binary, kept in .oid) that was not found."""
    def __init__(self, oid):
        self.oid = oid
        message = 'object %r is missing' % hexlify(oid)
        super(MissingObject, self).__init__(message)
1428
1429
# Record type for the items yielded by walk_object().
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1442
1443
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    Items are produced depth-first from an explicit stack.  Without
    include_data the data field of every item is None, not just that
    of blobs.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (oidx, path, chunk_path, mode).
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        # The first item is the (oidx, type, size) header; a false
        # oidx means the object does not exist.
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full, since their
            # content is needed below to find the children.
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            # Parents inherit the commit's path, chunk_path and mode.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            # Pushed last, so the commit's tree is walked before its
            # parents.
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the path stays
                    # fixed and only the chunk_path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file.
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))