lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12
  13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  14 from bup.compat import (buffer,
  15                         byte_int, bytes_from_byte, bytes_from_uint,
  16                         environ,
  17                         ExitStack,
  18                         items,
  19                         pending_raise,
  20                         range,
  21                         reraise)
  22 from bup.io import path_msg
  23 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  24                          exo,
  25                          fdatasync,
  26                          finalized,
  27                          log,
  28                          merge_dict,
  29                          merge_iter,
  30                          mmap_read, mmap_readwrite,
  31                          nullcontext_if_not,
  32                          progress, qprogress, stat_if_exists,
  33                          unlink,
  34                          utc_offset_str)
  35
  36
  37 verbose = 0
  38 repodir = None  # The default repository, once initialized
  39
  40 _typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
  41 _typermap = {v: k for k, v in items(_typemap)}
  42
  43
  44 _total_searches = 0
  45 _total_steps = 0
  46
  47
  48 class GitError(Exception):
  49     pass
  50
  51
  52 def _gitenv(repo_dir=None):
  53     if not repo_dir:
  54         repo_dir = repo()
  55     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  56
  57 def _git_wait(cmd, p):
  58     rv = p.wait()
  59     if rv != 0:
  60         raise GitError('%r returned %d' % (cmd, rv))
  61
  62 def _git_exo(cmd, **kwargs):
  63     kwargs['check'] = False
  64     result = exo(cmd, **kwargs)
  65     _, _, proc = result
  66     if proc.returncode != 0:
  67         raise GitError('%r returned %d' % (cmd, proc.returncode))
  68     return result
  69
  70 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  71     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  72     cmd = [b'git', b'config', b'--null']
  73     if cfg_file:
  74         cmd.extend([b'--file', cfg_file])
  75     if opttype == 'int':
  76         cmd.extend([b'--int'])
  77     elif opttype == 'bool':
  78         cmd.extend([b'--bool'])
  79     else:
  80         assert opttype is None
  81     cmd.extend([b'--get', option])
  82     env=None
  83     if repo_dir:
  84         env = _gitenv(repo_dir=repo_dir)
  85     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  86                          close_fds=True)
  87     # with --null, git writes out a trailing \0 after the value
  88     r = p.stdout.read()[:-1]
  89     rc = p.wait()
  90     if rc == 0:
  91         if opttype == 'int':
  92             return int(r)
  93         elif opttype == 'bool':
  94             # git converts to 'true' or 'false'
  95             return r == b'true'
  96         return r
  97     if rc != 1:
  98         raise GitError('%r returned %d' % (cmd, rc))
  99     return None
 100
 101
 102 def parse_tz_offset(s):
 103     """UTC offset in seconds."""
 104     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 105     if bytes_from_byte(s[0]) == b'-':
 106         return - tz_off
 107     return tz_off
 108
 109 def parse_commit_gpgsig(sig):
 110     """Return the original signature bytes.
 111
 112     i.e. with the "gpgsig " header and the leading space character on
 113     each continuation line removed.
 114
 115     """
 116     if not sig:
 117         return None
 118     assert sig.startswith(b'gpgsig ')
 119     sig = sig[7:]
 120     return sig.replace(b'\n ', b'\n')
 121
 122 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
 123 # Make sure that's authoritative.
 124
 125 # See also
 126 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
 127 # The continuation lines have only one leading space.
 128
# Characters permitted as the first or last character of a name/mail
# field: anything except space, . , : ; < > " ' NUL and newline.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# Characters permitted in the interior of a field: anything except NUL,
# newline, < and >.
_content_char = br'[^\0\n<>]'
# A field is either one or two start/end characters, or a start/end
# character, any number of content characters, and a start/end character.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone: sign, two hour digits, then minutes 00-59.
_tz_rx = br'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Whole-commit pattern.  The newlines inside the literal are significant
# (the pattern is not compiled in verbose mode).  Named groups: tree,
# parents, author_name/author_mail/asec/atz,
# committer_name/committer_mail/csec/ctz, mergetag, gpgsig and message.
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts the hash from each parent line within the 'parents' group.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 148
 149 # Note that the author_sec and committer_sec values are (UTC) epoch
 150 # seconds, and for now the mergetag is not included.
 151 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 152                                        'author_name', 'author_mail',
 153                                        'author_sec', 'author_offset',
 154                                        'committer_name', 'committer_mail',
 155                                        'committer_sec', 'committer_offset',
 156                                        'gpgsig',
 157                                        'message'])
 158
 159 def parse_commit(content):
 160     commit_match = re.match(_commit_rx, content)
 161     if not commit_match:
 162         raise Exception('cannot parse commit %r' % content)
 163     matches = commit_match.groupdict()
 164     return CommitInfo(tree=matches['tree'],
 165                       parents=re.findall(_parent_hash_rx, matches['parents']),
 166                       author_name=matches['author_name'],
 167                       author_mail=matches['author_mail'],
 168                       author_sec=int(matches['asec']),
 169                       author_offset=parse_tz_offset(matches['atz']),
 170                       committer_name=matches['committer_name'],
 171                       committer_mail=matches['committer_mail'],
 172                       committer_sec=int(matches['csec']),
 173                       committer_offset=parse_tz_offset(matches['ctz']),
 174                       gpgsig=parse_commit_gpgsig(matches['gpgsig']),
 175                       message=matches['message'])
 176
 177
 178 def get_cat_data(cat_iterator, expected_type):
 179     _, kind, _ = next(cat_iterator)
 180     if kind != expected_type:
 181         raise Exception('expected %r, saw %r' % (expected_type, kind))
 182     return b''.join(cat_iterator)
 183
 184 def get_commit_items(id, cp):
 185     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 186
 187 def _local_git_date_str(epoch_sec):
 188     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 189
 190
 191 def _git_date_str(epoch_sec, tz_offset_sec):
 192     offs =  tz_offset_sec // 60
 193     return b'%d %s%02d%02d' \
 194         % (epoch_sec,
 195            b'+' if offs >= 0 else b'-',
 196            abs(offs) // 60,
 197            abs(offs) % 60)
 198
 199
 200 def repo(sub = b'', repo_dir=None):
 201     """Get the path to the git repository or one of its subdirectories."""
 202     repo_dir = repo_dir or repodir
 203     if not repo_dir:
 204         raise GitError('You should call check_repo_or_die()')
 205
 206     # If there's a .git subdirectory, then the actual repo is in there.
 207     gd = os.path.join(repo_dir, b'.git')
 208     if os.path.exists(gd):
 209         repo_dir = gd
 210
 211     return os.path.join(repo_dir, sub)
 212
 213
 214 _shorten_hash_rx = \
 215     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 216
 217 def shorten_hash(s):
 218     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 219
 220
 221 def repo_rel(path):
 222     full = os.path.abspath(path)
 223     fullrepo = os.path.abspath(repo(b''))
 224     if not fullrepo.endswith(b'/'):
 225         fullrepo += b'/'
 226     if full.startswith(fullrepo):
 227         path = full[len(fullrepo):]
 228     if path.startswith(b'index-cache/'):
 229         path = path[len(b'index-cache/'):]
 230     return shorten_hash(path)
 231
 232
 233 def auto_midx(objdir):
 234     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 235     try:
 236         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 237     except OSError as e:
 238         # make sure 'args' gets printed to help with debugging
 239         add_error('%r: exception: %s' % (args, e))
 240         raise
 241     if rv:
 242         add_error('%r: returned %d' % (args, rv))
 243
 244     args = [path.exe(), b'bloom', b'--dir', objdir]
 245     try:
 246         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 247     except OSError as e:
 248         # make sure 'args' gets printed to help with debugging
 249         add_error('%r: exception: %s' % (args, e))
 250         raise
 251     if rv:
 252         add_error('%r: returned %d' % (args, rv))
 253
 254
 255 def mangle_name(name, mode, gitmode):
 256     """Mangle a file name to present an abstract name for segmented files.
 257     Mangled file names will have the ".bup" extension added to them. If a
 258     file's name already ends with ".bup", a ".bupl" extension is added to
 259     disambiguate normal files from segmented ones.
 260     """
 261     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 262         assert(stat.S_ISDIR(gitmode))
 263         return name + b'.bup'
 264     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 265         return name + b'.bupl'
 266     else:
 267         return name
 268
 269
 270 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 271 def demangle_name(name, mode):
 272     """Remove name mangling from a file name, if necessary.
 273
 274     The return value is a tuple (demangled_filename,mode), where mode is one of
 275     the following:
 276
 277     * BUP_NORMAL  : files that should be read as-is from the repository
 278     * BUP_CHUNKED : files that were chunked and need to be reassembled
 279
 280     For more information on the name mangling algorithm, see mangle_name()
 281     """
 282     if name.endswith(b'.bupl'):
 283         return (name[:-5], BUP_NORMAL)
 284     elif name.endswith(b'.bup'):
 285         return (name[:-4], BUP_CHUNKED)
 286     elif name.endswith(b'.bupm'):
 287         return (name[:-5],
 288                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 289     return (name, BUP_NORMAL)
 290
 291
 292 def calc_hash(type, content):
 293     """Calculate some content's hash in the Git fashion."""
 294     header = b'%s %d\0' % (type, len(content))
 295     sum = Sha1(header)
 296     sum.update(content)
 297     return sum.digest()
 298
 299
 300 def shalist_item_sort_key(ent):
 301     (mode, name, id) = ent
 302     assert(mode+0 == mode)
 303     if stat.S_ISDIR(mode):
 304         return name + b'/'
 305     else:
 306         return name
 307
 308
 309 def tree_encode(shalist):
 310     """Generate a git tree object from (mode,name,hash) tuples."""
 311     shalist = sorted(shalist, key = shalist_item_sort_key)
 312     l = []
 313     for (mode,name,bin) in shalist:
 314         assert(mode)
 315         assert(mode+0 == mode)
 316         assert(name)
 317         assert(len(bin) == 20)
 318         s = b'%o %s\0%s' % (mode,name,bin)
 319         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 320         l.append(s)
 321     return b''.join(l)
 322
 323
 324 def tree_decode(buf):
 325     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 326     ofs = 0
 327     while ofs < len(buf):
 328         z = buf.find(b'\0', ofs)
 329         assert(z > ofs)
 330         spl = buf[ofs:z].split(b' ', 1)
 331         assert(len(spl) == 2)
 332         mode,name = spl
 333         sha = buf[z+1:z+1+20]
 334         ofs = z+1+20
 335         yield (int(mode, 8), name, sha)
 336
 337
 338 def _encode_packobj(type, content, compression_level=1):
 339     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 340         raise ValueError('invalid compression level %s' % compression_level)
 341     szout = b''
 342     sz = len(content)
 343     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 344     sz >>= 4
 345     while 1:
 346         if sz: szbits |= 0x80
 347         szout += bytes_from_uint(szbits)
 348         if not sz:
 349             break
 350         szbits = sz & 0x7f
 351         sz >>= 7
 352     z = zlib.compressobj(compression_level)
 353     yield szout
 354     yield z.compress(content)
 355     yield z.flush()
 356
 357
 358 def _decode_packobj(buf):
 359     assert(buf)
 360     c = byte_int(buf[0])
 361     type = _typermap[(c & 0x70) >> 4]
 362     sz = c & 0x0f
 363     shift = 4
 364     i = 0
 365     while c & 0x80:
 366         i += 1
 367         c = byte_int(buf[i])
 368         sz |= (c & 0x7f) << shift
 369         shift += 7
 370         if not (c & 0x80):
 371             break
 372     return (type, zlib.decompress(buf[i+1:]))
 373
 374
 375 class PackIdx:
 376     def __init__(self):
 377         assert(0)
 378
 379     def find_offset(self, hash):
 380         """Get the offset of an object inside the index file."""
 381         idx = self._idx_from_hash(hash)
 382         if idx != None:
 383             return self._ofs_from_idx(idx)
 384         return None
 385
 386     def exists(self, hash, want_source=False):
 387         """Return nonempty if the object exists in this index."""
 388         if hash and (self._idx_from_hash(hash) != None):
 389             return want_source and os.path.basename(self.name) or True
 390         return None
 391
 392     def _idx_from_hash(self, hash):
 393         global _total_searches, _total_steps
 394         _total_searches += 1
 395         assert(len(hash) == 20)
 396         b1 = byte_int(hash[0])
 397         start = self.fanout[b1-1] # range -1..254
 398         end = self.fanout[b1] # range 0..255
 399         want = hash
 400         _total_steps += 1  # lookup table is a step
 401         while start < end:
 402             _total_steps += 1
 403             mid = start + (end - start) // 2
 404             v = self._idx_to_hash(mid)
 405             if v < want:
 406                 start = mid+1
 407             elif v > want:
 408                 end = mid
 409             else: # got it!
 410                 return mid
 411         return None
 412
 413
 414 class PackIdxV1(PackIdx):
 415     """Object representation of a Git pack index (version 1) file."""
 416     def __init__(self, filename, f):
 417         self.name = filename
 418         self.idxnames = [self.name]
 419         self.map = mmap_read(f)
 420         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 421         self.fanout = array('L', struct.unpack('!256I', self.map))
 422         self.fanout.append(0)  # entry "-1"
 423         self.nsha = self.fanout[255]
 424         self.sha_ofs = 256 * 4
 425         # Avoid slicing shatable for individual hashes (very high overhead)
 426         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 427
 428     def __enter__(self):
 429         return self
 430
 431     def __exit__(self, type, value, traceback):
 432         with pending_raise(value, rethrow=False):
 433             self.close()
 434
 435     def __len__(self):
 436         return int(self.nsha)  # int() from long for python 2
 437
 438     def _ofs_from_idx(self, idx):
 439         if idx >= self.nsha or idx < 0:
 440             raise IndexError('invalid pack index index %d' % idx)
 441         ofs = self.sha_ofs + idx * 24
 442         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 443
 444     def _idx_to_hash(self, idx):
 445         if idx >= self.nsha or idx < 0:
 446             raise IndexError('invalid pack index index %d' % idx)
 447         ofs = self.sha_ofs + idx * 24 + 4
 448         return self.map[ofs : ofs + 20]
 449
 450     def __iter__(self):
 451         start = self.sha_ofs + 4
 452         for ofs in range(start, start + 24 * self.nsha, 24):
 453             yield self.map[ofs : ofs + 20]
 454
 455     def close(self):
 456         if self.map is not None:
 457             self.shatable = None
 458             self.map.close()
 459             self.map = None
 460
 461
 462 class PackIdxV2(PackIdx):
 463     """Object representation of a Git pack index (version 2) file."""
 464     def __init__(self, filename, f):
 465         self.name = filename
 466         self.idxnames = [self.name]
 467         self.map = mmap_read(f)
 468         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 469         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 470         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 471         self.fanout.append(0)
 472         self.nsha = self.fanout[255]
 473         self.sha_ofs = 8 + 256*4
 474         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 475         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 476         # Avoid slicing this for individual hashes (very high overhead)
 477         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 478
 479     def __enter__(self):
 480         return self
 481
 482     def __exit__(self, type, value, traceback):
 483         with pending_raise(value, rethrow=False):
 484             self.close()
 485
 486     def __len__(self):
 487         return int(self.nsha)  # int() from long for python 2
 488
 489     def _ofs_from_idx(self, idx):
 490         if idx >= self.nsha or idx < 0:
 491             raise IndexError('invalid pack index index %d' % idx)
 492         ofs_ofs = self.ofstable_ofs + idx * 4
 493         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 494         if ofs & 0x80000000:
 495             idx64 = ofs & 0x7fffffff
 496             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 497             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 498         return ofs
 499
 500     def _idx_to_hash(self, idx):
 501         if idx >= self.nsha or idx < 0:
 502             raise IndexError('invalid pack index index %d' % idx)
 503         ofs = self.sha_ofs + idx * 20
 504         return self.map[ofs : ofs + 20]
 505
 506     def __iter__(self):
 507         start = self.sha_ofs
 508         for ofs in range(start, start + 20 * self.nsha, 20):
 509             yield self.map[ofs : ofs + 20]
 510
 511     def close(self):
 512         if self.map is not None:
 513             self.shatable = None
 514             self.map.close()
 515             self.map = None
 516
 517
# Number of PackIdxList instances currently open.  The constructor
# asserts that this is zero, so only one may be open at a time.
_mpi_count = 0
class PackIdxList:
    """Searchable collection of the pack indexes found in one directory.

    Holds the *.idx and *.midx index objects for 'dir' (see refresh()),
    an optional bloom filter (bup.bloom) consulted before the indexes,
    and a set of extra hashes registered via add().
    """
    def __init__(self, dir, ignore_midx=False):
        """Open the index list for 'dir' and load it with refresh().

        ignore_midx makes every refresh() behave as if skip_midx were
        True.
        """
        global _mpi_count
        # Q: was this also intended to prevent opening multiple repos?
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.open = True
        self.dir = dir
        self.also = set()  # hashes registered via add()
        self.packs = []  # index objects, searched in this order
        self.do_bloom = False  # whether exists() consults the bloom first
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def close(self):
        """Close all indexes and the bloom filter; a second call is a no-op."""
        global _mpi_count
        if not self.open:
            assert _mpi_count == 0
            return
        _mpi_count -= 1
        assert _mpi_count == 0
        self.also = None
        # Detach the resources from self before closing them.
        self.bloom, bloom = None, self.bloom
        self.packs, packs = None, self.packs
        self.open = False
        with ExitStack() as stack:
            # Each pack is entered as a context manager, so its __exit__
            # runs when the stack unwinds at the end of this block.
            for pack in packs:
                stack.enter_context(pack)
            if bloom:
                bloom.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
            self.close()

    def __del__(self):
        # The list must have been closed explicitly before it is dropped.
        assert not self.open

    def __iter__(self):
        """Iterate over the merged contents of all indexes (see idxmerge())."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Bloom hit: stop consulting the bloom until a search of
                # the packs below comes up empty again.
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # Not found in any pack: consult the bloom on the next lookup.
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        # d maps index file path -> the index object that covers it.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []  # newly opened midx objects that are not broken
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                # Record every idx named by a midx we still hold as
                # covered by that midx.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open midx files not yet known; one that names a missing
                # idx file is treated as broken and deleted.
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the largest midx first, newest first among
                # equals.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            # Open each *.idx file that is not already covered; one that
            # fails with GitError is reported and skipped.
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            # Several paths may map to the same midx object; set() keeps
            # each index object once.  Largest indexes are searched first.
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            try:
                # Use the bloom only if it is valid and holds at least as
                # many entries as the indexes; otherwise close and drop it.
                if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                    self.do_bloom = True
                else:
                    if self.bloom:
                        self.bloom, bloom_tmp = None, self.bloom
                        bloom_tmp.close()
            except BaseException as ex:
                with pending_raise(ex):
                    if self.bloom:
                        self.bloom.close()

        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 693
 694
 695 def open_idx(filename):
 696     if filename.endswith(b'.idx'):
 697         f = open(filename, 'rb')
 698         header = f.read(8)
 699         if header[0:4] == b'\377tOc':
 700             version = struct.unpack('!I', header[4:8])[0]
 701             if version == 2:
 702                 return PackIdxV2(filename, f)
 703             else:
 704                 raise GitError('%s: expected idx file version 2, got %d'
 705                                % (path_msg(filename), version))
 706         elif len(header) == 8 and header[0:4] < b'\377tOc':
 707             return PackIdxV1(filename, f)
 708         else:
 709             raise GitError('%s: unrecognized idx file header'
 710                            % path_msg(filename))
 711     elif filename.endswith(b'.midx'):
 712         return midx.PackMidx(filename)
 713     else:
 714         raise GitError('idx filenames must end with .idx or .midx')
 715
 716
 717 def idxmerge(idxlist, final_progress=True):
 718     """Generate a list of all the objects reachable in a PackIdxList."""
 719     def pfunc(count, total):
 720         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 721                   % (count*100.0/total, count, total))
 722     def pfinal(count, total):
 723         if final_progress:
 724             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 725                      % (100, total, total))
 726     return merge_iter(idxlist, 10024, pfunc, pfinal)
 727
 728
 729 def create_commit_blob(tree, parent,
 730                        author, adate_sec, adate_tz,
 731                        committer, cdate_sec, cdate_tz,
 732                        msg):
 733     if adate_tz is not None:
 734         adate_str = _git_date_str(adate_sec, adate_tz)
 735     else:
 736         adate_str = _local_git_date_str(adate_sec)
 737     if cdate_tz is not None:
 738         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 739     else:
 740         cdate_str = _local_git_date_str(cdate_sec)
 741     l = []
 742     if tree: l.append(b'tree %s' % hexlify(tree))
 743     if parent: l.append(b'parent %s' % hexlify(parent))
 744     if author: l.append(b'author %s %s' % (author, adate_str))
 745     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 746     l.append(b'')
 747     l.append(msg)
 748     return b'\n'.join(l)
 749
 750 def _make_objcache():
 751     return PackIdxList(repo(b'objects/pack'))
 752
 753 # bup-gc assumes that it can disable all PackWriter activities
 754 # (bloom/midx/cache) via the constructor and close() arguments.
 755
 756 class PackWriter:
 757     """Writes Git objects inside a pack file."""
 758     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 759                  run_midx=True, on_pack_finish=None,
 760                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 761         self.repo_dir = repo_dir or repo()
 762         self.file = None
 763         self.parentfd = None
 764         self.count = 0
 765         self.outbytes = 0
 766         self.filename = None
 767         self.idx = None
 768         self.objcache_maker = objcache_maker
 769         self.objcache = None
 770         self.compression_level = compression_level
 771         self.run_midx=run_midx
 772         self.on_pack_finish = on_pack_finish
 773         if not max_pack_size:
 774             max_pack_size = git_config_get(b'pack.packSizeLimit',
 775                                            repo_dir=self.repo_dir,
 776                                            opttype='int')
 777             if not max_pack_size:
 778                 # larger packs slow down pruning
 779                 max_pack_size = 1000 * 1000 * 1000
 780         self.max_pack_size = max_pack_size
 781         # cache memory usage is about 83 bytes per object
 782         self.max_pack_objects = max_pack_objects if max_pack_objects \
 783                                 else max(1, self.max_pack_size // 5000)
 784
 785     def __enter__(self):
 786         return self
 787
 788     def __exit__(self, type, value, traceback):
 789         with pending_raise(value, rethrow=False):
 790             self.close()
 791
 792     def _open(self):
 793         if not self.file:
 794             objdir = dir = os.path.join(self.repo_dir, b'objects')
 795             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 796             try:
 797                 self.file = os.fdopen(fd, 'w+b')
 798             except:
 799                 os.close(fd)
 800                 raise
 801             try:
 802                 self.parentfd = os.open(objdir, os.O_RDONLY)
 803             except:
 804                 f = self.file
 805                 self.file = None
 806                 f.close()
 807                 raise
 808             assert name.endswith(b'.pack')
 809             self.filename = name[:-5]
 810             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 811             self.idx = PackIdxV2Writer()
 812
 813     def _raw_write(self, datalist, sha):
 814         self._open()
 815         f = self.file
 816         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 817         # the file never has a *partial* blob.  So let's make sure it's
 818         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 819         # to our hashsplit algorithm.)  f.write() does its own buffering,
 820         # but that's okay because we'll flush it in _end().
 821         oneblob = b''.join(datalist)
 822         try:
 823             f.write(oneblob)
 824         except IOError as e:
 825             reraise(GitError(e))
 826         nw = len(oneblob)
 827         crc = zlib.crc32(oneblob) & 0xffffffff
 828         self._update_idx(sha, crc, nw)
 829         self.outbytes += nw
 830         self.count += 1
 831         return nw, crc
 832
 833     def _update_idx(self, sha, crc, size):
 834         assert(sha)
 835         if self.idx:
 836             self.idx.add(sha, crc, self.file.tell() - size)
 837
 838     def _write(self, sha, type, content):
 839         if verbose:
 840             log('>')
 841         if not sha:
 842             sha = calc_hash(type, content)
 843         size, crc = self._raw_write(_encode_packobj(type, content,
 844                                                     self.compression_level),
 845                                     sha=sha)
 846         if self.outbytes >= self.max_pack_size \
 847            or self.count >= self.max_pack_objects:
 848             self.breakpoint()
 849         return sha
 850
 851     def _require_objcache(self):
 852         if self.objcache is None and self.objcache_maker:
 853             self.objcache = self.objcache_maker()
 854         if self.objcache is None:
 855             raise GitError(
 856                     "PackWriter not opened or can't check exists w/o objcache")
 857
 858     def exists(self, id, want_source=False):
 859         """Return non-empty if an object is found in the object cache."""
 860         self._require_objcache()
 861         return self.objcache.exists(id, want_source=want_source)
 862
 863     def just_write(self, sha, type, content):
 864         """Write an object to the pack file without checking for duplication."""
 865         self._write(sha, type, content)
 866         # If nothing else, gc doesn't have/want an objcache
 867         if self.objcache is not None:
 868             self.objcache.add(sha)
 869
 870     def maybe_write(self, type, content):
 871         """Write an object to the pack file if not present and return its id."""
 872         sha = calc_hash(type, content)
 873         if not self.exists(sha):
 874             self._require_objcache()
 875             self.just_write(sha, type, content)
 876         return sha
 877
 878     def new_blob(self, blob):
 879         """Create a blob object in the pack with the supplied content."""
 880         return self.maybe_write(b'blob', blob)
 881
 882     def new_tree(self, shalist):
 883         """Create a tree object in the pack."""
 884         content = tree_encode(shalist)
 885         return self.maybe_write(b'tree', content)
 886
 887     def new_commit(self, tree, parent,
 888                    author, adate_sec, adate_tz,
 889                    committer, cdate_sec, cdate_tz,
 890                    msg):
 891         """Create a commit object in the pack.  The date_sec values must be
 892         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 893         content = create_commit_blob(tree, parent,
 894                                      author, adate_sec, adate_tz,
 895                                      committer, cdate_sec, cdate_tz,
 896                                      msg)
 897         return self.maybe_write(b'commit', content)
 898
 899     def _end(self, run_midx=True, abort=False):
 900         # Ignores run_midx during abort
 901         if not self.file:
 902             return None
 903         self.file, f = None, self.file
 904         self.idx, idx = None, self.idx
 905         self.parentfd, pfd, = None, self.parentfd
 906
 907         try:
 908             with nullcontext_if_not(self.objcache), \
 909                  finalized(pfd, lambda x: x is not None and os.close(x)), \
 910                  f:
 911
 912                 if abort:
 913                     os.unlink(self.filename + b'.pack')
 914                     return None
 915
 916                 # update object count
 917                 f.seek(8)
 918                 cp = struct.pack('!i', self.count)
 919                 assert len(cp) == 4
 920                 f.write(cp)
 921
 922                 # calculate the pack sha1sum
 923                 f.seek(0)
 924                 sum = Sha1()
 925                 for b in chunkyreader(f):
 926                     sum.update(b)
 927                 packbin = sum.digest()
 928                 f.write(packbin)
 929                 f.flush()
 930                 fdatasync(f.fileno())
 931                 f.close()
 932
 933                 idx.write(self.filename + b'.idx', packbin)
 934                 nameprefix = os.path.join(self.repo_dir,
 935                                           b'objects/pack/pack-' +  hexlify(packbin))
 936                 if os.path.exists(self.filename + b'.map'):
 937                     os.unlink(self.filename + b'.map')
 938                 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
 939                 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
 940                 os.fsync(pfd)
 941                 if run_midx:
 942                     auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
 943                 if self.on_pack_finish:
 944                     self.on_pack_finish(nameprefix)
 945                 return nameprefix
 946         finally:
 947             # Must be last -- some of the code above depends on it
 948             self.objcache = None
 949
 950     def abort(self):
 951         """Remove the pack file from disk."""
 952         self._end(abort=True)
 953
 954     def breakpoint(self):
 955         """Clear byte and object counts and return the last processed id."""
 956         id = self._end(self.run_midx)
 957         self.outbytes = self.count = 0
 958         return id
 959
 960     def close(self, run_midx=True):
 961         """Close the pack file and move it to its definitive path."""
 962         return self._end(run_midx=run_midx)
 963
 964
 965 class PackIdxV2Writer:
 966     def __init__(self):
 967         self.idx = list(list() for i in range(256))
 968         self.count = 0
 969
 970     def add(self, sha, crc, offs):
 971         assert(sha)
 972         self.count += 1
 973         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 974
 975     def write(self, filename, packbin):
 976         ofs64_count = 0
 977         for section in self.idx:
 978             for entry in section:
 979                 if entry[2] >= 2**31:
 980                     ofs64_count += 1
 981
 982         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 983         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 984         idx_map = None
 985         idx_f = open(filename, 'w+b')
 986         try:
 987             idx_f.truncate(index_len)
 988             fdatasync(idx_f.fileno())
 989             idx_map = mmap_readwrite(idx_f, close=False)
 990             try:
 991                 count = _helpers.write_idx(filename, idx_map, self.idx,
 992                                            self.count)
 993                 assert(count == self.count)
 994                 idx_map.flush()
 995             finally:
 996                 idx_map.close()
 997         finally:
 998             idx_f.close()
 999
1000         idx_f = open(filename, 'a+b')
1001         try:
1002             idx_f.write(packbin)
1003             idx_f.seek(0)
1004             idx_sum = Sha1()
1005             b = idx_f.read(8 + 4*256)
1006             idx_sum.update(b)
1007
1008             for b in chunkyreader(idx_f, 20 * self.count):
1009                 idx_sum.update(b)
1010
1011             for b in chunkyreader(idx_f):
1012                 idx_sum.update(b)
1013             idx_f.write(idx_sum.digest())
1014             fdatasync(idx_f.fileno())
1015         finally:
1016             idx_f.close()
1017
1018
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield a (refname, binary hash) pair for each ref reported by
    "git show-ref".  When patterns are given they are passed through
    to show-ref so that only matching refs are reported (cf.
    git-show-ref(1)).  limit_to_heads and limit_to_tags add --heads
    and --tags respectively; both may be given together.

    """
    cmd = [b'git', b'show-ref']
    if limit_to_heads:
        cmd.append(b'--heads')
    if limit_to_tags:
        cmd.append(b'--tags')
    cmd.append(b'--')
    if patterns:
        cmd.extend(patterns)
    proc = subprocess.Popen(cmd, env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE,
                            close_fds=True)
    output = proc.stdout.read().strip()
    status = proc.wait()
    if status:
        # A nonzero exit is tolerated, but only if nothing was printed
        assert(not output)
    if not output:
        return
    for line in output.split(b'\n'):
        oidx, refname = line.split(b' ', 1)
        yield refname, unhexlify(oidx)
1046
1047
def read_ref(refname, repo_dir = None):
    """Return the binary id that the head matching refname points to, or
    None if list_refs() reports no match.  At most one match is
    expected."""
    matches = list_refs(patterns=[refname], repo_dir=repo_dir,
                        limit_to_heads=True)
    # Take up to two so that an ambiguous name trips the assertion
    found = tuple(islice(matches, 2))
    if not found:
        return None
    assert(len(found) == 1)
    return found[0][1]
1057
1058
def rev_list_invocation(ref_or_refs, format=None):
    """Return the argv for a "git rev-list" call covering ref_or_refs,
    which may be a single bytes ref or an iterable of them.  A format,
    if given, is passed via --pretty=format:.  Refs must not start
    with a dash."""
    refs = (ref_or_refs,) if isinstance(ref_or_refs, bytes) else ref_or_refs
    cmd = [b'git', b'rev-list']
    if format:
        cmd.append(b'--pretty=format:' + format)
    for ref in refs:
        # Don't let a ref be mistaken for an option
        assert not ref.startswith(b'-')
        cmd.append(ref)
    return cmd + [b'--']
1073
1074
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield commit information as reported by "git rev-list".

    Without a format, yield each output line stripped (one hex hash
    per commit).  With a format, which requires a parse function too,
    yield (oidx, parse(git_stdout)) per commit, calling parse with the
    stream positioned just after the rev-list "commit HASH" header
    line.  Raise GitError if rev-list exits with a nonzero status.

    """
    assert bool(parse) == bool(format)
    proc = subprocess.Popen(rev_list_invocation(ref_or_refs, format=format),
                            env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE,
                            close_fds=True)
    out = proc.stdout
    if format:
        # readline() returns b'' only at EOF
        for header in iter(out.readline, b''):
            header = header.strip()
            if not header.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(header))
            oidx = header[7:]
            assert len(oidx) == 40
            yield oidx, parse(out)
    else:
        for line in out:
            yield line.strip()

    status = proc.wait()
    if status:
        raise GitError('git rev-list returned error %d' % status)
1107
1108
def rev_parse(committish, repo_dir=None):
    """Resolve 'committish' to an object id, if it exists.

    Should be roughly equivalent to 'git rev-parse', but only handles
    names that read_ref() can resolve and 40-character hex ids that
    are present in the repository's pack indexes.

    Returns the binary (not hex) hash if it is found, None if
    'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Not hex: Python 2 raises TypeError here, Python 3 raises
            # binascii.Error, which is a ValueError.
            return None

        with PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) as pL:
            if pL.exists(oid):
                return oid

    return None
1133
1134
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname at newval via "git update-ref".

    newval and oldval are binary ids; they are passed to git as hex.
    A false oldval is passed as an empty argument.  Only refs under
    refs/heads/ or refs/tags/ are accepted.
    """
    old_hex = hexlify(oldval or b'')
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    cmd = [b'git', b'update-ref', refname, hexlify(newval), old_hex]
    proc = subprocess.Popen(cmd, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', proc)
1146
1147
def delete_ref(refname, oldvalue=None):
    """Delete a ref in the default repository with "git update-ref -d".
    If oldvalue is given it is passed along as the extra update-ref
    argument (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    cmd = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        cmd.append(oldvalue)
    proc = subprocess.Popen(cmd, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', proc)
1156
1157
def guess_repo(path=None):
    """Choose the default repository and store it in the global "repodir".

    A given path always wins.  Otherwise an already chosen repodir is
    kept; failing that, BUP_DIR from the environment is used, and
    finally ~/.bup.  Nothing here checks that the repository exists;
    code that needs one would normally call check_repo_or_die()
    instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1172
1173
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The repository location is chosen by guess_repo(path).  Raise
    GitError if the parent directory does not exist, or if the
    location exists but is not a directory.  After "git init", set
    pack.indexVersion to 2 and enable core.logAllRefUpdates.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    # Two dirname() calls: the first only drops the trailing slash
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    # Enable the reflog
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
1197
1198
def check_repo_or_die(path=None):
    """Exit unless a bup repository probably exists at the chosen path.

    Return normally when <repo>/objects/pack is a directory.  Exit
    with status 15 when the repository path itself does not exist,
    and with status 14 when it exists but doesn't look like a
    repository.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1214
1215
def is_suitable_git(ver_str):
    """Classify "git --version" output (bytes).

    Return 'suitable', 'insufficient' (older than 1.5.6), or
    'unrecognized' when ver_str doesn't start with b'git version '
    followed by something that looks like a version number.
    """
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    ver_str = ver_str[len(prefix):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    # Report this to the caller rather than exiting silently from here
    return 'unrecognized'
1235
# Set to True once the git version has been accepted, so that the
# check only runs once per process.
_git_great = None

def require_suitable_git(ver_str=None):
    """Raise GitError if the git version output can't be recognized,
    and exit with status 1 if the version is too old.

    When ver_str is provided it is checked instead of the output of
    the git found in the path.  The check is skipped entirely once a
    version has been accepted, or when BUP_GIT_VERSION_IS_FINE is set
    to yes, true, or 1 (case-insensitive) in the environment.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1264
1265
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    The "git cat-file --batch" subprocess is started lazily by get()
    and restarted whenever it is found to have exited.
    """
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        # p is the cat-file subprocess (or None); inprogress is the ref
        # currently being read by get() (or None).
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the subprocess, if one is running.

        With wait, also wait for the subprocess and return its exit
        status.  Return None otherwise, including when no subprocess
        had been started.
        """
        self.p, p = None, self.p
        self.inprogress = None
        if not p:
            # Nothing was ever started, so there is nothing to wait for
            return None
        try:
            p.stdout.close()
        finally:
            # This will handle pending exceptions correctly once
            # we drop py2
            p.stdin.close()
        if wait:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Close any current subprocess and start a new cat-file --batch."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Only one get() may be in progress at a time.  Raise GitError
        if cat-file's reply is missing or malformed.  If reading the
        object's data raises an Exception, the subprocess is closed.

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # The request is a single line, and must not look like an option
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if not hdr:
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        try:
            yield oidx, typ, size
            for blob in chunkyreader(self.p.stdout, size):
                yield blob
            # The object's data is followed by a newline
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception as ex:
            with pending_raise(ex):
                self.close()

    def _join(self, it):
        """Yield the blob data reachable from the object that the get()
        style iterator 'it' describes, following trees and commits."""
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of a commit names its tree
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1368
1369
# CatPipe instances, keyed by absolute repository path
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating and remembering it on
    first use.  Without a repo_dir, use the default repository."""
    global _cp, repodir
    key = os.path.abspath(repo_dir or repodir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = CatPipe(key)
        _cp[key] = pipe
    return pipe
1383
1384
def close_catpipes():
    """Close, and wait for, every CatPipe remembered by cp(), leaving
    none remembered."""
    # FIXME: chain exceptions
    while _cp:
        pipe = _cp.popitem()[1]
        pipe.close(wait=True)
1390
1391
def tags(repo_dir = None):
    """Return a dictionary mapping each tagged hash to the list of tag
    names (without the refs/tags/ prefix) pointing at it, i.e.
    {hash: [tag_names, ...]}."""
    by_hash = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        # More than one tag can point at the same hash
        by_hash.setdefault(oid, []).append(refname[10:])
    return by_hash
1402
1403
class MissingObject(KeyError):
    """Report that an object is missing; the binary id is kept in .oid."""
    def __init__(self, oid):
        message = 'object %r is missing' % hexlify(oid)
        KeyError.__init__(self, message)
        self.oid = oid
1408
1409
# One object visited by walk_object(): its binary oid, its type, the
# mode it was reached with (None for the starting object), its path
# and chunk_path as lists of components, and its data (None unless
# the walk was asked to include data).
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
# For anything that is not part of a chunked file, the chunk_path is
# an empty list.
1422
1423
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    Each object is yielded before the things it refers to.  Commits
    lead to their tree and their parents; trees lead to their
    entries.  Objects are not de-duplicated here; anything reachable
    by more than one route is yielded once per route unless stop_at
    prevents it.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (hex oid, path, chunk_path, mode)
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        # The first item is the (oidx, type, size) header; the oidx is
        # None when the object is missing
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read; their content is
            # needed below to find what they refer to
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            # Pushed last, so the commit's tree is popped before its
            # parents
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            # Note: this loop rebinds mode to each entry's mode
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: only the
                    # chunk_path grows
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))