lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12
  13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  14 from bup.compat import (buffer,
  15                         byte_int, bytes_from_byte, bytes_from_uint,
  16                         environ,
  17                         ExitStack,
  18                         items,
  19                         pending_raise,
  20                         range,
  21                         reraise)
  22 from bup.io import path_msg
  23 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  24                          exo,
  25                          fdatasync,
  26                          finalized,
  27                          log,
  28                          merge_dict,
  29                          merge_iter,
  30                          mmap_read, mmap_readwrite,
  31                          nullcontext_if_not,
  32                          progress, qprogress, stat_if_exists,
  33                          unlink,
  34                          utc_offset_str)
  35
  36
  37 verbose = 0
  38 repodir = None  # The default repository, once initialized
  39
  40 _typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
  41 _typermap = {v: k for k, v in items(_typemap)}
  42
  43
  44 _total_searches = 0
  45 _total_steps = 0
  46
  47
  48 class GitError(Exception):
  49     pass
  50
  51
  52 def _gitenv(repo_dir=None):
  53     if not repo_dir:
  54         repo_dir = repo()
  55     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  56
  57 def _git_wait(cmd, p):
  58     rv = p.wait()
  59     if rv != 0:
  60         raise GitError('%r returned %d' % (cmd, rv))
  61
  62 def _git_exo(cmd, **kwargs):
  63     kwargs['check'] = False
  64     result = exo(cmd, **kwargs)
  65     _, _, proc = result
  66     if proc.returncode != 0:
  67         raise GitError('%r returned %d' % (cmd, proc.returncode))
  68     return result
  69
  70 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  71     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  72     cmd = [b'git', b'config', b'--null']
  73     if cfg_file:
  74         cmd.extend([b'--file', cfg_file])
  75     if opttype == 'int':
  76         cmd.extend([b'--int'])
  77     elif opttype == 'bool':
  78         cmd.extend([b'--bool'])
  79     else:
  80         assert opttype is None
  81     cmd.extend([b'--get', option])
  82     env=None
  83     if repo_dir:
  84         env = _gitenv(repo_dir=repo_dir)
  85     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  86                          close_fds=True)
  87     # with --null, git writes out a trailing \0 after the value
  88     r = p.stdout.read()[:-1]
  89     rc = p.wait()
  90     if rc == 0:
  91         if opttype == 'int':
  92             return int(r)
  93         elif opttype == 'bool':
  94             # git converts to 'true' or 'false'
  95             return r == b'true'
  96         return r
  97     if rc != 1:
  98         raise GitError('%r returned %d' % (cmd, rc))
  99     return None
 100
 101
 102 def parse_tz_offset(s):
 103     """UTC offset in seconds."""
 104     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 105     if bytes_from_byte(s[0]) == b'-':
 106         return - tz_off
 107     return tz_off
 108
 109 def parse_commit_gpgsig(sig):
 110     """Return the original signature bytes.
 111
 112     i.e. with the "gpgsig " header and the leading space character on
 113     each continuation line removed.
 114
 115     """
 116     if not sig:
 117         return None
 118     assert sig.startswith(b'gpgsig ')
 119     sig = sig[7:]
 120     return sig.replace(b'\n ', b'\n')
 121
 122 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
 123 # Make sure that's authoritative.
 124
 125 # See also
 126 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
 127 # The continuation lines have only one leading space.
 128
 129 _start_end_char = br'[^ .,:;<>"\'\0\n]'
 130 _content_char = br'[^\0\n<>]'
 131 _safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
 132     % (_start_end_char,
 133        _start_end_char, _content_char, _start_end_char)
 134 _tz_rx = br'[-+]\d\d[0-5]\d'
 135 _parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
 136 # Assumes every following line starting with a space is part of the
 137 # mergetag.  Is there a formal commit blob spec?
 138 _mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
 139 _commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
 140 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
 141 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
 142 (?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
 143 (?P<message>(?:.|\n)*)''' % (_parent_rx,
 144                              _safe_str_rx, _safe_str_rx, _tz_rx,
 145                              _safe_str_rx, _safe_str_rx, _tz_rx,
 146                              _mergetag_rx))
 147 _parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 148
 149 # Note that the author_sec and committer_sec values are (UTC) epoch
 150 # seconds, and for now the mergetag is not included.
 151 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 152                                        'author_name', 'author_mail',
 153                                        'author_sec', 'author_offset',
 154                                        'committer_name', 'committer_mail',
 155                                        'committer_sec', 'committer_offset',
 156                                        'gpgsig',
 157                                        'message'])
 158
 159 def parse_commit(content):
 160     commit_match = re.match(_commit_rx, content)
 161     if not commit_match:
 162         raise Exception('cannot parse commit %r' % content)
 163     matches = commit_match.groupdict()
 164     return CommitInfo(tree=matches['tree'],
 165                       parents=re.findall(_parent_hash_rx, matches['parents']),
 166                       author_name=matches['author_name'],
 167                       author_mail=matches['author_mail'],
 168                       author_sec=int(matches['asec']),
 169                       author_offset=parse_tz_offset(matches['atz']),
 170                       committer_name=matches['committer_name'],
 171                       committer_mail=matches['committer_mail'],
 172                       committer_sec=int(matches['csec']),
 173                       committer_offset=parse_tz_offset(matches['ctz']),
 174                       gpgsig=parse_commit_gpgsig(matches['gpgsig']),
 175                       message=matches['message'])
 176
 177
 178 def get_cat_data(cat_iterator, expected_type):
 179     _, kind, _ = next(cat_iterator)
 180     if kind != expected_type:
 181         raise Exception('expected %r, saw %r' % (expected_type, kind))
 182     return b''.join(cat_iterator)
 183
 184 def get_commit_items(id, cp):
 185     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 186
 187 def _local_git_date_str(epoch_sec):
 188     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 189
 190
 191 def _git_date_str(epoch_sec, tz_offset_sec):
 192     offs =  tz_offset_sec // 60
 193     return b'%d %s%02d%02d' \
 194         % (epoch_sec,
 195            b'+' if offs >= 0 else b'-',
 196            abs(offs) // 60,
 197            abs(offs) % 60)
 198
 199
 200 def repo(sub = b'', repo_dir=None):
 201     """Get the path to the git repository or one of its subdirectories."""
 202     repo_dir = repo_dir or repodir
 203     if not repo_dir:
 204         raise GitError('You should call check_repo_or_die()')
 205
 206     # If there's a .git subdirectory, then the actual repo is in there.
 207     gd = os.path.join(repo_dir, b'.git')
 208     if os.path.exists(gd):
 209         repo_dir = gd
 210
 211     return os.path.join(repo_dir, sub)
 212
 213
 214 _shorten_hash_rx = \
 215     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 216
 217 def shorten_hash(s):
 218     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 219
 220
 221 def repo_rel(path):
 222     full = os.path.abspath(path)
 223     fullrepo = os.path.abspath(repo(b''))
 224     if not fullrepo.endswith(b'/'):
 225         fullrepo += b'/'
 226     if full.startswith(fullrepo):
 227         path = full[len(fullrepo):]
 228     if path.startswith(b'index-cache/'):
 229         path = path[len(b'index-cache/'):]
 230     return shorten_hash(path)
 231
 232
 233 def auto_midx(objdir):
 234     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 235     try:
 236         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 237     except OSError as e:
 238         # make sure 'args' gets printed to help with debugging
 239         add_error('%r: exception: %s' % (args, e))
 240         raise
 241     if rv:
 242         add_error('%r: returned %d' % (args, rv))
 243
 244     args = [path.exe(), b'bloom', b'--dir', objdir]
 245     try:
 246         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 247     except OSError as e:
 248         # make sure 'args' gets printed to help with debugging
 249         add_error('%r: exception: %s' % (args, e))
 250         raise
 251     if rv:
 252         add_error('%r: returned %d' % (args, rv))
 253
 254
 255 def mangle_name(name, mode, gitmode):
 256     """Mangle a file name to present an abstract name for segmented files.
 257     Mangled file names will have the ".bup" extension added to them. If a
 258     file's name already ends with ".bup", a ".bupl" extension is added to
 259     disambiguate normal files from segmented ones.
 260     """
 261     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 262         assert(stat.S_ISDIR(gitmode))
 263         return name + b'.bup'
 264     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 265         return name + b'.bupl'
 266     else:
 267         return name
 268
 269
 270 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 271 def demangle_name(name, mode):
 272     """Remove name mangling from a file name, if necessary.
 273
 274     The return value is a tuple (demangled_filename,mode), where mode is one of
 275     the following:
 276
 277     * BUP_NORMAL  : files that should be read as-is from the repository
 278     * BUP_CHUNKED : files that were chunked and need to be reassembled
 279
 280     For more information on the name mangling algorithm, see mangle_name()
 281     """
 282     if name.endswith(b'.bupl'):
 283         return (name[:-5], BUP_NORMAL)
 284     elif name.endswith(b'.bup'):
 285         return (name[:-4], BUP_CHUNKED)
 286     elif name.endswith(b'.bupm'):
 287         return (name[:-5],
 288                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 289     return (name, BUP_NORMAL)
 290
 291
 292 def calc_hash(type, content):
 293     """Calculate some content's hash in the Git fashion."""
 294     header = b'%s %d\0' % (type, len(content))
 295     sum = Sha1(header)
 296     sum.update(content)
 297     return sum.digest()
 298
 299
 300 def shalist_item_sort_key(ent):
 301     (mode, name, id) = ent
 302     assert(mode+0 == mode)
 303     if stat.S_ISDIR(mode):
 304         return name + b'/'
 305     else:
 306         return name
 307
 308
 309 def tree_encode(shalist):
 310     """Generate a git tree object from (mode,name,hash) tuples."""
 311     shalist = sorted(shalist, key = shalist_item_sort_key)
 312     l = []
 313     for (mode,name,bin) in shalist:
 314         assert(mode)
 315         assert(mode+0 == mode)
 316         assert(name)
 317         assert(len(bin) == 20)
 318         s = b'%o %s\0%s' % (mode,name,bin)
 319         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 320         l.append(s)
 321     return b''.join(l)
 322
 323
 324 def tree_decode(buf):
 325     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 326     ofs = 0
 327     while ofs < len(buf):
 328         z = buf.find(b'\0', ofs)
 329         assert(z > ofs)
 330         spl = buf[ofs:z].split(b' ', 1)
 331         assert(len(spl) == 2)
 332         mode,name = spl
 333         sha = buf[z+1:z+1+20]
 334         ofs = z+1+20
 335         yield (int(mode, 8), name, sha)
 336
 337
 338 def _encode_packobj(type, content, compression_level=1):
 339     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 340         raise ValueError('invalid compression level %s' % compression_level)
 341     szout = b''
 342     sz = len(content)
 343     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 344     sz >>= 4
 345     while 1:
 346         if sz: szbits |= 0x80
 347         szout += bytes_from_uint(szbits)
 348         if not sz:
 349             break
 350         szbits = sz & 0x7f
 351         sz >>= 7
 352     z = zlib.compressobj(compression_level)
 353     yield szout
 354     yield z.compress(content)
 355     yield z.flush()
 356
 357
 358 def _decode_packobj(buf):
 359     assert(buf)
 360     c = byte_int(buf[0])
 361     type = _typermap[(c & 0x70) >> 4]
 362     sz = c & 0x0f
 363     shift = 4
 364     i = 0
 365     while c & 0x80:
 366         i += 1
 367         c = byte_int(buf[i])
 368         sz |= (c & 0x7f) << shift
 369         shift += 7
 370         if not (c & 0x80):
 371             break
 372     return (type, zlib.decompress(buf[i+1:]))
 373
 374
 375 class PackIdx:
 376     def __init__(self):
 377         assert(0)
 378
 379     def find_offset(self, hash):
 380         """Get the offset of an object inside the index file."""
 381         idx = self._idx_from_hash(hash)
 382         if idx != None:
 383             return self._ofs_from_idx(idx)
 384         return None
 385
 386     def exists(self, hash, want_source=False):
 387         """Return nonempty if the object exists in this index."""
 388         if hash and (self._idx_from_hash(hash) != None):
 389             return want_source and os.path.basename(self.name) or True
 390         return None
 391
 392     def _idx_from_hash(self, hash):
 393         global _total_searches, _total_steps
 394         _total_searches += 1
 395         assert(len(hash) == 20)
 396         b1 = byte_int(hash[0])
 397         start = self.fanout[b1-1] # range -1..254
 398         end = self.fanout[b1] # range 0..255
 399         want = hash
 400         _total_steps += 1  # lookup table is a step
 401         while start < end:
 402             _total_steps += 1
 403             mid = start + (end - start) // 2
 404             v = self._idx_to_hash(mid)
 405             if v < want:
 406                 start = mid+1
 407             elif v > want:
 408                 end = mid
 409             else: # got it!
 410                 return mid
 411         return None
 412
 413
 414 class PackIdxV1(PackIdx):
 415     """Object representation of a Git pack index (version 1) file."""
 416     def __init__(self, filename, f):
 417         self.closed = False
 418         self.name = filename
 419         self.idxnames = [self.name]
 420         self.map = mmap_read(f)
 421         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 422         self.fanout = array('L', struct.unpack('!256I', self.map))
 423         self.fanout.append(0)  # entry "-1"
 424         self.nsha = self.fanout[255]
 425         self.sha_ofs = 256 * 4
 426         # Avoid slicing shatable for individual hashes (very high overhead)
 427         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 428
 429     def __enter__(self):
 430         return self
 431
 432     def __exit__(self, type, value, traceback):
 433         with pending_raise(value, rethrow=False):
 434             self.close()
 435
 436     def __len__(self):
 437         return int(self.nsha)  # int() from long for python 2
 438
 439     def _ofs_from_idx(self, idx):
 440         if idx >= self.nsha or idx < 0:
 441             raise IndexError('invalid pack index index %d' % idx)
 442         ofs = self.sha_ofs + idx * 24
 443         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 444
 445     def _idx_to_hash(self, idx):
 446         if idx >= self.nsha or idx < 0:
 447             raise IndexError('invalid pack index index %d' % idx)
 448         ofs = self.sha_ofs + idx * 24 + 4
 449         return self.map[ofs : ofs + 20]
 450
 451     def __iter__(self):
 452         start = self.sha_ofs + 4
 453         for ofs in range(start, start + 24 * self.nsha, 24):
 454             yield self.map[ofs : ofs + 20]
 455
 456     def close(self):
 457         self.closed = True
 458         if self.map is not None:
 459             self.shatable = None
 460             self.map.close()
 461             self.map = None
 462
 463     def __del__(self):
 464         assert self.closed
 465
 466
 467 class PackIdxV2(PackIdx):
 468     """Object representation of a Git pack index (version 2) file."""
 469     def __init__(self, filename, f):
 470         self.closed = False
 471         self.name = filename
 472         self.idxnames = [self.name]
 473         self.map = mmap_read(f)
 474         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 475         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 476         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 477         self.fanout.append(0)
 478         self.nsha = self.fanout[255]
 479         self.sha_ofs = 8 + 256*4
 480         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 481         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 482         # Avoid slicing this for individual hashes (very high overhead)
 483         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 484
 485     def __enter__(self):
 486         return self
 487
 488     def __exit__(self, type, value, traceback):
 489         with pending_raise(value, rethrow=False):
 490             self.close()
 491
 492     def __len__(self):
 493         return int(self.nsha)  # int() from long for python 2
 494
 495     def _ofs_from_idx(self, idx):
 496         if idx >= self.nsha or idx < 0:
 497             raise IndexError('invalid pack index index %d' % idx)
 498         ofs_ofs = self.ofstable_ofs + idx * 4
 499         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 500         if ofs & 0x80000000:
 501             idx64 = ofs & 0x7fffffff
 502             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 503             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 504         return ofs
 505
 506     def _idx_to_hash(self, idx):
 507         if idx >= self.nsha or idx < 0:
 508             raise IndexError('invalid pack index index %d' % idx)
 509         ofs = self.sha_ofs + idx * 20
 510         return self.map[ofs : ofs + 20]
 511
 512     def __iter__(self):
 513         start = self.sha_ofs
 514         for ofs in range(start, start + 20 * self.nsha, 20):
 515             yield self.map[ofs : ofs + 20]
 516
 517     def close(self):
 518         self.closed = True
 519         if self.map is not None:
 520             self.shatable = None
 521             self.map.close()
 522             self.map = None
 523
 524     def __del__(self):
 525         assert self.closed
 526
 527
 528 _mpi_count = 0
 529 class PackIdxList:
 530     def __init__(self, dir, ignore_midx=False):
 531         global _mpi_count
 532         # Q: was this also intended to prevent opening multiple repos?
 533         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 534         _mpi_count += 1
 535         self.open = True
 536         self.dir = dir
 537         self.also = set()
 538         self.packs = []
 539         self.do_bloom = False
 540         self.bloom = None
 541         self.ignore_midx = ignore_midx
 542         try:
 543             self.refresh()
 544         except BaseException as ex:
 545             with pending_raise(ex):
 546                 self.close()
 547
 548     def close(self):
 549         global _mpi_count
 550         if not self.open:
 551             assert _mpi_count == 0
 552             return
 553         _mpi_count -= 1
 554         assert _mpi_count == 0
 555         self.also = None
 556         self.bloom, bloom = None, self.bloom
 557         self.packs, packs = None, self.packs
 558         self.open = False
 559         with ExitStack() as stack:
 560             for pack in packs:
 561                 stack.enter_context(pack)
 562             if bloom:
 563                 bloom.close()
 564
 565     def __enter__(self):
 566         return self
 567
 568     def __exit__(self, type, value, traceback):
 569         with pending_raise(value, rethrow=False):
 570             self.close()
 571
 572     def __del__(self):
 573         assert not self.open
 574
 575     def __iter__(self):
 576         return iter(idxmerge(self.packs))
 577
 578     def __len__(self):
 579         return sum(len(pack) for pack in self.packs)
 580
 581     def exists(self, hash, want_source=False):
 582         """Return nonempty if the object exists in the index files."""
 583         global _total_searches
 584         _total_searches += 1
 585         if hash in self.also:
 586             return True
 587         if self.do_bloom and self.bloom:
 588             if self.bloom.exists(hash):
 589                 self.do_bloom = False
 590             else:
 591                 _total_searches -= 1  # was counted by bloom
 592                 return None
 593         for i in range(len(self.packs)):
 594             p = self.packs[i]
 595             _total_searches -= 1  # will be incremented by sub-pack
 596             ix = p.exists(hash, want_source=want_source)
 597             if ix:
 598                 # reorder so most recently used packs are searched first
 599                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 600                 return ix
 601         self.do_bloom = True
 602         return None
 603
 604     def refresh(self, skip_midx = False):
 605         """Refresh the index list.
 606         This method verifies if .midx files were superseded (e.g. all of its
 607         contents are in another, bigger .midx file) and removes the superseded
 608         files.
 609
 610         If skip_midx is True, all work on .midx files will be skipped and .midx
 611         files will be removed from the list.
 612
 613         The instance variable 'ignore_midx' can force this function to
 614         always act as if skip_midx was True.
 615         """
 616         if self.bloom is not None:
 617             self.bloom.close()
 618         self.bloom = None # Always reopen the bloom as it may have been relaced
 619         self.do_bloom = False
 620         skip_midx = skip_midx or self.ignore_midx
 621         d = dict((p.name, p) for p in self.packs
 622                  if not skip_midx or not isinstance(p, midx.PackMidx))
 623         if os.path.exists(self.dir):
 624             if not skip_midx:
 625                 midxl = []
 626                 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
 627                 # remove any *.midx files from our list that no longer exist
 628                 for ix in list(d.values()):
 629                     if not isinstance(ix, midx.PackMidx):
 630                         continue
 631                     if ix.name in midxes:
 632                         continue
 633                     # remove the midx
 634                     del d[ix.name]
 635                     ix.close()
 636                     self.packs.remove(ix)
 637                 for ix in self.packs:
 638                     if isinstance(ix, midx.PackMidx):
 639                         for name in ix.idxnames:
 640                             d[os.path.join(self.dir, name)] = ix
 641                 for full in midxes:
 642                     if not d.get(full):
 643                         mx = midx.PackMidx(full)
 644                         (mxd, mxf) = os.path.split(mx.name)
 645                         broken = False
 646                         for n in mx.idxnames:
 647                             if not os.path.exists(os.path.join(mxd, n)):
 648                                 log(('warning: index %s missing\n'
 649                                      '  used by %s\n')
 650                                     % (path_msg(n), path_msg(mxf)))
 651                                 broken = True
 652                         if broken:
 653                             mx.close()
 654                             unlink(full)
 655                         else:
 656                             midxl.append(mx)
 657                 midxl.sort(key=lambda ix:
 658                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 659                 for ix in midxl:
 660                     any_needed = False
 661                     for sub in ix.idxnames:
 662                         found = d.get(os.path.join(self.dir, sub))
 663                         if not found or isinstance(found, PackIdx):
 664                             # doesn't exist, or exists but not in a midx
 665                             any_needed = True
 666                             break
 667                     if any_needed:
 668                         d[ix.name] = ix
 669                         for name in ix.idxnames:
 670                             d[os.path.join(self.dir, name)] = ix
 671                     elif not ix.force_keep:
 672                         debug1('midx: removing redundant: %s\n'
 673                                % path_msg(os.path.basename(ix.name)))
 674                         ix.close()
 675                         unlink(ix.name)
 676             for full in glob.glob(os.path.join(self.dir, b'*.idx')):
 677                 if not d.get(full):
 678                     try:
 679                         ix = open_idx(full)
 680                     except GitError as e:
 681                         add_error(e)
 682                         continue
 683                     d[full] = ix
 684             bfull = os.path.join(self.dir, b'bup.bloom')
 685             self.packs = list(set(d.values()))
 686             self.packs.sort(reverse=True, key=lambda x: len(x))
 687             if self.bloom is None and os.path.exists(bfull):
 688                 self.bloom = bloom.ShaBloom(bfull)
 689             try:
 690                 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 691                     self.do_bloom = True
 692                 else:
 693                     if self.bloom:
 694                         self.bloom, bloom_tmp = None, self.bloom
 695                         bloom_tmp.close()
 696             except BaseException as ex:
 697                 with pending_raise(ex):
 698                     if self.bloom:
 699                         self.bloom.close()
 700
 701         debug1('PackIdxList: using %d index%s.\n'
 702             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 703
 704     def add(self, hash):
 705         """Insert an additional object in the list."""
 706         self.also.add(hash)
 707
 708
 709 def open_idx(filename):
 710     if filename.endswith(b'.idx'):
 711         f = open(filename, 'rb')
 712         header = f.read(8)
 713         if header[0:4] == b'\377tOc':
 714             version = struct.unpack('!I', header[4:8])[0]
 715             if version == 2:
 716                 return PackIdxV2(filename, f)
 717             else:
 718                 raise GitError('%s: expected idx file version 2, got %d'
 719                                % (path_msg(filename), version))
 720         elif len(header) == 8 and header[0:4] < b'\377tOc':
 721             return PackIdxV1(filename, f)
 722         else:
 723             raise GitError('%s: unrecognized idx file header'
 724                            % path_msg(filename))
 725     elif filename.endswith(b'.midx'):
 726         return midx.PackMidx(filename)
 727     else:
 728         raise GitError('idx filenames must end with .idx or .midx')
 729
 730
 731 def idxmerge(idxlist, final_progress=True):
 732     """Generate a list of all the objects reachable in a PackIdxList."""
 733     def pfunc(count, total):
 734         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 735                   % (count*100.0/total, count, total))
 736     def pfinal(count, total):
 737         if final_progress:
 738             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 739                      % (100, total, total))
 740     return merge_iter(idxlist, 10024, pfunc, pfinal)
 741
 742
 743 def create_commit_blob(tree, parent,
 744                        author, adate_sec, adate_tz,
 745                        committer, cdate_sec, cdate_tz,
 746                        msg):
 747     if adate_tz is not None:
 748         adate_str = _git_date_str(adate_sec, adate_tz)
 749     else:
 750         adate_str = _local_git_date_str(adate_sec)
 751     if cdate_tz is not None:
 752         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 753     else:
 754         cdate_str = _local_git_date_str(cdate_sec)
 755     l = []
 756     if tree: l.append(b'tree %s' % hexlify(tree))
 757     if parent: l.append(b'parent %s' % hexlify(parent))
 758     if author: l.append(b'author %s %s' % (author, adate_str))
 759     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 760     l.append(b'')
 761     l.append(msg)
 762     return b'\n'.join(l)
 763
 764 def _make_objcache():
 765     return PackIdxList(repo(b'objects/pack'))
 766
 767 # bup-gc assumes that it can disable all PackWriter activities
 768 # (bloom/midx/cache) via the constructor and close() arguments.
 769
 770 class PackWriter:
 771     """Writes Git objects inside a pack file."""
 772     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 773                  run_midx=True, on_pack_finish=None,
 774                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 775         self.closed = False
 776         self.repo_dir = repo_dir or repo()
 777         self.file = None
 778         self.parentfd = None
 779         self.count = 0
 780         self.outbytes = 0
 781         self.filename = None
 782         self.idx = None
 783         self.objcache_maker = objcache_maker
 784         self.objcache = None
 785         self.compression_level = compression_level
 786         self.run_midx=run_midx
 787         self.on_pack_finish = on_pack_finish
 788         if not max_pack_size:
 789             max_pack_size = git_config_get(b'pack.packSizeLimit',
 790                                            repo_dir=self.repo_dir,
 791                                            opttype='int')
 792             if not max_pack_size:
 793                 # larger packs slow down pruning
 794                 max_pack_size = 1000 * 1000 * 1000
 795         self.max_pack_size = max_pack_size
 796         # cache memory usage is about 83 bytes per object
 797         self.max_pack_objects = max_pack_objects if max_pack_objects \
 798                                 else max(1, self.max_pack_size // 5000)
 799
 800     def __enter__(self):
 801         return self
 802
 803     def __exit__(self, type, value, traceback):
 804         with pending_raise(value, rethrow=False):
 805             self.close()
 806
 807     def _open(self):
 808         if not self.file:
 809             objdir = dir = os.path.join(self.repo_dir, b'objects')
 810             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 811             try:
 812                 self.file = os.fdopen(fd, 'w+b')
 813             except:
 814                 os.close(fd)
 815                 raise
 816             try:
 817                 self.parentfd = os.open(objdir, os.O_RDONLY)
 818             except:
 819                 f = self.file
 820                 self.file = None
 821                 f.close()
 822                 raise
 823             assert name.endswith(b'.pack')
 824             self.filename = name[:-5]
 825             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 826             self.idx = PackIdxV2Writer()
 827
 828     def _raw_write(self, datalist, sha):
 829         self._open()
 830         f = self.file
 831         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 832         # the file never has a *partial* blob.  So let's make sure it's
 833         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 834         # to our hashsplit algorithm.)  f.write() does its own buffering,
 835         # but that's okay because we'll flush it in _end().
 836         oneblob = b''.join(datalist)
 837         try:
 838             f.write(oneblob)
 839         except IOError as e:
 840             reraise(GitError(e))
 841         nw = len(oneblob)
 842         crc = zlib.crc32(oneblob) & 0xffffffff
 843         self._update_idx(sha, crc, nw)
 844         self.outbytes += nw
 845         self.count += 1
 846         return nw, crc
 847
 848     def _update_idx(self, sha, crc, size):
 849         assert(sha)
 850         if self.idx:
 851             self.idx.add(sha, crc, self.file.tell() - size)
 852
 853     def _write(self, sha, type, content):
 854         if verbose:
 855             log('>')
 856         if not sha:
 857             sha = calc_hash(type, content)
 858         size, crc = self._raw_write(_encode_packobj(type, content,
 859                                                     self.compression_level),
 860                                     sha=sha)
 861         if self.outbytes >= self.max_pack_size \
 862            or self.count >= self.max_pack_objects:
 863             self.breakpoint()
 864         return sha
 865
 866     def _require_objcache(self):
 867         if self.objcache is None and self.objcache_maker:
 868             self.objcache = self.objcache_maker()
 869         if self.objcache is None:
 870             raise GitError(
 871                     "PackWriter not opened or can't check exists w/o objcache")
 872
 873     def exists(self, id, want_source=False):
 874         """Return non-empty if an object is found in the object cache."""
 875         self._require_objcache()
 876         return self.objcache.exists(id, want_source=want_source)
 877
 878     def just_write(self, sha, type, content):
 879         """Write an object to the pack file without checking for duplication."""
 880         self._write(sha, type, content)
 881         # If nothing else, gc doesn't have/want an objcache
 882         if self.objcache is not None:
 883             self.objcache.add(sha)
 884
 885     def maybe_write(self, type, content):
 886         """Write an object to the pack file if not present and return its id."""
 887         sha = calc_hash(type, content)
 888         if not self.exists(sha):
 889             self._require_objcache()
 890             self.just_write(sha, type, content)
 891         return sha
 892
 893     def new_blob(self, blob):
 894         """Create a blob object in the pack with the supplied content."""
 895         return self.maybe_write(b'blob', blob)
 896
 897     def new_tree(self, shalist):
 898         """Create a tree object in the pack."""
 899         content = tree_encode(shalist)
 900         return self.maybe_write(b'tree', content)
 901
 902     def new_commit(self, tree, parent,
 903                    author, adate_sec, adate_tz,
 904                    committer, cdate_sec, cdate_tz,
 905                    msg):
 906         """Create a commit object in the pack.  The date_sec values must be
 907         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 908         content = create_commit_blob(tree, parent,
 909                                      author, adate_sec, adate_tz,
 910                                      committer, cdate_sec, cdate_tz,
 911                                      msg)
 912         return self.maybe_write(b'commit', content)
 913
 914     def _end(self, run_midx=True, abort=False):
 915         # Ignores run_midx during abort
 916         if not self.file:
 917             return None
 918         self.file, f = None, self.file
 919         self.idx, idx = None, self.idx
 920         self.parentfd, pfd, = None, self.parentfd
 921
 922         try:
 923             with nullcontext_if_not(self.objcache), \
 924                  finalized(pfd, lambda x: x is not None and os.close(x)), \
 925                  f:
 926
 927                 if abort:
 928                     os.unlink(self.filename + b'.pack')
 929                     return None
 930
 931                 # update object count
 932                 f.seek(8)
 933                 cp = struct.pack('!i', self.count)
 934                 assert len(cp) == 4
 935                 f.write(cp)
 936
 937                 # calculate the pack sha1sum
 938                 f.seek(0)
 939                 sum = Sha1()
 940                 for b in chunkyreader(f):
 941                     sum.update(b)
 942                 packbin = sum.digest()
 943                 f.write(packbin)
 944                 f.flush()
 945                 fdatasync(f.fileno())
 946                 f.close()
 947
 948                 idx.write(self.filename + b'.idx', packbin)
 949                 nameprefix = os.path.join(self.repo_dir,
 950                                           b'objects/pack/pack-' +  hexlify(packbin))
 951                 if os.path.exists(self.filename + b'.map'):
 952                     os.unlink(self.filename + b'.map')
 953                 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
 954                 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
 955                 os.fsync(pfd)
 956                 if run_midx:
 957                     auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
 958                 if self.on_pack_finish:
 959                     self.on_pack_finish(nameprefix)
 960                 return nameprefix
 961         finally:
 962             # Must be last -- some of the code above depends on it
 963             self.objcache = None
 964
 965     def abort(self):
 966         """Remove the pack file from disk."""
 967         self.closed = True
 968         self._end(abort=True)
 969
 970     def breakpoint(self):
 971         """Clear byte and object counts and return the last processed id."""
 972         id = self._end(self.run_midx)
 973         self.outbytes = self.count = 0
 974         return id
 975
 976     def close(self, run_midx=True):
 977         """Close the pack file and move it to its definitive path."""
 978         self.closed = True
 979         return self._end(run_midx=run_midx)
 980
 981     def __del__(self):
 982         assert self.closed
 983
 984
 985 class PackIdxV2Writer:
 986     def __init__(self):
 987         self.idx = list(list() for i in range(256))
 988         self.count = 0
 989
 990     def add(self, sha, crc, offs):
 991         assert(sha)
 992         self.count += 1
 993         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 994
 995     def write(self, filename, packbin):
 996         ofs64_count = 0
 997         for section in self.idx:
 998             for entry in section:
 999                 if entry[2] >= 2**31:
1000                     ofs64_count += 1
1001
1002         # Length: header + fan-out + shas-and-crcs + overflow-offsets
1003         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
1004         idx_map = None
1005         idx_f = open(filename, 'w+b')
1006         try:
1007             idx_f.truncate(index_len)
1008             fdatasync(idx_f.fileno())
1009             idx_map = mmap_readwrite(idx_f, close=False)
1010             try:
1011                 count = _helpers.write_idx(filename, idx_map, self.idx,
1012                                            self.count)
1013                 assert(count == self.count)
1014                 idx_map.flush()
1015             finally:
1016                 idx_map.close()
1017         finally:
1018             idx_f.close()
1019
1020         idx_f = open(filename, 'a+b')
1021         try:
1022             idx_f.write(packbin)
1023             idx_f.seek(0)
1024             idx_sum = Sha1()
1025             b = idx_f.read(8 + 4*256)
1026             idx_sum.update(b)
1027
1028             for b in chunkyreader(idx_f, 20 * self.count):
1029                 idx_sum.update(b)
1030
1031             for b in chunkyreader(idx_f):
1032                 idx_sum.update(b)
1033             idx_f.write(idx_sum.digest())
1034             fdatasync(idx_f.fileno())
1035         finally:
1036             idx_f.close()
1037
1038
1039 def list_refs(patterns=None, repo_dir=None,
1040               limit_to_heads=False, limit_to_tags=False):
1041     """Yield (refname, hash) tuples for all repository refs unless
1042     patterns are specified.  In that case, only include tuples for
1043     refs matching those patterns (cf. git-show-ref(1)).  The limits
1044     restrict the result items to refs/heads or refs/tags.  If both
1045     limits are specified, items from both sources will be included.
1046
1047     """
1048     argv = [b'git', b'show-ref']
1049     if limit_to_heads:
1050         argv.append(b'--heads')
1051     if limit_to_tags:
1052         argv.append(b'--tags')
1053     argv.append(b'--')
1054     if patterns:
1055         argv.extend(patterns)
1056     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
1057                          close_fds=True)
1058     out = p.stdout.read().strip()
1059     rv = p.wait()  # not fatal
1060     if rv:
1061         assert(not out)
1062     if out:
1063         for d in out.split(b'\n'):
1064             sha, name = d.split(b' ', 1)
1065             yield name, unhexlify(sha)
1066
1067
1068 def read_ref(refname, repo_dir = None):
1069     """Get the commit id of the most recent commit made on a given ref."""
1070     refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
1071     l = tuple(islice(refs, 2))
1072     if l:
1073         assert(len(l) == 1)
1074         return l[0][1]
1075     else:
1076         return None
1077
1078
1079 def rev_list_invocation(ref_or_refs, format=None):
1080     if isinstance(ref_or_refs, bytes):
1081         refs = (ref_or_refs,)
1082     else:
1083         refs = ref_or_refs
1084     argv = [b'git', b'rev-list']
1085
1086     if format:
1087         argv.append(b'--pretty=format:' + format)
1088     for ref in refs:
1089         assert not ref.startswith(b'-')
1090         argv.append(ref)
1091     argv.append(b'--')
1092     return argv
1093
1094
1095 def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
1096     """Yield information about commits as per "git rev-list".  If a format
1097     is not provided, yield one hex hash at a time.  If a format is
1098     provided, pass it to rev-list and call parse(git_stdout) for each
1099     commit with the stream positioned just after the rev-list "commit
1100     HASH" header line.  When a format is provided yield (oidx,
1101     parse(git_stdout)) for each commit.
1102
1103     """
1104     assert bool(parse) == bool(format)
1105     p = subprocess.Popen(rev_list_invocation(ref_or_refs,
1106                                              format=format),
1107                          env=_gitenv(repo_dir),
1108                          stdout = subprocess.PIPE,
1109                          close_fds=True)
1110     if not format:
1111         for line in p.stdout:
1112             yield line.strip()
1113     else:
1114         line = p.stdout.readline()
1115         while line:
1116             s = line.strip()
1117             if not s.startswith(b'commit '):
1118                 raise Exception('unexpected line ' + repr(s))
1119             s = s[7:]
1120             assert len(s) == 40
1121             yield s, parse(p.stdout)
1122             line = p.stdout.readline()
1123
1124     rv = p.wait()  # not fatal
1125     if rv:
1126         raise GitError('git rev-list returned error %d' % rv)
1127
1128
1129 def rev_parse(committish, repo_dir=None):
1130     """Resolve the full hash for 'committish', if it exists.
1131
1132     Should be roughly equivalent to 'git rev-parse'.
1133
1134     Returns the hex value of the hash if it is found, None if 'committish' does
1135     not correspond to anything.
1136     """
1137     head = read_ref(committish, repo_dir=repo_dir)
1138     if head:
1139         debug2("resolved from ref: commit = %s\n" % hexlify(head))
1140         return head
1141
1142     if len(committish) == 40:
1143         try:
1144             hash = unhexlify(committish)
1145         except TypeError:
1146             return None
1147
1148         with PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) as pL:
1149             if pL.exists(hash):
1150                 return hash
1151
1152     return None
1153
1154
1155 def update_ref(refname, newval, oldval, repo_dir=None):
1156     """Update a repository reference."""
1157     if not oldval:
1158         oldval = b''
1159     assert refname.startswith(b'refs/heads/') \
1160         or refname.startswith(b'refs/tags/')
1161     p = subprocess.Popen([b'git', b'update-ref', refname,
1162                           hexlify(newval), hexlify(oldval)],
1163                          env=_gitenv(repo_dir),
1164                          close_fds=True)
1165     _git_wait(b'git update-ref', p)
1166
1167
1168 def delete_ref(refname, oldvalue=None):
1169     """Delete a repository reference (see git update-ref(1))."""
1170     assert refname.startswith(b'refs/')
1171     oldvalue = [] if not oldvalue else [oldvalue]
1172     p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
1173                          env=_gitenv(),
1174                          close_fds=True)
1175     _git_wait('git update-ref', p)
1176
1177
1178 def guess_repo(path=None):
1179     """Set the path value in the global variable "repodir".
1180     This makes bup look for an existing bup repository, but not fail if a
1181     repository doesn't exist. Usually, if you are interacting with a bup
1182     repository, you would not be calling this function but using
1183     check_repo_or_die().
1184     """
1185     global repodir
1186     if path:
1187         repodir = path
1188     if not repodir:
1189         repodir = environ.get(b'BUP_DIR')
1190         if not repodir:
1191             repodir = os.path.expanduser(b'~/.bup')
1192
1193
1194 def init_repo(path=None):
1195     """Create the Git bare repository for bup in a given path."""
1196     guess_repo(path)
1197     d = repo()  # appends a / to the path
1198     parent = os.path.dirname(os.path.dirname(d))
1199     if parent and not os.path.exists(parent):
1200         raise GitError('parent directory "%s" does not exist\n'
1201                        % path_msg(parent))
1202     if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
1203         raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
1204     p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
1205                          env=_gitenv(),
1206                          close_fds=True)
1207     _git_wait('git init', p)
1208     # Force the index version configuration in order to ensure bup works
1209     # regardless of the version of the installed Git binary.
1210     p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
1211                          stdout=sys.stderr, env=_gitenv(), close_fds=True)
1212     _git_wait('git config', p)
1213     # Enable the reflog
1214     p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
1215                          stdout=sys.stderr, env=_gitenv(), close_fds=True)
1216     _git_wait('git config', p)
1217
1218
1219 def check_repo_or_die(path=None):
1220     """Check to see if a bup repository probably exists, and abort if not."""
1221     guess_repo(path)
1222     top = repo()
1223     pst = stat_if_exists(top + b'/objects/pack')
1224     if pst and stat.S_ISDIR(pst.st_mode):
1225         return
1226     if not pst:
1227         top_st = stat_if_exists(top)
1228         if not top_st:
1229             log('error: repository %r does not exist (see "bup help init")\n'
1230                 % top)
1231             sys.exit(15)
1232     log('error: %s is not a repository\n' % path_msg(top))
1233     sys.exit(14)
1234
1235
1236 def is_suitable_git(ver_str):
1237     if not ver_str.startswith(b'git version '):
1238         return 'unrecognized'
1239     ver_str = ver_str[len(b'git version '):]
1240     if ver_str.startswith(b'0.'):
1241         return 'insufficient'
1242     if ver_str.startswith(b'1.'):
1243         if re.match(br'1\.[012345]rc', ver_str):
1244             return 'insufficient'
1245         if re.match(br'1\.[01234]\.', ver_str):
1246             return 'insufficient'
1247         if re.match(br'1\.5\.[012345]($|\.)', ver_str):
1248             return 'insufficient'
1249         if re.match(br'1\.5\.6-rc', ver_str):
1250             return 'insufficient'
1251         return 'suitable'
1252     if re.match(br'[0-9]+(\.|$)?', ver_str):
1253         return 'suitable'
1254     sys.exit(13)
1255
1256 _git_great = None
1257
1258 def require_suitable_git(ver_str=None):
1259     """Raise GitError if the version of git isn't suitable.
1260
1261     Rely on ver_str when provided, rather than invoking the git in the
1262     path.
1263
1264     """
1265     global _git_great
1266     if _git_great is not None:
1267         return
1268     if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
1269        in (b'yes', b'true', b'1'):
1270         _git_great = True
1271         return
1272     if not ver_str:
1273         ver_str, _, _ = _git_exo([b'git', b'--version'])
1274     status = is_suitable_git(ver_str)
1275     if status == 'unrecognized':
1276         raise GitError('Unexpected git --version output: %r' % ver_str)
1277     if status == 'insufficient':
1278         log('error: git version must be at least 1.5.6\n')
1279         sys.exit(1)
1280     if status == 'suitable':
1281         _git_great = True
1282         return
1283     assert False
1284
1285
1286 class CatPipe:
1287     """Link to 'git cat-file' that is used to retrieve blob data."""
1288     def __init__(self, repo_dir = None):
1289         require_suitable_git()
1290         self.repo_dir = repo_dir
1291         self.p = self.inprogress = None
1292
1293     def close(self, wait=False):
1294         self.p, p = None, self.p
1295         self.inprogress = None
1296         if p:
1297             try:
1298                 p.stdout.close()
1299             finally:
1300                 # This will handle pending exceptions correctly once
1301                 # we drop py2
1302                 p.stdin.close()
1303         if wait:
1304             p.wait()
1305             return p.returncode
1306         return None
1307
1308     def restart(self):
1309         self.close()
1310         self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1311                                   stdin=subprocess.PIPE,
1312                                   stdout=subprocess.PIPE,
1313                                   close_fds = True,
1314                                   bufsize = 4096,
1315                                   env=_gitenv(self.repo_dir))
1316
1317     def get(self, ref):
1318         """Yield (oidx, type, size), followed by the data referred to by ref.
1319         If ref does not exist, only yield (None, None, None).
1320
1321         """
1322         if not self.p or self.p.poll() != None:
1323             self.restart()
1324         assert(self.p)
1325         poll_result = self.p.poll()
1326         assert(poll_result == None)
1327         if self.inprogress:
1328             log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1329         assert(not self.inprogress)
1330         assert ref.find(b'\n') < 0
1331         assert ref.find(b'\r') < 0
1332         assert not ref.startswith(b'-')
1333         self.inprogress = ref
1334         self.p.stdin.write(ref + b'\n')
1335         self.p.stdin.flush()
1336         hdr = self.p.stdout.readline()
1337         if not hdr:
1338             raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
1339                            % (ref, self.p.poll() or 'none'))
1340         if hdr.endswith(b' missing\n'):
1341             self.inprogress = None
1342             yield None, None, None
1343             return
1344         info = hdr.split(b' ')
1345         if len(info) != 3 or len(info[0]) != 40:
1346             raise GitError('expected object (id, type, size), got %r' % info)
1347         oidx, typ, size = info
1348         size = int(size)
1349         try:
1350             it = chunkyreader(self.p.stdout, size)
1351             yield oidx, typ, size
1352             for blob in chunkyreader(self.p.stdout, size):
1353                 yield blob
1354             readline_result = self.p.stdout.readline()
1355             assert readline_result == b'\n'
1356             self.inprogress = None
1357         except Exception as ex:
1358             with pending_raise(ex):
1359                 self.close()
1360
1361     def _join(self, it):
1362         _, typ, _ = next(it)
1363         if typ == b'blob':
1364             for blob in it:
1365                 yield blob
1366         elif typ == b'tree':
1367             treefile = b''.join(it)
1368             for (mode, name, sha) in tree_decode(treefile):
1369                 for blob in self.join(hexlify(sha)):
1370                     yield blob
1371         elif typ == b'commit':
1372             treeline = b''.join(it).split(b'\n')[0]
1373             assert treeline.startswith(b'tree ')
1374             for blob in self.join(treeline[5:]):
1375                 yield blob
1376         else:
1377             raise GitError('invalid object type %r: expected blob/tree/commit'
1378                            % typ)
1379
1380     def join(self, id):
1381         """Generate a list of the content of all blobs that can be reached
1382         from an object.  The hash given in 'id' must point to a blob, a tree
1383         or a commit. The content of all blobs that can be seen from trees or
1384         commits will be added to the list.
1385         """
1386         for d in self._join(self.get(id)):
1387             yield d
1388
1389
1390 _cp = {}
1391
1392 def cp(repo_dir=None):
1393     """Create a CatPipe object or reuse the already existing one."""
1394     global _cp, repodir
1395     if not repo_dir:
1396         repo_dir = repodir or repo()
1397     repo_dir = os.path.abspath(repo_dir)
1398     cp = _cp.get(repo_dir)
1399     if not cp:
1400         cp = CatPipe(repo_dir)
1401         _cp[repo_dir] = cp
1402     return cp
1403
1404
1405 def close_catpipes():
1406     # FIXME: chain exceptions
1407     while _cp:
1408         _, cp = _cp.popitem()
1409         cp.close(wait=True)
1410
1411
1412 def tags(repo_dir = None):
1413     """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1414     tags = {}
1415     for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
1416         assert n.startswith(b'refs/tags/')
1417         name = n[10:]
1418         if not c in tags:
1419             tags[c] = []
1420         tags[c].append(name)  # more than one tag can point at 'c'
1421     return tags
1422
1423
1424 class MissingObject(KeyError):
1425     def __init__(self, oid):
1426         self.oid = oid
1427         KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
1428
1429
1430 WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
1431                                    'path', 'chunk_path', 'data'])
1432 # The path is the mangled path, and if an item represents a fragment
1433 # of a chunked file, the chunk_path will be the chunked subtree path
1434 # for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
1435 # chunked file will have a chunk_path of [''].  So some chunk subtree
1436 # of the file '/foo/bar/baz' might look like this:
1437 #
1438 #   item.path = ['foo', 'bar', 'baz.bup']
1439 #   item.chunk_path = ['', '2d3115e', '016b097']
1440 #   item.type = 'tree'
1441 #   ...
1442
1443
1444 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1445     """Yield everything reachable from oidx via get_ref (which must behave
1446     like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1447     returns true.  Throw MissingObject if a hash encountered is
1448     missing from the repository, and don't read or return blob content
1449     in the data field unless include_data is set.
1450
1451     """
1452     # Maintain the pending stack on the heap to avoid stack overflow
1453     pending = [(oidx, [], [], None)]
1454     while len(pending):
1455         oidx, parent_path, chunk_path, mode = pending.pop()
1456         oid = unhexlify(oidx)
1457         if stop_at and stop_at(oidx):
1458             continue
1459
1460         if (not include_data) and mode and stat.S_ISREG(mode):
1461             # If the object is a "regular file", then it's a leaf in
1462             # the graph, so we can skip reading the data if the caller
1463             # hasn't requested it.
1464             yield WalkItem(oid=oid, type=b'blob',
1465                            chunk_path=chunk_path, path=parent_path,
1466                            mode=mode,
1467                            data=None)
1468             continue
1469
1470         item_it = get_ref(oidx)
1471         get_oidx, typ, _ = next(item_it)
1472         if not get_oidx:
1473             raise MissingObject(unhexlify(oidx))
1474         if typ not in (b'blob', b'commit', b'tree'):
1475             raise Exception('unexpected repository object type %r' % typ)
1476
1477         # FIXME: set the mode based on the type when the mode is None
1478         if typ == b'blob' and not include_data:
1479             # Dump data until we can ask cat_pipe not to fetch it
1480             for ignored in item_it:
1481                 pass
1482             data = None
1483         else:
1484             data = b''.join(item_it)
1485
1486         yield WalkItem(oid=oid, type=typ,
1487                        chunk_path=chunk_path, path=parent_path,
1488                        mode=mode,
1489                        data=(data if include_data else None))
1490
1491         if typ == b'commit':
1492             commit_items = parse_commit(data)
1493             for pid in commit_items.parents:
1494                 pending.append((pid, parent_path, chunk_path, mode))
1495             pending.append((commit_items.tree, parent_path, chunk_path,
1496                             hashsplit.GIT_MODE_TREE))
1497         elif typ == b'tree':
1498             for mode, name, ent_id in tree_decode(data):
1499                 demangled, bup_type = demangle_name(name, mode)
1500                 if chunk_path:
1501                     sub_path = parent_path
1502                     sub_chunk_path = chunk_path + [name]
1503                 else:
1504                     sub_path = parent_path + [name]
1505                     if bup_type == BUP_CHUNKED:
1506                         sub_chunk_path = [b'']
1507                     else:
1508                         sub_chunk_path = chunk_path
1509                 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
1510                                 mode))