lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12
  13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  14 from bup.compat import (buffer,
  15                         byte_int, bytes_from_byte, bytes_from_uint,
  16                         environ,
  17                         ExitStack,
  18                         items,
  19                         pending_raise,
  20                         range,
  21                         reraise)
  22 from bup.io import path_msg
  23 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  24                          exo,
  25                          fdatasync,
  26                          finalized,
  27                          log,
  28                          merge_dict,
  29                          merge_iter,
  30                          mmap_read, mmap_readwrite,
  31                          nullcontext_if_not,
  32                          progress, qprogress, stat_if_exists,
  33                          unlink,
  34                          utc_offset_str)
  35
  36
# Module-wide verbosity level.
# NOTE(review): nothing in the visible part of this file reads it;
# presumably it is set and consulted elsewhere -- confirm.
verbose = 0
repodir = None  # The default repository, once initialized

# Numeric type codes stored in bits 4-6 of a pack object's first header
# byte (see _encode_packobj), and the reverse mapping used when
# decoding (see _decode_packobj).
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}


# Lookup statistics: _total_searches counts index lookups and
# _total_steps counts the probes they needed (updated by
# PackIdx._idx_from_hash and PackIdxList.exists).
_total_searches = 0
_total_steps = 0
  46
  47
  48 class GitError(Exception):
  49     pass
  50
  51
  52 def _gitenv(repo_dir=None):
  53     if not repo_dir:
  54         repo_dir = repo()
  55     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  56
  57 def _git_wait(cmd, p):
  58     rv = p.wait()
  59     if rv != 0:
  60         raise GitError('%r returned %d' % (cmd, rv))
  61
  62 def _git_exo(cmd, **kwargs):
  63     kwargs['check'] = False
  64     result = exo(cmd, **kwargs)
  65     _, _, proc = result
  66     if proc.returncode != 0:
  67         raise GitError('%r returned %d' % (cmd, proc.returncode))
  68     return result
  69
  70 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  71     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  72     cmd = [b'git', b'config', b'--null']
  73     if cfg_file:
  74         cmd.extend([b'--file', cfg_file])
  75     if opttype == 'int':
  76         cmd.extend([b'--int'])
  77     elif opttype == 'bool':
  78         cmd.extend([b'--bool'])
  79     else:
  80         assert opttype is None
  81     cmd.extend([b'--get', option])
  82     env=None
  83     if repo_dir:
  84         env = _gitenv(repo_dir=repo_dir)
  85     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  86                          close_fds=True)
  87     # with --null, git writes out a trailing \0 after the value
  88     r = p.stdout.read()[:-1]
  89     rc = p.wait()
  90     if rc == 0:
  91         if opttype == 'int':
  92             return int(r)
  93         elif opttype == 'bool':
  94             # git converts to 'true' or 'false'
  95             return r == b'true'
  96         return r
  97     if rc != 1:
  98         raise GitError('%r returned %d' % (cmd, rc))
  99     return None
 100
 101
 102 def parse_tz_offset(s):
 103     """UTC offset in seconds."""
 104     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 105     if bytes_from_byte(s[0]) == b'-':
 106         return - tz_off
 107     return tz_off
 108
 109 def parse_commit_gpgsig(sig):
 110     """Return the original signature bytes.
 111
 112     i.e. with the "gpgsig " header and the leading space character on
 113     each continuation line removed.
 114
 115     """
 116     if not sig:
 117         return None
 118     assert sig.startswith(b'gpgsig ')
 119     sig = sig[7:]
 120     return sig.replace(b'\n ', b'\n')
 121
 122 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
 123 # Make sure that's authoritative.
 124
 125 # See also
 126 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
 127 # The continuation lines have only one leading space.
 128
# Building blocks for _commit_rx, the regular expression that
# parse_commit() uses to take a commit object apart.

# A character allowed at the start or end of a name/mail string:
# anything except space . , : ; < > " ' NUL and newline.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# A character allowed in the middle: anything except NUL, newline, < and >.
_content_char = br'[^\0\n<>]'
# Either one or two start/end characters, or a start/end character,
# any number of content characters, and a closing start/end character.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Sign, two hour digits, and two minute digits in the range 00-59.
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# In order: the tree line, zero or more parent lines, the author and
# committer lines, an optional mergetag, an optional gpgsig header with
# its continuation lines, and finally the message (everything left).
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Pulls the individual hashes out of the text matched by the "parents"
# group of _commit_rx.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 148
 149 # Note that the author_sec and committer_sec values are (UTC) epoch
 150 # seconds, and for now the mergetag is not included.
 151 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 152                                        'author_name', 'author_mail',
 153                                        'author_sec', 'author_offset',
 154                                        'committer_name', 'committer_mail',
 155                                        'committer_sec', 'committer_offset',
 156                                        'gpgsig',
 157                                        'message'])
 158
 159 def parse_commit(content):
 160     commit_match = re.match(_commit_rx, content)
 161     if not commit_match:
 162         raise Exception('cannot parse commit %r' % content)
 163     matches = commit_match.groupdict()
 164     return CommitInfo(tree=matches['tree'],
 165                       parents=re.findall(_parent_hash_rx, matches['parents']),
 166                       author_name=matches['author_name'],
 167                       author_mail=matches['author_mail'],
 168                       author_sec=int(matches['asec']),
 169                       author_offset=parse_tz_offset(matches['atz']),
 170                       committer_name=matches['committer_name'],
 171                       committer_mail=matches['committer_mail'],
 172                       committer_sec=int(matches['csec']),
 173                       committer_offset=parse_tz_offset(matches['ctz']),
 174                       gpgsig=parse_commit_gpgsig(matches['gpgsig']),
 175                       message=matches['message'])
 176
 177
 178 def get_cat_data(cat_iterator, expected_type):
 179     _, kind, _ = next(cat_iterator)
 180     if kind != expected_type:
 181         raise Exception('expected %r, saw %r' % (expected_type, kind))
 182     return b''.join(cat_iterator)
 183
 184 def get_commit_items(id, cp):
 185     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 186
 187 def _local_git_date_str(epoch_sec):
 188     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 189
 190
 191 def _git_date_str(epoch_sec, tz_offset_sec):
 192     offs =  tz_offset_sec // 60
 193     return b'%d %s%02d%02d' \
 194         % (epoch_sec,
 195            b'+' if offs >= 0 else b'-',
 196            abs(offs) // 60,
 197            abs(offs) % 60)
 198
 199
 200 def repo(sub = b'', repo_dir=None):
 201     """Get the path to the git repository or one of its subdirectories."""
 202     repo_dir = repo_dir or repodir
 203     if not repo_dir:
 204         raise GitError('You should call check_repo_or_die()')
 205
 206     # If there's a .git subdirectory, then the actual repo is in there.
 207     gd = os.path.join(repo_dir, b'.git')
 208     if os.path.exists(gd):
 209         repo_dir = gd
 210
 211     return os.path.join(repo_dir, sub)
 212
 213
 214 _shorten_hash_rx = \
 215     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 216
 217 def shorten_hash(s):
 218     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 219
 220
 221 def repo_rel(path):
 222     full = os.path.abspath(path)
 223     fullrepo = os.path.abspath(repo(b''))
 224     if not fullrepo.endswith(b'/'):
 225         fullrepo += b'/'
 226     if full.startswith(fullrepo):
 227         path = full[len(fullrepo):]
 228     if path.startswith(b'index-cache/'):
 229         path = path[len(b'index-cache/'):]
 230     return shorten_hash(path)
 231
 232
 233 def auto_midx(objdir):
 234     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 235     try:
 236         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 237     except OSError as e:
 238         # make sure 'args' gets printed to help with debugging
 239         add_error('%r: exception: %s' % (args, e))
 240         raise
 241     if rv:
 242         add_error('%r: returned %d' % (args, rv))
 243
 244     args = [path.exe(), b'bloom', b'--dir', objdir]
 245     try:
 246         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 247     except OSError as e:
 248         # make sure 'args' gets printed to help with debugging
 249         add_error('%r: exception: %s' % (args, e))
 250         raise
 251     if rv:
 252         add_error('%r: returned %d' % (args, rv))
 253
 254
 255 def mangle_name(name, mode, gitmode):
 256     """Mangle a file name to present an abstract name for segmented files.
 257     Mangled file names will have the ".bup" extension added to them. If a
 258     file's name already ends with ".bup", a ".bupl" extension is added to
 259     disambiguate normal files from segmented ones.
 260     """
 261     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 262         assert(stat.S_ISDIR(gitmode))
 263         return name + b'.bup'
 264     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 265         return name + b'.bupl'
 266     else:
 267         return name
 268
 269
 270 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 271 def demangle_name(name, mode):
 272     """Remove name mangling from a file name, if necessary.
 273
 274     The return value is a tuple (demangled_filename,mode), where mode is one of
 275     the following:
 276
 277     * BUP_NORMAL  : files that should be read as-is from the repository
 278     * BUP_CHUNKED : files that were chunked and need to be reassembled
 279
 280     For more information on the name mangling algorithm, see mangle_name()
 281     """
 282     if name.endswith(b'.bupl'):
 283         return (name[:-5], BUP_NORMAL)
 284     elif name.endswith(b'.bup'):
 285         return (name[:-4], BUP_CHUNKED)
 286     elif name.endswith(b'.bupm'):
 287         return (name[:-5],
 288                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 289     return (name, BUP_NORMAL)
 290
 291
 292 def calc_hash(type, content):
 293     """Calculate some content's hash in the Git fashion."""
 294     header = b'%s %d\0' % (type, len(content))
 295     sum = Sha1(header)
 296     sum.update(content)
 297     return sum.digest()
 298
 299
 300 def shalist_item_sort_key(ent):
 301     (mode, name, id) = ent
 302     assert(mode+0 == mode)
 303     if stat.S_ISDIR(mode):
 304         return name + b'/'
 305     else:
 306         return name
 307
 308
 309 def tree_encode(shalist):
 310     """Generate a git tree object from (mode,name,hash) tuples."""
 311     shalist = sorted(shalist, key = shalist_item_sort_key)
 312     l = []
 313     for (mode,name,bin) in shalist:
 314         assert(mode)
 315         assert(mode+0 == mode)
 316         assert(name)
 317         assert(len(bin) == 20)
 318         s = b'%o %s\0%s' % (mode,name,bin)
 319         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 320         l.append(s)
 321     return b''.join(l)
 322
 323
 324 def tree_decode(buf):
 325     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 326     ofs = 0
 327     while ofs < len(buf):
 328         z = buf.find(b'\0', ofs)
 329         assert(z > ofs)
 330         spl = buf[ofs:z].split(b' ', 1)
 331         assert(len(spl) == 2)
 332         mode,name = spl
 333         sha = buf[z+1:z+1+20]
 334         ofs = z+1+20
 335         yield (int(mode, 8), name, sha)
 336
 337
 338 def _encode_packobj(type, content, compression_level=1):
 339     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 340         raise ValueError('invalid compression level %s' % compression_level)
 341     szout = b''
 342     sz = len(content)
 343     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 344     sz >>= 4
 345     while 1:
 346         if sz: szbits |= 0x80
 347         szout += bytes_from_uint(szbits)
 348         if not sz:
 349             break
 350         szbits = sz & 0x7f
 351         sz >>= 7
 352     z = zlib.compressobj(compression_level)
 353     yield szout
 354     yield z.compress(content)
 355     yield z.flush()
 356
 357
 358 def _decode_packobj(buf):
 359     assert(buf)
 360     c = byte_int(buf[0])
 361     type = _typermap[(c & 0x70) >> 4]
 362     sz = c & 0x0f
 363     shift = 4
 364     i = 0
 365     while c & 0x80:
 366         i += 1
 367         c = byte_int(buf[i])
 368         sz |= (c & 0x7f) << shift
 369         shift += 7
 370         if not (c & 0x80):
 371             break
 372     return (type, zlib.decompress(buf[i+1:]))
 373
 374
 375 class PackIdx:
 376     def __init__(self):
 377         assert(0)
 378
 379     def find_offset(self, hash):
 380         """Get the offset of an object inside the index file."""
 381         idx = self._idx_from_hash(hash)
 382         if idx != None:
 383             return self._ofs_from_idx(idx)
 384         return None
 385
 386     def exists(self, hash, want_source=False):
 387         """Return nonempty if the object exists in this index."""
 388         if hash and (self._idx_from_hash(hash) != None):
 389             return want_source and os.path.basename(self.name) or True
 390         return None
 391
 392     def _idx_from_hash(self, hash):
 393         global _total_searches, _total_steps
 394         _total_searches += 1
 395         assert(len(hash) == 20)
 396         b1 = byte_int(hash[0])
 397         start = self.fanout[b1-1] # range -1..254
 398         end = self.fanout[b1] # range 0..255
 399         want = hash
 400         _total_steps += 1  # lookup table is a step
 401         while start < end:
 402             _total_steps += 1
 403             mid = start + (end - start) // 2
 404             v = self._idx_to_hash(mid)
 405             if v < want:
 406                 start = mid+1
 407             elif v > want:
 408                 end = mid
 409             else: # got it!
 410                 return mid
 411         return None
 412
 413
 414 class PackIdxV1(PackIdx):
 415     """Object representation of a Git pack index (version 1) file."""
 416     def __init__(self, filename, f):
 417         self.closed = False
 418         self.name = filename
 419         self.idxnames = [self.name]
 420         self.map = mmap_read(f)
 421         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 422         self.fanout = array('L', struct.unpack('!256I', self.map))
 423         self.fanout.append(0)  # entry "-1"
 424         self.nsha = self.fanout[255]
 425         self.sha_ofs = 256 * 4
 426         # Avoid slicing shatable for individual hashes (very high overhead)
 427         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 428
 429     def __enter__(self):
 430         return self
 431
 432     def __exit__(self, type, value, traceback):
 433         with pending_raise(value, rethrow=False):
 434             self.close()
 435
 436     def __len__(self):
 437         return int(self.nsha)  # int() from long for python 2
 438
 439     def _ofs_from_idx(self, idx):
 440         if idx >= self.nsha or idx < 0:
 441             raise IndexError('invalid pack index index %d' % idx)
 442         ofs = self.sha_ofs + idx * 24
 443         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 444
 445     def _idx_to_hash(self, idx):
 446         if idx >= self.nsha or idx < 0:
 447             raise IndexError('invalid pack index index %d' % idx)
 448         ofs = self.sha_ofs + idx * 24 + 4
 449         return self.map[ofs : ofs + 20]
 450
 451     def __iter__(self):
 452         start = self.sha_ofs + 4
 453         for ofs in range(start, start + 24 * self.nsha, 24):
 454             yield self.map[ofs : ofs + 20]
 455
 456     def close(self):
 457         self.closed = True
 458         if self.map is not None:
 459             self.shatable = None
 460             self.map.close()
 461             self.map = None
 462
 463     def __del__(self):
 464         assert self.closed
 465
 466
 467 class PackIdxV2(PackIdx):
 468     """Object representation of a Git pack index (version 2) file."""
 469     def __init__(self, filename, f):
 470         self.closed = False
 471         self.name = filename
 472         self.idxnames = [self.name]
 473         self.map = mmap_read(f)
 474         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 475         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 476         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 477         self.fanout.append(0)
 478         self.nsha = self.fanout[255]
 479         self.sha_ofs = 8 + 256*4
 480         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 481         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 482         # Avoid slicing this for individual hashes (very high overhead)
 483         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 484
 485     def __enter__(self):
 486         return self
 487
 488     def __exit__(self, type, value, traceback):
 489         with pending_raise(value, rethrow=False):
 490             self.close()
 491
 492     def __len__(self):
 493         return int(self.nsha)  # int() from long for python 2
 494
 495     def _ofs_from_idx(self, idx):
 496         if idx >= self.nsha or idx < 0:
 497             raise IndexError('invalid pack index index %d' % idx)
 498         ofs_ofs = self.ofstable_ofs + idx * 4
 499         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 500         if ofs & 0x80000000:
 501             idx64 = ofs & 0x7fffffff
 502             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 503             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 504         return ofs
 505
 506     def _idx_to_hash(self, idx):
 507         if idx >= self.nsha or idx < 0:
 508             raise IndexError('invalid pack index index %d' % idx)
 509         ofs = self.sha_ofs + idx * 20
 510         return self.map[ofs : ofs + 20]
 511
 512     def __iter__(self):
 513         start = self.sha_ofs
 514         for ofs in range(start, start + 20 * self.nsha, 20):
 515             yield self.map[ofs : ofs + 20]
 516
 517     def close(self):
 518         self.closed = True
 519         if self.map is not None:
 520             self.shatable = None
 521             self.map.close()
 522             self.map = None
 523
 524     def __del__(self):
 525         assert self.closed
 526
 527
# Number of PackIdxList instances currently open; the assertions in
# __init__ and close() keep it at 0 or 1.
_mpi_count = 0
class PackIdxList:
    """The pack indexes (.idx and .midx) found in one directory.

    Also holds the directory's bup.bloom filter when it is usable, and
    a set of extra hashes registered through add().  Only one instance
    may be open at a time.  Instances must be closed, via close() or by
    use as a context manager, before they are garbage collected.
    """
    def __init__(self, dir, ignore_midx=False):
        """Open the indexes in dir; see refresh() for ignore_midx."""
        global _mpi_count
        # Q: was this also intended to prevent opening multiple repos?
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.open = True
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        try:
            self.refresh()
        except BaseException as ex:
            # Release whatever refresh() managed to open before the
            # exception is passed on.
            with pending_raise(ex):
                self.close()

    def close(self):
        """Close every index and the bloom filter; a repeat call is a no-op."""
        global _mpi_count
        if not self.open:
            assert _mpi_count == 0
            return
        _mpi_count -= 1
        assert _mpi_count == 0
        self.also = None
        # Detach the resources from self before closing them.
        self.bloom, bloom = None, self.bloom
        self.packs, packs = None, self.packs
        self.open = False
        # Registering each pack with the stack means every pack's
        # __exit__ runs when the with block ends.
        with ExitStack() as stack:
            for pack in packs:
                stack.enter_context(pack)
            if bloom:
                bloom.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
            self.close()

    def __del__(self):
        assert not self.open

    def __iter__(self):
        """Iterate over the merged contents of all indexes (see idxmerge)."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        # Hashes registered through add() count as present.
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Bloom hit: skip the filter until a search of the
                # packs comes up empty again (see the end of this method).
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # Not found in any pack: consult the bloom filter first next time.
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        # d maps index file paths to the open object that covers them.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                # Map every index named by a remaining open midx to
                # that midx.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open the midx files not opened yet; one that refers
                # to a missing index is closed and deleted instead.
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the largest midx files first, and among
                # equally sized ones the most recently modified.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            # Open each .idx file that nothing in d covers yet; one
            # that cannot be opened is reported and skipped.
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            new_packs = set(d.values())
            # Close the previously open indexes that are no longer in use.
            for p in self.packs:
                if not p in new_packs:
                    p.close()
            new_packs = list(new_packs)
            new_packs.sort(reverse=True, key=lambda x: len(x))
            self.packs = new_packs
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            try:
                # Only use the bloom filter if it is valid and holds at
                # least as many entries as the indexes do; otherwise
                # close and drop it.
                if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                    self.do_bloom = True
                else:
                    if self.bloom:
                        self.bloom, bloom_tmp = None, self.bloom
                        bloom_tmp.close()
            except BaseException as ex:
                with pending_raise(ex):
                    if self.bloom:
                        self.bloom.close()

        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 712
 713
 714 def open_idx(filename):
 715     if filename.endswith(b'.idx'):
 716         f = open(filename, 'rb')
 717         header = f.read(8)
 718         if header[0:4] == b'\377tOc':
 719             version = struct.unpack('!I', header[4:8])[0]
 720             if version == 2:
 721                 return PackIdxV2(filename, f)
 722             else:
 723                 raise GitError('%s: expected idx file version 2, got %d'
 724                                % (path_msg(filename), version))
 725         elif len(header) == 8 and header[0:4] < b'\377tOc':
 726             return PackIdxV1(filename, f)
 727         else:
 728             raise GitError('%s: unrecognized idx file header'
 729                            % path_msg(filename))
 730     elif filename.endswith(b'.midx'):
 731         return midx.PackMidx(filename)
 732     else:
 733         raise GitError('idx filenames must end with .idx or .midx')
 734
 735
 736 def idxmerge(idxlist, final_progress=True):
 737     """Generate a list of all the objects reachable in a PackIdxList."""
 738     def pfunc(count, total):
 739         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 740                   % (count*100.0/total, count, total))
 741     def pfinal(count, total):
 742         if final_progress:
 743             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 744                      % (100, total, total))
 745     return merge_iter(idxlist, 10024, pfunc, pfinal)
 746
 747
 748 def create_commit_blob(tree, parent,
 749                        author, adate_sec, adate_tz,
 750                        committer, cdate_sec, cdate_tz,
 751                        msg):
 752     if adate_tz is not None:
 753         adate_str = _git_date_str(adate_sec, adate_tz)
 754     else:
 755         adate_str = _local_git_date_str(adate_sec)
 756     if cdate_tz is not None:
 757         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 758     else:
 759         cdate_str = _local_git_date_str(cdate_sec)
 760     l = []
 761     if tree: l.append(b'tree %s' % hexlify(tree))
 762     if parent: l.append(b'parent %s' % hexlify(parent))
 763     if author: l.append(b'author %s %s' % (author, adate_str))
 764     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 765     l.append(b'')
 766     l.append(msg)
 767     return b'\n'.join(l)
 768
 769 def _make_objcache():
 770     return PackIdxList(repo(b'objects/pack'))
 771
 772 # bup-gc assumes that it can disable all PackWriter activities
 773 # (bloom/midx/cache) via the constructor and close() arguments.
 774
 775 class PackWriter:
 776     """Writes Git objects inside a pack file."""
 777     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 778                  run_midx=True, on_pack_finish=None,
 779                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 780         self.closed = False
 781         self.repo_dir = repo_dir or repo()
 782         self.file = None
 783         self.parentfd = None
 784         self.count = 0
 785         self.outbytes = 0
 786         self.filename = None
 787         self.idx = None
 788         self.objcache_maker = objcache_maker
 789         self.objcache = None
 790         self.compression_level = compression_level
 791         self.run_midx=run_midx
 792         self.on_pack_finish = on_pack_finish
 793         if not max_pack_size:
 794             max_pack_size = git_config_get(b'pack.packSizeLimit',
 795                                            repo_dir=self.repo_dir,
 796                                            opttype='int')
 797             if not max_pack_size:
 798                 # larger packs slow down pruning
 799                 max_pack_size = 1000 * 1000 * 1000
 800         self.max_pack_size = max_pack_size
 801         # cache memory usage is about 83 bytes per object
 802         self.max_pack_objects = max_pack_objects if max_pack_objects \
 803                                 else max(1, self.max_pack_size // 5000)
 804
 805     def __enter__(self):
 806         return self
 807
 808     def __exit__(self, type, value, traceback):
 809         with pending_raise(value, rethrow=False):
 810             self.close()
 811
 812     def _open(self):
 813         if not self.file:
 814             objdir = dir = os.path.join(self.repo_dir, b'objects')
 815             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 816             try:
 817                 self.file = os.fdopen(fd, 'w+b')
 818             except:
 819                 os.close(fd)
 820                 raise
 821             try:
 822                 self.parentfd = os.open(objdir, os.O_RDONLY)
 823             except:
 824                 f = self.file
 825                 self.file = None
 826                 f.close()
 827                 raise
 828             assert name.endswith(b'.pack')
 829             self.filename = name[:-5]
 830             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 831             self.idx = PackIdxV2Writer()
 832
 833     def _raw_write(self, datalist, sha):
 834         self._open()
 835         f = self.file
 836         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 837         # the file never has a *partial* blob.  So let's make sure it's
 838         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 839         # to our hashsplit algorithm.)  f.write() does its own buffering,
 840         # but that's okay because we'll flush it in _end().
 841         oneblob = b''.join(datalist)
 842         try:
 843             f.write(oneblob)
 844         except IOError as e:
 845             reraise(GitError(e))
 846         nw = len(oneblob)
 847         crc = zlib.crc32(oneblob) & 0xffffffff
 848         self._update_idx(sha, crc, nw)
 849         self.outbytes += nw
 850         self.count += 1
 851         return nw, crc
 852
 853     def _update_idx(self, sha, crc, size):
 854         assert(sha)
 855         if self.idx:
 856             self.idx.add(sha, crc, self.file.tell() - size)
 857
 858     def _write(self, sha, type, content):
 859         if verbose:
 860             log('>')
 861         if not sha:
 862             sha = calc_hash(type, content)
 863         size, crc = self._raw_write(_encode_packobj(type, content,
 864                                                     self.compression_level),
 865                                     sha=sha)
 866         if self.outbytes >= self.max_pack_size \
 867            or self.count >= self.max_pack_objects:
 868             self.breakpoint()
 869         return sha
 870
 871     def _require_objcache(self):
 872         if self.objcache is None and self.objcache_maker:
 873             self.objcache = self.objcache_maker()
 874         if self.objcache is None:
 875             raise GitError(
 876                     "PackWriter not opened or can't check exists w/o objcache")
 877
 878     def exists(self, id, want_source=False):
 879         """Return non-empty if an object is found in the object cache."""
 880         self._require_objcache()
 881         return self.objcache.exists(id, want_source=want_source)
 882
 883     def just_write(self, sha, type, content):
 884         """Write an object to the pack file without checking for duplication."""
 885         self._write(sha, type, content)
 886         # If nothing else, gc doesn't have/want an objcache
 887         if self.objcache is not None:
 888             self.objcache.add(sha)
 889
 890     def maybe_write(self, type, content):
 891         """Write an object to the pack file if not present and return its id."""
 892         sha = calc_hash(type, content)
 893         if not self.exists(sha):
 894             self._require_objcache()
 895             self.just_write(sha, type, content)
 896         return sha
 897
 898     def new_blob(self, blob):
 899         """Create a blob object in the pack with the supplied content."""
 900         return self.maybe_write(b'blob', blob)
 901
 902     def new_tree(self, shalist):
 903         """Create a tree object in the pack."""
 904         content = tree_encode(shalist)
 905         return self.maybe_write(b'tree', content)
 906
 907     def new_commit(self, tree, parent,
 908                    author, adate_sec, adate_tz,
 909                    committer, cdate_sec, cdate_tz,
 910                    msg):
 911         """Create a commit object in the pack.  The date_sec values must be
 912         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 913         content = create_commit_blob(tree, parent,
 914                                      author, adate_sec, adate_tz,
 915                                      committer, cdate_sec, cdate_tz,
 916                                      msg)
 917         return self.maybe_write(b'commit', content)
 918
 919     def _end(self, run_midx=True, abort=False):
 920         # Ignores run_midx during abort
 921         if not self.file:
 922             return None
 923         self.file, f = None, self.file
 924         self.idx, idx = None, self.idx
 925         self.parentfd, pfd, = None, self.parentfd
 926
 927         try:
 928             with nullcontext_if_not(self.objcache), \
 929                  finalized(pfd, lambda x: x is not None and os.close(x)), \
 930                  f:
 931
 932                 if abort:
 933                     os.unlink(self.filename + b'.pack')
 934                     return None
 935
 936                 # update object count
 937                 f.seek(8)
 938                 cp = struct.pack('!i', self.count)
 939                 assert len(cp) == 4
 940                 f.write(cp)
 941
 942                 # calculate the pack sha1sum
 943                 f.seek(0)
 944                 sum = Sha1()
 945                 for b in chunkyreader(f):
 946                     sum.update(b)
 947                 packbin = sum.digest()
 948                 f.write(packbin)
 949                 f.flush()
 950                 fdatasync(f.fileno())
 951                 f.close()
 952
 953                 idx.write(self.filename + b'.idx', packbin)
 954                 nameprefix = os.path.join(self.repo_dir,
 955                                           b'objects/pack/pack-' +  hexlify(packbin))
 956                 if os.path.exists(self.filename + b'.map'):
 957                     os.unlink(self.filename + b'.map')
 958                 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
 959                 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
 960                 os.fsync(pfd)
 961                 if run_midx:
 962                     auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
 963                 if self.on_pack_finish:
 964                     self.on_pack_finish(nameprefix)
 965                 return nameprefix
 966         finally:
 967             # Must be last -- some of the code above depends on it
 968             self.objcache = None
 969
 970     def abort(self):
 971         """Remove the pack file from disk."""
 972         self.closed = True
 973         self._end(abort=True)
 974
 975     def breakpoint(self):
 976         """Clear byte and object counts and return the last processed id."""
 977         id = self._end(self.run_midx)
 978         self.outbytes = self.count = 0
 979         return id
 980
 981     def close(self, run_midx=True):
 982         """Close the pack file and move it to its definitive path."""
 983         self.closed = True
 984         return self._end(run_midx=run_midx)
 985
 986     def __del__(self):
 987         assert self.closed
 988
 989
 990 class PackIdxV2Writer:
 991     def __init__(self):
 992         self.idx = list(list() for i in range(256))
 993         self.count = 0
 994
 995     def add(self, sha, crc, offs):
 996         assert(sha)
 997         self.count += 1
 998         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 999
1000     def write(self, filename, packbin):
1001         ofs64_count = 0
1002         for section in self.idx:
1003             for entry in section:
1004                 if entry[2] >= 2**31:
1005                     ofs64_count += 1
1006
1007         # Length: header + fan-out + shas-and-crcs + overflow-offsets
1008         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
1009         idx_map = None
1010         idx_f = open(filename, 'w+b')
1011         try:
1012             idx_f.truncate(index_len)
1013             fdatasync(idx_f.fileno())
1014             idx_map = mmap_readwrite(idx_f, close=False)
1015             try:
1016                 count = _helpers.write_idx(filename, idx_map, self.idx,
1017                                            self.count)
1018                 assert(count == self.count)
1019                 idx_map.flush()
1020             finally:
1021                 idx_map.close()
1022         finally:
1023             idx_f.close()
1024
1025         idx_f = open(filename, 'a+b')
1026         try:
1027             idx_f.write(packbin)
1028             idx_f.seek(0)
1029             idx_sum = Sha1()
1030             b = idx_f.read(8 + 4*256)
1031             idx_sum.update(b)
1032
1033             for b in chunkyreader(idx_f, 20 * self.count):
1034                 idx_sum.update(b)
1035
1036             for b in chunkyreader(idx_f):
1037                 idx_sum.update(b)
1038             idx_f.write(idx_sum.digest())
1039             fdatasync(idx_f.fileno())
1040         finally:
1041             idx_f.close()
1042
1043
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield a (refname, hash) tuple for each ref reported by
    git-show-ref(1), with the hash in binary form.  When patterns are
    given, they are passed to show-ref so that only matching refs are
    included.  limit_to_heads and limit_to_tags restrict the results
    to refs/heads and refs/tags respectively; if both are set, refs
    from both are included.

    """
    cmd = [b'git', b'show-ref']
    if limit_to_heads:
        cmd.append(b'--heads')
    if limit_to_tags:
        cmd.append(b'--tags')
    cmd.append(b'--')
    if patterns:
        cmd.extend(patterns)
    proc = subprocess.Popen(cmd, env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE, close_fds=True)
    output = proc.stdout.read().strip()
    status = proc.wait()  # not fatal
    if status:
        assert(not output)
    if not output:
        return
    for line in output.split(b'\n'):
        oidx, refname = line.split(b' ', 1)
        yield refname, unhexlify(oidx)
1071
1072
def read_ref(refname, repo_dir = None):
    """Return the hash list_refs reports for refname (searching with
    limit_to_heads set), or None if nothing matches.  At most one
    match is expected."""
    matches = tuple(islice(list_refs(patterns=[refname],
                                     repo_dir=repo_dir,
                                     limit_to_heads=True),
                           2))
    if not matches:
        return None
    assert(len(matches) == 1)
    return matches[0][1]
1082
1083
def rev_list_invocation(ref_or_refs, format=None):
    """Return the 'git rev-list' argv for a single ref (bytes) or an
    iterable of refs, adding --pretty=format: when format is given."""
    refs = (ref_or_refs,) if isinstance(ref_or_refs, bytes) else ref_or_refs
    cmd = [b'git', b'rev-list']

    if format:
        cmd.append(b'--pretty=format:' + format)
    for ref in refs:
        # Refuse anything that would land in argv looking like an option
        assert not ref.startswith(b'-')
        cmd.append(ref)
    cmd.append(b'--')
    return cmd
1098
1099
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as reported by "git rev-list".

    Without a format, yield each output line stripped (one hex hash
    per commit).  With a format, parse must also be given: for each
    commit, yield (oidx, parse(git_stdout)), where parse is called
    with the stream positioned just after the "commit HASH" header
    line.  Raise GitError if git exits with a non-zero status.

    """
    assert bool(parse) == bool(format)
    proc = subprocess.Popen(rev_list_invocation(ref_or_refs, format=format),
                            env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE,
                            close_fds=True)
    out = proc.stdout
    if format:
        # parse() reads from the same stream, so fetch header lines
        # one at a time with readline().
        for line in iter(out.readline, b''):
            header = line.strip()
            if not header.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(header))
            oidx = header[7:]
            assert len(oidx) == 40
            yield oidx, parse(out)
    else:
        for line in out:
            yield line.strip()

    status = proc.wait()
    if status:
        raise GitError('git rev-list returned error %d' % status)
1132
1133
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    'committish' is first looked up as a ref via read_ref(); failing
    that, a 40-character value is treated as a hex object id and
    looked up in the repository's pack indexes.

    Returns the binary hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Not valid hex.  Python 2 raises TypeError here; Python 3
            # raises binascii.Error, which is a ValueError subclass.
            return None

        with PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) as pL:
            if pL.exists(oid):
                return oid

    return None
1158
1159
def update_ref(refname, newval, oldval, repo_dir=None):
    """Run 'git update-ref' to set refname (which must be under
    refs/heads/ or refs/tags/) to newval, passing oldval -- or an
    empty value when it is false -- as the old-value argument."""
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    cmd = [b'git', b'update-ref', refname,
           hexlify(newval), hexlify(oldval or b'')]
    proc = subprocess.Popen(cmd, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', proc)
1171
1172
def delete_ref(refname, oldvalue=None):
    """Remove refname with 'git update-ref -d' (see git update-ref(1)),
    appending oldvalue to the command when one is given."""
    assert refname.startswith(b'refs/')
    cmd = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        cmd.append(oldvalue)
    proc = subprocess.Popen(cmd, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', proc)
1181
1182
def guess_repo(path=None):
    """Choose the value of the global "repodir" without checking that
    a repository actually exists there.

    A non-empty path always wins.  Otherwise an already-set repodir is
    kept, and failing that BUP_DIR from the environment is used, then
    ~/.bup.  Code that needs a working repository would normally call
    check_repo_or_die() instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1197
1198
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raise GitError if the repository's parent directory does not
    exist, or if the repository path exists but is not a directory.
    After 'git init', set pack.indexVersion and core.logAllRefUpdates
    in the new repository.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    settings = (
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        (b'pack.indexVersion', b'2'),
        # Enable the reflog
        (b'core.logAllRefUpdates', b'true'),
    )
    for name, value in settings:
        # Keep every argv element bytes, like the other git invocations
        p = subprocess.Popen([b'git', b'config', name, value],
                             stdout=sys.stderr, env=_gitenv(), close_fds=True)
        _git_wait('git config', p)
1222
1223
def check_repo_or_die(path=None):
    """Return if <repo>/objects/pack is a directory; otherwise log an
    error and exit: status 15 when neither that path nor the
    repository path itself exists, status 14 in every other case."""
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st and stat.S_ISDIR(pack_st.st_mode):
        return
    if not pack_st and not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1239
1240
def is_suitable_git(ver_str):
    """Classify 'git --version' output as 'unrecognized', 'insufficient'
    or 'suitable'.  Exit with status 13 if the text after the
    "git version " prefix does not start with a digit."""
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    version = ver_str[len(prefix):]
    if version.startswith(b'0.'):
        return 'insufficient'
    if version.startswith(b'1.'):
        # 1.x forms that are rejected as too old
        too_old = (br'1\.[012345]rc',
                   br'1\.[01234]\.',
                   br'1\.5\.[012345]($|\.)',
                   br'1\.5\.6-rc')
        if any(re.match(pattern, version) for pattern in too_old):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', version):
        return 'suitable'
    sys.exit(13)
1260
# Set to True once the git version has been accepted (or the check has
# been overridden via BUP_GIT_VERSION_IS_FINE), which lets
# require_suitable_git() return immediately on later calls.
_git_great = None
1262
def require_suitable_git(ver_str=None):
    """Ensure the git version is acceptable, remembering a positive
    result in _git_great so later calls return immediately.

    The check is skipped when BUP_GIT_VERSION_IS_FINE is set to yes,
    true or 1 (case-insensitive).  Rely on ver_str when provided,
    rather than invoking the git in the path.  Raise GitError if the
    version string is unrecognized; log an error and exit with status
    1 if the version is insufficient.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1289
1290
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    The 'git cat-file --batch' subprocess is started on the first
    get() and restarted whenever it is found to have exited.
    """
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        # p is the running subprocess, if any; inprogress is the ref
        # whose data is currently being read by a get().
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the cat-file subprocess, if there is one.

        When wait is true and a subprocess existed, wait for it and
        return its exit status.  Return None otherwise, including when
        no subprocess was ever started.
        """
        self.p, p = None, self.p
        self.inprogress = None
        if not p:
            # Nothing was started (or it was already closed), so
            # there is nothing to close or wait for.
            return None
        try:
            p.stdout.close()
        finally:
            # This will handle pending exceptions correctly once
            # we drop py2
            p.stdin.close()
        if wait:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Close any existing subprocess and start a new
        'git cat-file --batch'."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Only one get() may be in progress at a time.  Raise GitError
        if cat-file's output ends unexpectedly or its header line is
        malformed.
        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # The request is a single line written to cat-file's stdin
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if not hdr:
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        try:
            yield oidx, typ, size
            for blob in chunkyreader(self.p.stdout, size):
                yield blob
            # The object's data is followed by a single newline
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception as ex:
            with pending_raise(ex):
                self.close()

    def _join(self, it):
        """Yield the blob data for the object whose get() iterator is
        it, recursing through trees and commits."""
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of a commit names its tree
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1393
1394
# CatPipe instances created by cp(), keyed by absolute repository path
_cp = {}
1396
def cp(repo_dir=None):
    """Return the CatPipe cached in _cp for repo_dir (defaulting to
    repodir, then repo()), creating and caching one if needed."""
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    pipe = _cp.get(repo_dir)
    if not pipe:
        pipe = _cp[repo_dir] = CatPipe(repo_dir)
    return pipe
1408
1409
def close_catpipes():
    """Remove every CatPipe from _cp, closing each one and waiting for
    it."""
    # FIXME: chain exceptions
    while _cp:
        pipe = _cp.popitem()[1]
        pipe.close(wait=True)
1415
1416
def tags(repo_dir=None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]},
    with the refs/tags/ prefix removed from each name."""
    names_by_oid = {}
    for refname, oid in list_refs(repo_dir=repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        # more than one tag can point at the same hash
        names_by_oid.setdefault(oid, []).append(refname[10:])
    return names_by_oid
1427
1428
class MissingObject(KeyError):
    """KeyError for an object id that could not be found; the binary
    id is kept in the oid attribute."""
    def __init__(self, oid):
        message = 'object %r is missing' % hexlify(oid)
        super(MissingObject, self).__init__(message)
        self.oid = oid
1433
1434
# Record yielded by walk_object() for each object it visits: oid is
# the binary object id, type is b'blob', b'commit' or b'tree', mode is
# the tree-entry mode (None when not known), and data is the object's
# content or None.
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, e.g. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1447
1448
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem.  Any object for which
    stop_at(oidx) returns true is skipped: it is not yielded and
    nothing is followed from it.  Throw MissingObject if a hash
    encountered is missing from the repository, and don't read or
    return blob content in the data field unless include_data is set.
    Commit and tree content is always read, since it is needed to
    find further objects, but it is only returned in the data field
    when include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (oidx, path, chunk_path, mode)
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        # The first item is the (oidx, type, size) header
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            # Pushed last, so the commit's tree is popped before its parents
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the path stays
                    # fixed and only the chunk_path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))