1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
14 from bup.compat import (buffer,
15 byte_int, bytes_from_byte, bytes_from_uint,
21 from bup.io import path_msg
22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
29 mmap_read, mmap_readwrite,
30 progress, qprogress, stat_if_exists,
36 repodir = None # The default repository, once initialized
# Git pack object type codes: commit=1, tree=2, blob=3, tag=4.
_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
# Inverse mapping: type code -> git object type name.
_typermap = {v: k for k, v in items(_typemap)}
46 class GitError(Exception):
def _gitenv(repo_dir=None):
    """Return the process environment with GIT_DIR set to repo_dir.

    When repo_dir is not provided, fall back to the default repository
    (repo()).  Without this fallback os.path.abspath(None) would raise.
    """
    if not repo_dir:
        repo_dir = repo()
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
55 def _git_wait(cmd, p):
58 raise GitError('%r returned %d' % (cmd, rv))
def _git_exo(cmd, **kwargs):
    """Run cmd via helpers.exo() and raise GitError on nonzero exit.

    Presumably returns the exo() result tuple on success -- TODO confirm
    against bup.helpers.exo.
    """
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    # proc: the subprocess handle unpacked from result (assignment elided
    # in this view) -- confirm against the full source.
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
    """Return the value of the given git config option, as bytes.

    opttype may be 'int' or 'bool' to have git canonicalize the value
    (--int/--bool); otherwise it must be None and the raw value is used.
    repo_dir and cfg_file are mutually exclusive.
    """
    assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
    cmd = [b'git', b'config', b'--null']
        cmd.extend([b'--file', cfg_file])
        cmd.extend([b'--int'])
    elif opttype == 'bool':
        cmd.extend([b'--bool'])
        assert opttype is None
    cmd.extend([b'--get', option])
        env = _gitenv(repo_dir=repo_dir)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
    # with --null, git writes out a trailing \0 after the value
    r = p.stdout.read()[:-1]
        # presumably returns int(r) in the 'int' branch above -- confirm
    elif opttype == 'bool':
        # git converts to 'true' or 'false'
    # Any exit status other than "found"/"not found" is an error.
    raise GitError('%r returned %d' % (cmd, rc))
def parse_tz_offset(s):
    """UTC offset in seconds."""
    # s is a git timezone field like b'+0130' or b'-0500': sign, HH, MM.
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        # presumably returns -tz_off for negative offsets -- confirm
def parse_commit_gpgsig(sig):
    """Return the original signature bytes.

    i.e. with the "gpgsig " header and the leading space character on
    each continuation line removed.
    """
    assert sig.startswith(b'gpgsig ')
    # Strip the single leading space git adds to each continuation line.
    return sig.replace(b'\n ', b'\n')
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
# The continuation lines have only one leading space.

# Regex building blocks for parsing a commit object's header fields.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
               _start_end_char, _content_char, _start_end_char)
# Timezone field: sign, two digits of hours, two digits of minutes.
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
# NOTE(review): the field list presumably continues with 'gpgsig' and
# 'message' -- confirm against the full source.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse the raw bytes of a git commit object into a CommitInfo.

    Raises Exception when content does not match the expected commit
    layout (_commit_rx); the visible code raised unconditionally because
    the no-match guard was missing.
    """
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      gpgsig=parse_commit_gpgsig(matches['gpgsig']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume cat_iterator and return its concatenated data bytes.

    The iterator's first item must be an (oidx, type, size) header
    triple; an Exception is raised unless its type equals expected_type.
    The remaining items are data chunks, joined into one bytes value.
    """
    _, actual_type, _ = next(cat_iterator)
    if actual_type != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, actual_type))
    chunks = list(cat_iterator)
    return b''.join(chunks)
def get_commit_items(id, cp):
    """Return the parsed CommitInfo for commit id, read via cat-pipe cp."""
    commit_it = cp.get(id)
    raw = get_cat_data(commit_it, b'commit')
    return parse_commit(raw)
def _local_git_date_str(epoch_sec):
    """Return b'<epoch_sec> <offset>' using the local UTC offset."""
    offset = utc_offset_str(epoch_sec)
    return b'%d %s' % (epoch_sec, offset)
def _git_date_str(epoch_sec, tz_offset_sec):
    """Return b'<epoch_sec> <+/-HHMM>' for the given offset in seconds."""
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        b'+' if offs >= 0 else b'-',
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    # Fall back to the module-level default repository.
    repo_dir = repo_dir or repodir
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
    return os.path.join(repo_dir, sub)
213 re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
216 return _shorten_hash_rx.sub(br'\1\2*\3', s)
220 full = os.path.abspath(path)
221 fullrepo = os.path.abspath(repo(b''))
222 if not fullrepo.endswith(b'/'):
224 if full.startswith(fullrepo):
225 path = full[len(fullrepo):]
226 if path.startswith(b'index-cache/'):
227 path = path[len(b'index-cache/'):]
228 return shorten_hash(path)
def auto_midx(objdir):
    """Run 'bup midx --auto' then 'bup bloom' on objdir, reporting
    failures via add_error() instead of raising."""
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    # NOTE(review): the open(os.devnull) handle is never closed; consider
    # subprocess.DEVNULL or a with-statement.
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # A regular file stored as a git tree: it was chunked.
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        # name[:-1] check also catches names ending '.bup?' (e.g. '.bupm')
        # -- presumably to avoid colliding with mangled names; confirm.
        return name + b'.bupl'
# Demangle result modes (see demangle_name below).
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        # .bupm entries: chunked when they name a directory tree.
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes b'<type> <len>\0' + content.
    header = b'%s %d\0' % (type, len(content))
def shalist_item_sort_key(ent):
    """Sort key for (mode, name, id) tree entries, matching git's tree
    entry ordering (directories sort as if named with a trailing '/')."""
    (mode, name, id) = ent
    assert(mode+0 == mode)  # mode must be an integer
    if stat.S_ISDIR(mode):
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    # Git requires entries in its own sort order.
    shalist = sorted(shalist, key = shalist_item_sort_key)
    for (mode,name,bin) in shalist:
        assert(mode+0 == mode)   # integer mode
        assert(len(bin) == 20)   # raw sha1 bytes, not hex
        s = b'%o %s\0%s' % (mode,name,bin)
        # NOTE(review): on Python 3, s[0] is an int, so comparing it with
        # b'0' is always true -- this check may be ineffective; confirm.
        assert s[0] != b'0' # 0-padded octal is not acceptable in a git tree
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    while ofs < len(buf):
        # Each entry is b'<octal mode> <name>\0<20-byte sha>'.
        z = buf.find(b'\0', ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        sha = buf[z+1:z+1+20]
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    """Yield the pack encoding of content: a variable-length size header
    (type code in the high nibble of the first byte) followed by the
    zlib-compressed data."""
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
        szbits = (sz & 0x0f) | (_typemap[type]<<4)
        if sz: szbits |= 0x80   # continuation bit: more size bytes follow
        szout += bytes_from_uint(szbits)
    z = zlib.compressobj(compression_level)
    yield z.compress(content)
def _decode_packobj(buf):
    """Return (type, uncompressed content) for a pack-encoded object in buf."""
    # High nibble of the first byte holds the type code.
    type = _typermap[(c & 0x70) >> 4]
        # Accumulate 7 size bits per continuation byte.
        sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        # Presumably guarded by an idx-is-not-None check (elided here);
        # confirm against the full source.
        return self._ofs_from_idx(idx)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            # With want_source, return this index's basename (truthy).
            return want_source and os.path.basename(self.name) or True
    def _idx_from_hash(self, hash):
        """Binary-search the sha table for hash; return its entry index."""
        global _total_searches, _total_steps
        assert(len(hash) == 20)  # raw sha1 bytes, not hex
        b1 = byte_int(hash[0])
        # fanout[b] counts hashes whose first byte is <= b, so the
        # candidate range for first byte b1 is [fanout[b1-1], fanout[b1]).
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
            mid = start + (end - start) // 2
            v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack('!256I', self.map))
        self.fanout.append(0) # entry "-1"
        self.nsha = self.fanout[255]   # total object count
        self.sha_ofs = 256 * 4         # entries start right after the fanout
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):

        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        # V1 entries are 24 bytes: 4-byte offset then 20-byte sha.
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4   # skip the 4-byte offset
        return self.map[ofs : ofs + 20]

        # Iterate all shas; each entry's sha begins 4 bytes in.
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

        if self.map is not None:
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # V2 magic (\377tOc) plus version number 2.
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # V2 layout after the shas: 4-byte crcs, then 4-byte offsets,
        # then the 8-byte offset table for large packs.
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):

        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
            # High bit set: value indexes the 64-bit offset table instead.
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

        if self.map is not None:
518 def __init__(self, dir, ignore_midx=False):
520 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
525 self.do_bloom = False
527 self.ignore_midx = ignore_midx
533 assert(_mpi_count == 0)
536 return iter(idxmerge(self.packs))
539 return sum(len(pack) for pack in self.packs)
541 def exists(self, hash, want_source=False):
542 """Return nonempty if the object exists in the index files."""
543 global _total_searches
545 if hash in self.also:
547 if self.do_bloom and self.bloom:
548 if self.bloom.exists(hash):
549 self.do_bloom = False
551 _total_searches -= 1 # was counted by bloom
553 for i in range(len(self.packs)):
555 _total_searches -= 1 # will be incremented by sub-pack
556 ix = p.exists(hash, want_source=want_source)
558 # reorder so most recently used packs are searched first
559 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
564 def refresh(self, skip_midx = False):
565 """Refresh the index list.
566 This method verifies if .midx files were superseded (e.g. all of its
567 contents are in another, bigger .midx file) and removes the superseded
570 If skip_midx is True, all work on .midx files will be skipped and .midx
571 files will be removed from the list.
573 The instance variable 'ignore_midx' can force this function to
574 always act as if skip_midx was True.
576 if self.bloom is not None:
578 self.bloom = None # Always reopen the bloom as it may have been relaced
579 self.do_bloom = False
580 skip_midx = skip_midx or self.ignore_midx
581 d = dict((p.name, p) for p in self.packs
582 if not skip_midx or not isinstance(p, midx.PackMidx))
583 if os.path.exists(self.dir):
586 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
587 # remove any *.midx files from our list that no longer exist
588 for ix in list(d.values()):
589 if not isinstance(ix, midx.PackMidx):
591 if ix.name in midxes:
596 self.packs.remove(ix)
597 for ix in self.packs:
598 if isinstance(ix, midx.PackMidx):
599 for name in ix.idxnames:
600 d[os.path.join(self.dir, name)] = ix
603 mx = midx.PackMidx(full)
604 (mxd, mxf) = os.path.split(mx.name)
606 for n in mx.idxnames:
607 if not os.path.exists(os.path.join(mxd, n)):
608 log(('warning: index %s missing\n'
610 % (path_msg(n), path_msg(mxf)))
618 midxl.sort(key=lambda ix:
619 (-len(ix), -xstat.stat(ix.name).st_mtime))
622 for sub in ix.idxnames:
623 found = d.get(os.path.join(self.dir, sub))
624 if not found or isinstance(found, PackIdx):
625 # doesn't exist, or exists but not in a midx
630 for name in ix.idxnames:
631 d[os.path.join(self.dir, name)] = ix
632 elif not ix.force_keep:
633 debug1('midx: removing redundant: %s\n'
634 % path_msg(os.path.basename(ix.name)))
637 for full in glob.glob(os.path.join(self.dir, b'*.idx')):
641 except GitError as e:
645 bfull = os.path.join(self.dir, b'bup.bloom')
646 self.packs = list(set(d.values()))
647 self.packs.sort(reverse=True, key=lambda x: len(x))
648 if self.bloom is None and os.path.exists(bfull):
649 self.bloom = bloom.ShaBloom(bfull)
651 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
655 self.bloom, bloom_tmp = None, self.bloom
657 except BaseException as ex:
658 with pending_raise(ex):
662 debug1('PackIdxList: using %d index%s.\n'
663 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
666 """Insert an additional object in the list."""
def open_idx(filename):
    """Open a pack index by name and return the appropriate reader:
    PackIdxV1/PackIdxV2 for .idx files, midx.PackMidx for .midx files."""
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
                return PackIdxV2(filename, f)
            raise GitError('%s: expected idx file version 2, got %d'
                           % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            # No V2 magic: assume a version 1 index.
            return PackIdxV1(filename, f)
        raise GitError('%s: unrecognized idx file header'
                       % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        # Per-step progress callback for merge_iter.
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # Final progress callback (presumably gated on final_progress).
        progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                 % (100, total, total))
    # 10024: merge_iter batch size.
    return merge_iter(idxlist, 10024, pfunc, pfinal)
# Build the raw bytes of a commit object; date_sec values are epoch
# seconds, and a None tz selects the local timezone.
def create_commit_blob(tree, parent,
                       author, adate_sec, adate_tz,
                       committer, cdate_sec, cdate_tz,
    if adate_tz is not None:
        adate_str = _git_date_str(adate_sec, adate_tz)
        adate_str = _local_git_date_str(adate_sec)
    if cdate_tz is not None:
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
        cdate_str = _local_git_date_str(cdate_sec)
    # Assemble header lines; empty values are simply omitted.
    if tree: l.append(b'tree %s' % hexlify(tree))
    if parent: l.append(b'parent %s' % hexlify(parent))
    if author: l.append(b'author %s %s' % (author, adate_str))
    if committer: l.append(b'committer %s %s' % (committer, cdate_str))
726 def _make_objcache():
727 return PackIdxList(repo(b'objects/pack'))
729 # bup-gc assumes that it can disable all PackWriter activities
730 # (bloom/midx/cache) via the constructor and close() arguments.
733 """Writes Git objects inside a pack file."""
734 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
735 run_midx=True, on_pack_finish=None,
736 max_pack_size=None, max_pack_objects=None, repo_dir=None):
737 self.repo_dir = repo_dir or repo()
744 self.objcache_maker = objcache_maker
746 self.compression_level = compression_level
747 self.run_midx=run_midx
748 self.on_pack_finish = on_pack_finish
749 if not max_pack_size:
750 max_pack_size = git_config_get(b'pack.packSizeLimit',
751 repo_dir=self.repo_dir,
753 if not max_pack_size:
754 # larger packs slow down pruning
755 max_pack_size = 1000 * 1000 * 1000
756 self.max_pack_size = max_pack_size
757 # cache memory usage is about 83 bytes per object
758 self.max_pack_objects = max_pack_objects if max_pack_objects \
759 else max(1, self.max_pack_size // 5000)
764 def __exit__(self, type, value, traceback):
765 with pending_raise(value, rethrow=False):
770 objdir = dir = os.path.join(self.repo_dir, b'objects')
771 fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
773 self.file = os.fdopen(fd, 'w+b')
778 self.parentfd = os.open(objdir, os.O_RDONLY)
784 assert name.endswith(b'.pack')
785 self.filename = name[:-5]
786 self.file.write(b'PACK\0\0\0\2\0\0\0\0')
787 self.idx = PackIdxV2Writer()
789 def _raw_write(self, datalist, sha):
792 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
793 # the file never has a *partial* blob. So let's make sure it's
794 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
795 # to our hashsplit algorithm.) f.write() does its own buffering,
796 # but that's okay because we'll flush it in _end().
797 oneblob = b''.join(datalist)
803 crc = zlib.crc32(oneblob) & 0xffffffff
804 self._update_idx(sha, crc, nw)
809 def _update_idx(self, sha, crc, size):
812 self.idx.add(sha, crc, self.file.tell() - size)
814 def _write(self, sha, type, content):
818 sha = calc_hash(type, content)
819 size, crc = self._raw_write(_encode_packobj(type, content,
820 self.compression_level),
822 if self.outbytes >= self.max_pack_size \
823 or self.count >= self.max_pack_objects:
827 def _require_objcache(self):
828 if self.objcache is None and self.objcache_maker:
829 self.objcache = self.objcache_maker()
830 if self.objcache is None:
832 "PackWriter not opened or can't check exists w/o objcache")
834 def exists(self, id, want_source=False):
835 """Return non-empty if an object is found in the object cache."""
836 self._require_objcache()
837 return self.objcache.exists(id, want_source=want_source)
839 def just_write(self, sha, type, content):
840 """Write an object to the pack file without checking for duplication."""
841 self._write(sha, type, content)
842 # If nothing else, gc doesn't have/want an objcache
843 if self.objcache is not None:
844 self.objcache.add(sha)
    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
854 def new_blob(self, blob):
855 """Create a blob object in the pack with the supplied content."""
856 return self.maybe_write(b'blob', blob)
858 def new_tree(self, shalist):
859 """Create a tree object in the pack."""
860 content = tree_encode(shalist)
861 return self.maybe_write(b'tree', content)
863 def new_commit(self, tree, parent,
864 author, adate_sec, adate_tz,
865 committer, cdate_sec, cdate_tz,
867 """Create a commit object in the pack. The date_sec values must be
868 epoch-seconds, and if a tz is None, the local timezone is assumed."""
869 content = create_commit_blob(tree, parent,
870 author, adate_sec, adate_tz,
871 committer, cdate_sec, cdate_tz,
873 return self.maybe_write(b'commit', content)
875 def _end(self, run_midx=True, abort=False):
876 # Ignores run_midx during abort
879 self.file, f = None, self.file
880 self.idx, idx = None, self.idx
881 self.parentfd, pfd, = None, self.parentfd
884 with finalized(pfd, lambda x: x is not None and os.close(x)), \
888 os.unlink(self.filename + b'.pack')
891 # update object count
893 cp = struct.pack('!i', self.count)
897 # calculate the pack sha1sum
900 for b in chunkyreader(f):
902 packbin = sum.digest()
905 fdatasync(f.fileno())
908 idx.write(self.filename + b'.idx', packbin)
909 nameprefix = os.path.join(self.repo_dir,
910 b'objects/pack/pack-' + hexlify(packbin))
911 if os.path.exists(self.filename + b'.map'):
912 os.unlink(self.filename + b'.map')
913 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
914 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
917 auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
918 if self.on_pack_finish:
919 self.on_pack_finish(nameprefix)
923 """Remove the pack file from disk."""
924 self._end(abort=True)
926 def breakpoint(self):
927 """Clear byte and object counts and return the last processed id."""
928 id = self._end(self.run_midx)
929 self.outbytes = self.count = 0
932 def close(self, run_midx=True):
933 """Close the pack file and move it to its definitive path."""
934 return self._end(run_midx=run_midx)
937 class PackIdxV2Writer:
939 self.idx = list(list() for i in range(256))
942 def add(self, sha, crc, offs):
945 self.idx[byte_int(sha[0])].append((sha, crc, offs))
947 def write(self, filename, packbin):
949 for section in self.idx:
950 for entry in section:
951 if entry[2] >= 2**31:
954 # Length: header + fan-out + shas-and-crcs + overflow-offsets
955 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
957 idx_f = open(filename, 'w+b')
959 idx_f.truncate(index_len)
960 fdatasync(idx_f.fileno())
961 idx_map = mmap_readwrite(idx_f, close=False)
963 count = _helpers.write_idx(filename, idx_map, self.idx,
965 assert(count == self.count)
972 idx_f = open(filename, 'a+b')
977 b = idx_f.read(8 + 4*256)
980 for b in chunkyreader(idx_f, 20 * self.count):
983 for b in chunkyreader(idx_f):
985 idx_f.write(idx_sum.digest())
986 fdatasync(idx_f.fileno())
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.
    """
    argv = [b'git', b'show-ref']
        argv.append(b'--heads')
        argv.append(b'--tags')
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    # Each output line is b'<hex sha> <refname>'.
    for d in out.split(b'\n'):
        sha, name = d.split(b' ', 1)
        yield name, unhexlify(sha)
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Take at most two to detect ambiguity without consuming everything.
    l = tuple(islice(refs, 2))
1031 def rev_list_invocation(ref_or_refs, format=None):
1032 if isinstance(ref_or_refs, bytes):
1033 refs = (ref_or_refs,)
1036 argv = [b'git', b'rev-list']
1039 argv.append(b'--pretty=format:' + format)
1041 assert not ref.startswith(b'-')
1047 def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
1048 """Yield information about commits as per "git rev-list". If a format
1049 is not provided, yield one hex hash at a time. If a format is
1050 provided, pass it to rev-list and call parse(git_stdout) for each
1051 commit with the stream positioned just after the rev-list "commit
1052 HASH" header line. When a format is provided yield (oidx,
1053 parse(git_stdout)) for each commit.
1056 assert bool(parse) == bool(format)
1057 p = subprocess.Popen(rev_list_invocation(ref_or_refs,
1059 env=_gitenv(repo_dir),
1060 stdout = subprocess.PIPE,
1063 for line in p.stdout:
1066 line = p.stdout.readline()
1069 if not s.startswith(b'commit '):
1070 raise Exception('unexpected line ' + repr(s))
1073 yield s, parse(p.stdout)
1074 line = p.stdout.readline()
1076 rv = p.wait() # not fatal
1078 raise GitError('git rev-list returned error %d' % rv)
1081 def rev_parse(committish, repo_dir=None):
1082 """Resolve the full hash for 'committish', if it exists.
1084 Should be roughly equivalent to 'git rev-parse'.
1086 Returns the hex value of the hash if it is found, None if 'committish' does
1087 not correspond to anything.
1089 head = read_ref(committish, repo_dir=repo_dir)
1091 debug2("resolved from ref: commit = %s\n" % hexlify(head))
1094 pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))
1096 if len(committish) == 40:
1098 hash = unhexlify(committish)
1108 def update_ref(refname, newval, oldval, repo_dir=None):
1109 """Update a repository reference."""
1112 assert refname.startswith(b'refs/heads/') \
1113 or refname.startswith(b'refs/tags/')
1114 p = subprocess.Popen([b'git', b'update-ref', refname,
1115 hexlify(newval), hexlify(oldval)],
1116 env=_gitenv(repo_dir),
1118 _git_wait(b'git update-ref', p)
1121 def delete_ref(refname, oldvalue=None):
1122 """Delete a repository reference (see git update-ref(1))."""
1123 assert refname.startswith(b'refs/')
1124 oldvalue = [] if not oldvalue else [oldvalue]
1125 p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
1128 _git_wait('git update-ref', p)
1131 def guess_repo(path=None):
1132 """Set the path value in the global variable "repodir".
1133 This makes bup look for an existing bup repository, but not fail if a
1134 repository doesn't exist. Usually, if you are interacting with a bup
1135 repository, you would not be calling this function but using
1136 check_repo_or_die().
1142 repodir = environ.get(b'BUP_DIR')
1144 repodir = os.path.expanduser(b'~/.bup')
1147 def init_repo(path=None):
1148 """Create the Git bare repository for bup in a given path."""
1150 d = repo() # appends a / to the path
1151 parent = os.path.dirname(os.path.dirname(d))
1152 if parent and not os.path.exists(parent):
1153 raise GitError('parent directory "%s" does not exist\n'
1155 if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
1156 raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
1157 p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
1160 _git_wait('git init', p)
1161 # Force the index version configuration in order to ensure bup works
1162 # regardless of the version of the installed Git binary.
1163 p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
1164 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1165 _git_wait('git config', p)
1167 p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
1168 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1169 _git_wait('git config', p)
1172 def check_repo_or_die(path=None):
1173 """Check to see if a bup repository probably exists, and abort if not."""
1176 pst = stat_if_exists(top + b'/objects/pack')
1177 if pst and stat.S_ISDIR(pst.st_mode):
1180 top_st = stat_if_exists(top)
1182 log('error: repository %r does not exist (see "bup help init")\n'
1185 log('error: %s is not a repository\n' % path_msg(top))
def is_suitable_git(ver_str):
    """Classify a 'git --version' output string.

    Returns 'unrecognized' or 'insufficient' for unusable versions;
    presumably returns 'suitable' for acceptable ones (final branches
    not shown here -- confirm against full source).
    """
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        # Anything before 1.5.6 is too old.
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
1211 def require_suitable_git(ver_str=None):
1212 """Raise GitError if the version of git isn't suitable.
1214 Rely on ver_str when provided, rather than invoking the git in the
1219 if _git_great is not None:
1221 if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
1222 in (b'yes', b'true', b'1'):
1226 ver_str, _, _ = _git_exo([b'git', b'--version'])
1227 status = is_suitable_git(ver_str)
1228 if status == 'unrecognized':
1229 raise GitError('Unexpected git --version output: %r' % ver_str)
1230 if status == 'insufficient':
1231 log('error: git version must be at least 1.5.6\n')
1233 if status == 'suitable':
1239 class _AbortableIter:
1240 def __init__(self, it, onabort = None):
1242 self.onabort = onabort
1250 return next(self.it)
1251 except StopIteration as e:
1261 """Abort iteration and call the abortion callback, if needed."""
1272 """Link to 'git cat-file' that is used to retrieve blob data."""
1273 def __init__(self, repo_dir = None):
1274 require_suitable_git()
1275 self.repo_dir = repo_dir
1276 self.p = self.inprogress = None
1278 def close(self, wait=False):
1284 self.inprogress = None
1292 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1293 stdin=subprocess.PIPE,
1294 stdout=subprocess.PIPE,
1297 env=_gitenv(self.repo_dir))
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).
        """
        # Lazily (re)start the 'git cat-file --batch' child if it has never
        # been started or has exited (poll() is None only while it is alive).
        if not self.p or self.p.poll() != None:
        poll_result = self.p.poll()
        assert(poll_result == None)
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        # The batch protocol allows only one outstanding request at a time.
        assert(not self.inprogress)
        # Reject refs that would desynchronize the line-oriented protocol or
        # that git could mistake for an option.
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        # One request: the ref followed by a newline; git replies with a
        # header line, then the raw object bytes.
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        # A header ending in ' missing' means the ref resolves to no object.
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
        # Normal header shape: b'<40-hex-id> <type> <size>'.
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        # Wrap the body reader so an abandoned iteration can trigger cleanup
        # (the abort callback argument is elided in this excerpt).
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
            yield oidx, typ, size
            # git terminates each object's data with a single newline.
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception as e:
    def _join(self, it):
        """Yield the content of every blob reachable from the object in it.

        it must be a stream shaped like get()'s output: an (oidx, type,
        size) header followed by the raw object data.
        """
        # Consume the header that get() yields first; only the type matters.
        _, typ, _ = next(it)
        elif typ == b'tree':
            treefile = b''.join(it)
            # Recurse into each tree entry; join() yields every blob below it.
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
        elif typ == b'commit':
            # The first header line of a commit is b'tree <hex>'; follow it.
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
            raise GitError('invalid object type %r: expected blob/tree/commit'
        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        # _join does the type dispatch and recursion; get() produces the
        # header + data stream it consumes.
        for d in self._join(self.get(id)):
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
        # Fall back to the default (global) repository.
        repo_dir = repodir or repo()
    # Normalize the path so equivalent spellings share one cache entry.
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)  # _cp caches one CatPipe per repository path
        cp = CatPipe(repo_dir)
def close_catpipes():
    """Remove every cached CatPipe from _cp, disposing of each entry."""
    # FIXME: chain exceptions
        # popitem removes the entry even if its teardown then fails.
        _, cp = _cp.popitem()
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        # Only tag refs were requested, so every name carries this prefix.
        assert n.startswith(b'refs/tags/')
        tags[c].append(name) # more than one tag can point at 'c'
class MissingObject(KeyError):
    """Raised when a requested object id is absent from the repository."""
    def __init__(self, oid):
        # oid is the binary (not hex) object id; hexlify it for the message.
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
# Description of a single object encountered while walking a reachability
# graph (see walk_object below).
WalkItem = namedtuple('WalkItem',
                      ['oid', 'type', 'mode', 'path', 'chunk_path', 'data'])
1415 # The path is the mangled path, and if an item represents a fragment
1416 # of a chunked file, the chunk_path will be the chunked subtree path
1417 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1418 # chunked file will have a chunk_path of ['']. So some chunk subtree
1419 # of the file '/foo/bar/baz' might look like this:
#   item.path = [b'foo', b'bar', b'baz.bup']
#   item.chunk_path = [b'', b'2d3115e', b'016b097']
#   item.type = b'tree'
1427 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1428 """Yield everything reachable from oidx via get_ref (which must behave
1429 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1430 returns true. Throw MissingObject if a hash encountered is
1431 missing from the repository, and don't read or return blob content
1432 in the data field unless include_data is set.
1435 # Maintain the pending stack on the heap to avoid stack overflow
1436 pending = [(oidx, [], [], None)]
1438 oidx, parent_path, chunk_path, mode = pending.pop()
1439 oid = unhexlify(oidx)
1440 if stop_at and stop_at(oidx):
1443 if (not include_data) and mode and stat.S_ISREG(mode):
1444 # If the object is a "regular file", then it's a leaf in
1445 # the graph, so we can skip reading the data if the caller
1446 # hasn't requested it.
1447 yield WalkItem(oid=oid, type=b'blob',
1448 chunk_path=chunk_path, path=parent_path,
1453 item_it = get_ref(oidx)
1454 get_oidx, typ, _ = next(item_it)
1456 raise MissingObject(unhexlify(oidx))
1457 if typ not in (b'blob', b'commit', b'tree'):
1458 raise Exception('unexpected repository object type %r' % typ)
1460 # FIXME: set the mode based on the type when the mode is None
1461 if typ == b'blob' and not include_data:
1462 # Dump data until we can ask cat_pipe not to fetch it
1463 for ignored in item_it:
1467 data = b''.join(item_it)
1469 yield WalkItem(oid=oid, type=typ,
1470 chunk_path=chunk_path, path=parent_path,
1472 data=(data if include_data else None))
1474 if typ == b'commit':
1475 commit_items = parse_commit(data)
1476 for pid in commit_items.parents:
1477 pending.append((pid, parent_path, chunk_path, mode))
1478 pending.append((commit_items.tree, parent_path, chunk_path,
1479 hashsplit.GIT_MODE_TREE))
1480 elif typ == b'tree':
1481 for mode, name, ent_id in tree_decode(data):
1482 demangled, bup_type = demangle_name(name, mode)
1484 sub_path = parent_path
1485 sub_chunk_path = chunk_path + [name]
1487 sub_path = parent_path + [name]
1488 if bup_type == BUP_CHUNKED:
1489 sub_chunk_path = [b'']
1491 sub_chunk_path = chunk_path
1492 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,