lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12
  13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  14 from bup.compat import (buffer,
  15                         byte_int, bytes_from_byte, bytes_from_uint,
  16                         environ,
  17                         items,
  18                         pending_raise,
  19                         range,
  20                         reraise)
  21 from bup.io import path_msg
  22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  23                          exo,
  24                          fdatasync,
  25                          log,
  26                          merge_dict,
  27                          merge_iter,
  28                          mmap_read, mmap_readwrite,
  29                          progress, qprogress, stat_if_exists,
  30                          unlink,
  31                          utc_offset_str)
  32
  33
# Verbosity flag; when nonzero, PackWriter._write() logs a '>' per object.
verbose = 0
repodir = None  # The default repository, once initialized

# Numeric type codes stored in pack object headers, keyed by type name.
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
# Inverse of _typemap: numeric type code -> type name.
_typermap = {v: k for k, v in items(_typemap)}


# Lookup statistics, updated by the index search methods in this module.
_total_searches = 0
_total_steps = 0
  43
  44
  45 class GitError(Exception):
  46     pass
  47
  48
  49 def _gitenv(repo_dir=None):
  50     if not repo_dir:
  51         repo_dir = repo()
  52     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  53
  54 def _git_wait(cmd, p):
  55     rv = p.wait()
  56     if rv != 0:
  57         raise GitError('%r returned %d' % (cmd, rv))
  58
  59 def _git_exo(cmd, **kwargs):
  60     kwargs['check'] = False
  61     result = exo(cmd, **kwargs)
  62     _, _, proc = result
  63     if proc.returncode != 0:
  64         raise GitError('%r returned %d' % (cmd, proc.returncode))
  65     return result
  66
  67 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  68     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  69     cmd = [b'git', b'config', b'--null']
  70     if cfg_file:
  71         cmd.extend([b'--file', cfg_file])
  72     if opttype == 'int':
  73         cmd.extend([b'--int'])
  74     elif opttype == 'bool':
  75         cmd.extend([b'--bool'])
  76     else:
  77         assert opttype is None
  78     cmd.extend([b'--get', option])
  79     env=None
  80     if repo_dir:
  81         env = _gitenv(repo_dir=repo_dir)
  82     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  83                          close_fds=True)
  84     # with --null, git writes out a trailing \0 after the value
  85     r = p.stdout.read()[:-1]
  86     rc = p.wait()
  87     if rc == 0:
  88         if opttype == 'int':
  89             return int(r)
  90         elif opttype == 'bool':
  91             # git converts to 'true' or 'false'
  92             return r == b'true'
  93         return r
  94     if rc != 1:
  95         raise GitError('%r returned %d' % (cmd, rc))
  96     return None
  97
  98
  99 def parse_tz_offset(s):
 100     """UTC offset in seconds."""
 101     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 102     if bytes_from_byte(s[0]) == b'-':
 103         return - tz_off
 104     return tz_off
 105
 106 def parse_commit_gpgsig(sig):
 107     """Return the original signature bytes.
 108
 109     i.e. with the "gpgsig " header and the leading space character on
 110     each continuation line removed.
 111
 112     """
 113     if not sig:
 114         return None
 115     assert sig.startswith(b'gpgsig ')
 116     sig = sig[7:]
 117     return sig.replace(b'\n ', b'\n')
 118
 119 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
 120 # Make sure that's authoritative.
 121
 122 # See also
 123 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
 124 # The continuation lines have only one leading space.
 125
# Character allowed at either end of a name/mail field.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# Character allowed in the interior of a name/mail field.
_content_char = br'[^\0\n<>]'
# A field is either one or two "end" characters, or an end character,
# any run of content characters, and a closing end character.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Sign, two hour digits, two minute digits (first minute digit 0-5).
_tz_rx = br'[-+]\d\d[0-5]\d'
# One "parent <40 hex digits>" header line.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Whole-commit pattern: tree, zero or more parents, author, committer,
# optional mergetag, optional gpgsig block, then the message.  The named
# groups are the ones parse_commit() reads.
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts the hex id from each parent line in the 'parents' group.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 145
 146 # Note that the author_sec and committer_sec values are (UTC) epoch
 147 # seconds, and for now the mergetag is not included.
 148 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 149                                        'author_name', 'author_mail',
 150                                        'author_sec', 'author_offset',
 151                                        'committer_name', 'committer_mail',
 152                                        'committer_sec', 'committer_offset',
 153                                        'gpgsig',
 154                                        'message'])
 155
 156 def parse_commit(content):
 157     commit_match = re.match(_commit_rx, content)
 158     if not commit_match:
 159         raise Exception('cannot parse commit %r' % content)
 160     matches = commit_match.groupdict()
 161     return CommitInfo(tree=matches['tree'],
 162                       parents=re.findall(_parent_hash_rx, matches['parents']),
 163                       author_name=matches['author_name'],
 164                       author_mail=matches['author_mail'],
 165                       author_sec=int(matches['asec']),
 166                       author_offset=parse_tz_offset(matches['atz']),
 167                       committer_name=matches['committer_name'],
 168                       committer_mail=matches['committer_mail'],
 169                       committer_sec=int(matches['csec']),
 170                       committer_offset=parse_tz_offset(matches['ctz']),
 171                       gpgsig=parse_commit_gpgsig(matches['gpgsig']),
 172                       message=matches['message'])
 173
 174
 175 def get_cat_data(cat_iterator, expected_type):
 176     _, kind, _ = next(cat_iterator)
 177     if kind != expected_type:
 178         raise Exception('expected %r, saw %r' % (expected_type, kind))
 179     return b''.join(cat_iterator)
 180
 181 def get_commit_items(id, cp):
 182     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 183
 184 def _local_git_date_str(epoch_sec):
 185     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 186
 187
 188 def _git_date_str(epoch_sec, tz_offset_sec):
 189     offs =  tz_offset_sec // 60
 190     return b'%d %s%02d%02d' \
 191         % (epoch_sec,
 192            b'+' if offs >= 0 else b'-',
 193            abs(offs) // 60,
 194            abs(offs) % 60)
 195
 196
 197 def repo(sub = b'', repo_dir=None):
 198     """Get the path to the git repository or one of its subdirectories."""
 199     repo_dir = repo_dir or repodir
 200     if not repo_dir:
 201         raise GitError('You should call check_repo_or_die()')
 202
 203     # If there's a .git subdirectory, then the actual repo is in there.
 204     gd = os.path.join(repo_dir, b'.git')
 205     if os.path.exists(gd):
 206         repo_dir = gd
 207
 208     return os.path.join(repo_dir, sub)
 209
 210
 211 _shorten_hash_rx = \
 212     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 213
 214 def shorten_hash(s):
 215     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 216
 217
 218 def repo_rel(path):
 219     full = os.path.abspath(path)
 220     fullrepo = os.path.abspath(repo(b''))
 221     if not fullrepo.endswith(b'/'):
 222         fullrepo += b'/'
 223     if full.startswith(fullrepo):
 224         path = full[len(fullrepo):]
 225     if path.startswith(b'index-cache/'):
 226         path = path[len(b'index-cache/'):]
 227     return shorten_hash(path)
 228
 229
 230 def auto_midx(objdir):
 231     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 232     try:
 233         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 234     except OSError as e:
 235         # make sure 'args' gets printed to help with debugging
 236         add_error('%r: exception: %s' % (args, e))
 237         raise
 238     if rv:
 239         add_error('%r: returned %d' % (args, rv))
 240
 241     args = [path.exe(), b'bloom', b'--dir', objdir]
 242     try:
 243         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 244     except OSError as e:
 245         # make sure 'args' gets printed to help with debugging
 246         add_error('%r: exception: %s' % (args, e))
 247         raise
 248     if rv:
 249         add_error('%r: returned %d' % (args, rv))
 250
 251
 252 def mangle_name(name, mode, gitmode):
 253     """Mangle a file name to present an abstract name for segmented files.
 254     Mangled file names will have the ".bup" extension added to them. If a
 255     file's name already ends with ".bup", a ".bupl" extension is added to
 256     disambiguate normal files from segmented ones.
 257     """
 258     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 259         assert(stat.S_ISDIR(gitmode))
 260         return name + b'.bup'
 261     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 262         return name + b'.bupl'
 263     else:
 264         return name
 265
 266
 267 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 268 def demangle_name(name, mode):
 269     """Remove name mangling from a file name, if necessary.
 270
 271     The return value is a tuple (demangled_filename,mode), where mode is one of
 272     the following:
 273
 274     * BUP_NORMAL  : files that should be read as-is from the repository
 275     * BUP_CHUNKED : files that were chunked and need to be reassembled
 276
 277     For more information on the name mangling algorithm, see mangle_name()
 278     """
 279     if name.endswith(b'.bupl'):
 280         return (name[:-5], BUP_NORMAL)
 281     elif name.endswith(b'.bup'):
 282         return (name[:-4], BUP_CHUNKED)
 283     elif name.endswith(b'.bupm'):
 284         return (name[:-5],
 285                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 286     return (name, BUP_NORMAL)
 287
 288
 289 def calc_hash(type, content):
 290     """Calculate some content's hash in the Git fashion."""
 291     header = b'%s %d\0' % (type, len(content))
 292     sum = Sha1(header)
 293     sum.update(content)
 294     return sum.digest()
 295
 296
 297 def shalist_item_sort_key(ent):
 298     (mode, name, id) = ent
 299     assert(mode+0 == mode)
 300     if stat.S_ISDIR(mode):
 301         return name + b'/'
 302     else:
 303         return name
 304
 305
 306 def tree_encode(shalist):
 307     """Generate a git tree object from (mode,name,hash) tuples."""
 308     shalist = sorted(shalist, key = shalist_item_sort_key)
 309     l = []
 310     for (mode,name,bin) in shalist:
 311         assert(mode)
 312         assert(mode+0 == mode)
 313         assert(name)
 314         assert(len(bin) == 20)
 315         s = b'%o %s\0%s' % (mode,name,bin)
 316         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 317         l.append(s)
 318     return b''.join(l)
 319
 320
 321 def tree_decode(buf):
 322     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 323     ofs = 0
 324     while ofs < len(buf):
 325         z = buf.find(b'\0', ofs)
 326         assert(z > ofs)
 327         spl = buf[ofs:z].split(b' ', 1)
 328         assert(len(spl) == 2)
 329         mode,name = spl
 330         sha = buf[z+1:z+1+20]
 331         ofs = z+1+20
 332         yield (int(mode, 8), name, sha)
 333
 334
 335 def _encode_packobj(type, content, compression_level=1):
 336     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 337         raise ValueError('invalid compression level %s' % compression_level)
 338     szout = b''
 339     sz = len(content)
 340     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 341     sz >>= 4
 342     while 1:
 343         if sz: szbits |= 0x80
 344         szout += bytes_from_uint(szbits)
 345         if not sz:
 346             break
 347         szbits = sz & 0x7f
 348         sz >>= 7
 349     z = zlib.compressobj(compression_level)
 350     yield szout
 351     yield z.compress(content)
 352     yield z.flush()
 353
 354
 355 def _decode_packobj(buf):
 356     assert(buf)
 357     c = byte_int(buf[0])
 358     type = _typermap[(c & 0x70) >> 4]
 359     sz = c & 0x0f
 360     shift = 4
 361     i = 0
 362     while c & 0x80:
 363         i += 1
 364         c = byte_int(buf[i])
 365         sz |= (c & 0x7f) << shift
 366         shift += 7
 367         if not (c & 0x80):
 368             break
 369     return (type, zlib.decompress(buf[i+1:]))
 370
 371
 372 class PackIdx:
 373     def __init__(self):
 374         assert(0)
 375
 376     def find_offset(self, hash):
 377         """Get the offset of an object inside the index file."""
 378         idx = self._idx_from_hash(hash)
 379         if idx != None:
 380             return self._ofs_from_idx(idx)
 381         return None
 382
 383     def exists(self, hash, want_source=False):
 384         """Return nonempty if the object exists in this index."""
 385         if hash and (self._idx_from_hash(hash) != None):
 386             return want_source and os.path.basename(self.name) or True
 387         return None
 388
 389     def _idx_from_hash(self, hash):
 390         global _total_searches, _total_steps
 391         _total_searches += 1
 392         assert(len(hash) == 20)
 393         b1 = byte_int(hash[0])
 394         start = self.fanout[b1-1] # range -1..254
 395         end = self.fanout[b1] # range 0..255
 396         want = hash
 397         _total_steps += 1  # lookup table is a step
 398         while start < end:
 399             _total_steps += 1
 400             mid = start + (end - start) // 2
 401             v = self._idx_to_hash(mid)
 402             if v < want:
 403                 start = mid+1
 404             elif v > want:
 405                 end = mid
 406             else: # got it!
 407                 return mid
 408         return None
 409
 410
 411 class PackIdxV1(PackIdx):
 412     """Object representation of a Git pack index (version 1) file."""
 413     def __init__(self, filename, f):
 414         self.name = filename
 415         self.idxnames = [self.name]
 416         self.map = mmap_read(f)
 417         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 418         self.fanout = array('L', struct.unpack('!256I', self.map))
 419         self.fanout.append(0)  # entry "-1"
 420         self.nsha = self.fanout[255]
 421         self.sha_ofs = 256 * 4
 422         # Avoid slicing shatable for individual hashes (very high overhead)
 423         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 424
 425     def __enter__(self):
 426         return self
 427
 428     def __exit__(self, type, value, traceback):
 429         with pending_raise(value, rethrow=False):
 430             self.close()
 431
 432     def __len__(self):
 433         return int(self.nsha)  # int() from long for python 2
 434
 435     def _ofs_from_idx(self, idx):
 436         if idx >= self.nsha or idx < 0:
 437             raise IndexError('invalid pack index index %d' % idx)
 438         ofs = self.sha_ofs + idx * 24
 439         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 440
 441     def _idx_to_hash(self, idx):
 442         if idx >= self.nsha or idx < 0:
 443             raise IndexError('invalid pack index index %d' % idx)
 444         ofs = self.sha_ofs + idx * 24 + 4
 445         return self.map[ofs : ofs + 20]
 446
 447     def __iter__(self):
 448         start = self.sha_ofs + 4
 449         for ofs in range(start, start + 24 * self.nsha, 24):
 450             yield self.map[ofs : ofs + 20]
 451
 452     def close(self):
 453         if self.map is not None:
 454             self.shatable = None
 455             self.map.close()
 456             self.map = None
 457
 458
 459 class PackIdxV2(PackIdx):
 460     """Object representation of a Git pack index (version 2) file."""
 461     def __init__(self, filename, f):
 462         self.name = filename
 463         self.idxnames = [self.name]
 464         self.map = mmap_read(f)
 465         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 466         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 467         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 468         self.fanout.append(0)
 469         self.nsha = self.fanout[255]
 470         self.sha_ofs = 8 + 256*4
 471         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 472         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 473         # Avoid slicing this for individual hashes (very high overhead)
 474         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 475
 476     def __enter__(self):
 477         return self
 478
 479     def __exit__(self, type, value, traceback):
 480         with pending_raise(value, rethrow=False):
 481             self.close()
 482
 483     def __len__(self):
 484         return int(self.nsha)  # int() from long for python 2
 485
 486     def _ofs_from_idx(self, idx):
 487         if idx >= self.nsha or idx < 0:
 488             raise IndexError('invalid pack index index %d' % idx)
 489         ofs_ofs = self.ofstable_ofs + idx * 4
 490         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 491         if ofs & 0x80000000:
 492             idx64 = ofs & 0x7fffffff
 493             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 494             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 495         return ofs
 496
 497     def _idx_to_hash(self, idx):
 498         if idx >= self.nsha or idx < 0:
 499             raise IndexError('invalid pack index index %d' % idx)
 500         ofs = self.sha_ofs + idx * 20
 501         return self.map[ofs : ofs + 20]
 502
 503     def __iter__(self):
 504         start = self.sha_ofs
 505         for ofs in range(start, start + 20 * self.nsha, 20):
 506             yield self.map[ofs : ofs + 20]
 507
 508     def close(self):
 509         if self.map is not None:
 510             self.shatable = None
 511             self.map.close()
 512             self.map = None
 513
 514
 515 _mpi_count = 0
 516 class PackIdxList:
 517     def __init__(self, dir, ignore_midx=False):
 518         global _mpi_count
 519         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 520         _mpi_count += 1
 521         self.dir = dir
 522         self.also = set()
 523         self.packs = []
 524         self.do_bloom = False
 525         self.bloom = None
 526         self.ignore_midx = ignore_midx
 527         self.refresh()
 528
 529     def __del__(self):
 530         global _mpi_count
 531         _mpi_count -= 1
 532         assert(_mpi_count == 0)
 533
 534     def __iter__(self):
 535         return iter(idxmerge(self.packs))
 536
 537     def __len__(self):
 538         return sum(len(pack) for pack in self.packs)
 539
 540     def exists(self, hash, want_source=False):
 541         """Return nonempty if the object exists in the index files."""
 542         global _total_searches
 543         _total_searches += 1
 544         if hash in self.also:
 545             return True
 546         if self.do_bloom and self.bloom:
 547             if self.bloom.exists(hash):
 548                 self.do_bloom = False
 549             else:
 550                 _total_searches -= 1  # was counted by bloom
 551                 return None
 552         for i in range(len(self.packs)):
 553             p = self.packs[i]
 554             _total_searches -= 1  # will be incremented by sub-pack
 555             ix = p.exists(hash, want_source=want_source)
 556             if ix:
 557                 # reorder so most recently used packs are searched first
 558                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 559                 return ix
 560         self.do_bloom = True
 561         return None
 562
 563     def refresh(self, skip_midx = False):
 564         """Refresh the index list.
 565         This method verifies if .midx files were superseded (e.g. all of its
 566         contents are in another, bigger .midx file) and removes the superseded
 567         files.
 568
 569         If skip_midx is True, all work on .midx files will be skipped and .midx
 570         files will be removed from the list.
 571
 572         The instance variable 'ignore_midx' can force this function to
 573         always act as if skip_midx was True.
 574         """
 575         if self.bloom is not None:
 576             self.bloom.close()
 577         self.bloom = None # Always reopen the bloom as it may have been relaced
 578         self.do_bloom = False
 579         skip_midx = skip_midx or self.ignore_midx
 580         d = dict((p.name, p) for p in self.packs
 581                  if not skip_midx or not isinstance(p, midx.PackMidx))
 582         if os.path.exists(self.dir):
 583             if not skip_midx:
 584                 midxl = []
 585                 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
 586                 # remove any *.midx files from our list that no longer exist
 587                 for ix in list(d.values()):
 588                     if not isinstance(ix, midx.PackMidx):
 589                         continue
 590                     if ix.name in midxes:
 591                         continue
 592                     # remove the midx
 593                     del d[ix.name]
 594                     ix.close()
 595                     self.packs.remove(ix)
 596                 for ix in self.packs:
 597                     if isinstance(ix, midx.PackMidx):
 598                         for name in ix.idxnames:
 599                             d[os.path.join(self.dir, name)] = ix
 600                 for full in midxes:
 601                     if not d.get(full):
 602                         mx = midx.PackMidx(full)
 603                         (mxd, mxf) = os.path.split(mx.name)
 604                         broken = False
 605                         for n in mx.idxnames:
 606                             if not os.path.exists(os.path.join(mxd, n)):
 607                                 log(('warning: index %s missing\n'
 608                                      '  used by %s\n')
 609                                     % (path_msg(n), path_msg(mxf)))
 610                                 broken = True
 611                         if broken:
 612                             mx.close()
 613                             del mx
 614                             unlink(full)
 615                         else:
 616                             midxl.append(mx)
 617                 midxl.sort(key=lambda ix:
 618                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 619                 for ix in midxl:
 620                     any_needed = False
 621                     for sub in ix.idxnames:
 622                         found = d.get(os.path.join(self.dir, sub))
 623                         if not found or isinstance(found, PackIdx):
 624                             # doesn't exist, or exists but not in a midx
 625                             any_needed = True
 626                             break
 627                     if any_needed:
 628                         d[ix.name] = ix
 629                         for name in ix.idxnames:
 630                             d[os.path.join(self.dir, name)] = ix
 631                     elif not ix.force_keep:
 632                         debug1('midx: removing redundant: %s\n'
 633                                % path_msg(os.path.basename(ix.name)))
 634                         ix.close()
 635                         unlink(ix.name)
 636             for full in glob.glob(os.path.join(self.dir, b'*.idx')):
 637                 if not d.get(full):
 638                     try:
 639                         ix = open_idx(full)
 640                     except GitError as e:
 641                         add_error(e)
 642                         continue
 643                     d[full] = ix
 644             bfull = os.path.join(self.dir, b'bup.bloom')
 645             if self.bloom is None and os.path.exists(bfull):
 646                 self.bloom = bloom.ShaBloom(bfull)
 647             self.packs = list(set(d.values()))
 648             self.packs.sort(reverse=True, key=lambda x: len(x))
 649             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 650                 self.do_bloom = True
 651             else:
 652                 self.bloom = None
 653         debug1('PackIdxList: using %d index%s.\n'
 654             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 655
 656     def add(self, hash):
 657         """Insert an additional object in the list."""
 658         self.also.add(hash)
 659
 660
 661 def open_idx(filename):
 662     if filename.endswith(b'.idx'):
 663         f = open(filename, 'rb')
 664         header = f.read(8)
 665         if header[0:4] == b'\377tOc':
 666             version = struct.unpack('!I', header[4:8])[0]
 667             if version == 2:
 668                 return PackIdxV2(filename, f)
 669             else:
 670                 raise GitError('%s: expected idx file version 2, got %d'
 671                                % (path_msg(filename), version))
 672         elif len(header) == 8 and header[0:4] < b'\377tOc':
 673             return PackIdxV1(filename, f)
 674         else:
 675             raise GitError('%s: unrecognized idx file header'
 676                            % path_msg(filename))
 677     elif filename.endswith(b'.midx'):
 678         return midx.PackMidx(filename)
 679     else:
 680         raise GitError('idx filenames must end with .idx or .midx')
 681
 682
 683 def idxmerge(idxlist, final_progress=True):
 684     """Generate a list of all the objects reachable in a PackIdxList."""
 685     def pfunc(count, total):
 686         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 687                   % (count*100.0/total, count, total))
 688     def pfinal(count, total):
 689         if final_progress:
 690             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 691                      % (100, total, total))
 692     return merge_iter(idxlist, 10024, pfunc, pfinal)
 693
 694
 695 def create_commit_blob(tree, parent,
 696                        author, adate_sec, adate_tz,
 697                        committer, cdate_sec, cdate_tz,
 698                        msg):
 699     if adate_tz is not None:
 700         adate_str = _git_date_str(adate_sec, adate_tz)
 701     else:
 702         adate_str = _local_git_date_str(adate_sec)
 703     if cdate_tz is not None:
 704         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 705     else:
 706         cdate_str = _local_git_date_str(cdate_sec)
 707     l = []
 708     if tree: l.append(b'tree %s' % hexlify(tree))
 709     if parent: l.append(b'parent %s' % hexlify(parent))
 710     if author: l.append(b'author %s %s' % (author, adate_str))
 711     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 712     l.append(b'')
 713     l.append(msg)
 714     return b'\n'.join(l)
 715
 716
 717 def _make_objcache():
 718     return PackIdxList(repo(b'objects/pack'))
 719
 720 # bup-gc assumes that it can disable all PackWriter activities
 721 # (bloom/midx/cache) via the constructor and close() arguments.
 722
 723 class PackWriter:
 724     """Writes Git objects inside a pack file."""
 725     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 726                  run_midx=True, on_pack_finish=None,
 727                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 728         self.repo_dir = repo_dir or repo()
 729         self.file = None
 730         self.parentfd = None
 731         self.count = 0
 732         self.outbytes = 0
 733         self.filename = None
 734         self.idx = None
 735         self.objcache_maker = objcache_maker
 736         self.objcache = None
 737         self.compression_level = compression_level
 738         self.run_midx=run_midx
 739         self.on_pack_finish = on_pack_finish
 740         if not max_pack_size:
 741             max_pack_size = git_config_get(b'pack.packSizeLimit',
 742                                            repo_dir=self.repo_dir,
 743                                            opttype='int')
 744             if not max_pack_size:
 745                 # larger packs slow down pruning
 746                 max_pack_size = 1000 * 1000 * 1000
 747         self.max_pack_size = max_pack_size
 748         # cache memory usage is about 83 bytes per object
 749         self.max_pack_objects = max_pack_objects if max_pack_objects \
 750                                 else max(1, self.max_pack_size // 5000)
 751
 752     def __del__(self):
 753         self.close()
 754
 755     def __enter__(self):
 756         return self
 757
 758     def __exit__(self, type, value, traceback):
 759         with pending_raise(value, rethrow=False):
 760             self.close()
 761
 762     def _open(self):
 763         if not self.file:
 764             objdir = dir = os.path.join(self.repo_dir, b'objects')
 765             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 766             try:
 767                 self.file = os.fdopen(fd, 'w+b')
 768             except:
 769                 os.close(fd)
 770                 raise
 771             try:
 772                 self.parentfd = os.open(objdir, os.O_RDONLY)
 773             except:
 774                 f = self.file
 775                 self.file = None
 776                 f.close()
 777                 raise
 778             assert name.endswith(b'.pack')
 779             self.filename = name[:-5]
 780             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 781             self.idx = PackIdxV2Writer()
 782
 783     def _raw_write(self, datalist, sha):
 784         self._open()
 785         f = self.file
 786         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 787         # the file never has a *partial* blob.  So let's make sure it's
 788         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 789         # to our hashsplit algorithm.)  f.write() does its own buffering,
 790         # but that's okay because we'll flush it in _end().
 791         oneblob = b''.join(datalist)
 792         try:
 793             f.write(oneblob)
 794         except IOError as e:
 795             reraise(GitError(e))
 796         nw = len(oneblob)
 797         crc = zlib.crc32(oneblob) & 0xffffffff
 798         self._update_idx(sha, crc, nw)
 799         self.outbytes += nw
 800         self.count += 1
 801         return nw, crc
 802
 803     def _update_idx(self, sha, crc, size):
 804         assert(sha)
 805         if self.idx:
 806             self.idx.add(sha, crc, self.file.tell() - size)
 807
 808     def _write(self, sha, type, content):
 809         if verbose:
 810             log('>')
 811         if not sha:
 812             sha = calc_hash(type, content)
 813         size, crc = self._raw_write(_encode_packobj(type, content,
 814                                                     self.compression_level),
 815                                     sha=sha)
 816         if self.outbytes >= self.max_pack_size \
 817            or self.count >= self.max_pack_objects:
 818             self.breakpoint()
 819         return sha
 820
 821     def breakpoint(self):
 822         """Clear byte and object counts and return the last processed id."""
 823         id = self._end(self.run_midx)
 824         self.outbytes = self.count = 0
 825         return id
 826
 827     def _require_objcache(self):
 828         if self.objcache is None and self.objcache_maker:
 829             self.objcache = self.objcache_maker()
 830         if self.objcache is None:
 831             raise GitError(
 832                     "PackWriter not opened or can't check exists w/o objcache")
 833
 834     def exists(self, id, want_source=False):
 835         """Return non-empty if an object is found in the object cache."""
 836         self._require_objcache()
 837         return self.objcache.exists(id, want_source=want_source)
 838
 839     def just_write(self, sha, type, content):
 840         """Write an object to the pack file without checking for duplication."""
 841         self._write(sha, type, content)
 842         # If nothing else, gc doesn't have/want an objcache
 843         if self.objcache is not None:
 844             self.objcache.add(sha)
 845
 846     def maybe_write(self, type, content):
 847         """Write an object to the pack file if not present and return its id."""
 848         sha = calc_hash(type, content)
 849         if not self.exists(sha):
 850             self._require_objcache()
 851             self.just_write(sha, type, content)
 852         return sha
 853
    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content.

        Delegates to maybe_write(), so the blob is only written if it
        is not already known; returns the id maybe_write() returns.
        """
        return self.maybe_write(b'blob', blob)
 857
 858     def new_tree(self, shalist):
 859         """Create a tree object in the pack."""
 860         content = tree_encode(shalist)
 861         return self.maybe_write(b'tree', content)
 862
 863     def new_commit(self, tree, parent,
 864                    author, adate_sec, adate_tz,
 865                    committer, cdate_sec, cdate_tz,
 866                    msg):
 867         """Create a commit object in the pack.  The date_sec values must be
 868         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 869         content = create_commit_blob(tree, parent,
 870                                      author, adate_sec, adate_tz,
 871                                      committer, cdate_sec, cdate_tz,
 872                                      msg)
 873         return self.maybe_write(b'commit', content)
 874
 875     def abort(self):
 876         """Remove the pack file from disk."""
 877         f = self.file
 878         if f:
 879             pfd = self.parentfd
 880             self.file = None
 881             self.parentfd = None
 882             self.idx = None
 883             try:
 884                 try:
 885                     os.unlink(self.filename + b'.pack')
 886                 finally:
 887                     f.close()
 888             finally:
 889                 if pfd is not None:
 890                     os.close(pfd)
 891
    def _end(self, run_midx=True):
        """Finish the pack being written and install it under objects/pack.

        Patches the object count into the header, appends the SHA-1 of
        the pack contents, writes the matching .idx, renames both files
        to 'pack-<hex sha1>.{pack,idx}', and fsyncs the directory
        descriptor opened by _open().  Optionally refreshes the midx
        files and calls on_pack_finish with the new path prefix.

        Returns the new pack's path prefix (without extension), or None
        if no pack was open.
        """
        f = self.file
        if not f: return None
        # Mark the writer as closed up front so a failure below cannot
        # lead to a second attempt on the same file.
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            # The read loop left the position at EOF, so this appends
            # the checksum as the pack trailer.
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' +  hexlify(packbin))
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        # fsync the directory descriptor opened in _open(), then release it
        # whether or not the fsync succeeded.
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix
 937
    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path.

        run_midx is passed through to _end().  Returns what _end()
        returns: the new pack's path prefix (without extension), or
        None when no pack was open.
        """
        return self._end(run_midx=run_midx)
 941
 942
 943 class PackIdxV2Writer:
 944     def __init__(self):
 945         self.idx = list(list() for i in range(256))
 946         self.count = 0
 947
 948     def add(self, sha, crc, offs):
 949         assert(sha)
 950         self.count += 1
 951         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 952
 953     def write(self, filename, packbin):
 954         ofs64_count = 0
 955         for section in self.idx:
 956             for entry in section:
 957                 if entry[2] >= 2**31:
 958                     ofs64_count += 1
 959
 960         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 961         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 962         idx_map = None
 963         idx_f = open(filename, 'w+b')
 964         try:
 965             idx_f.truncate(index_len)
 966             fdatasync(idx_f.fileno())
 967             idx_map = mmap_readwrite(idx_f, close=False)
 968             try:
 969                 count = _helpers.write_idx(filename, idx_map, self.idx,
 970                                            self.count)
 971                 assert(count == self.count)
 972                 idx_map.flush()
 973             finally:
 974                 idx_map.close()
 975         finally:
 976             idx_f.close()
 977
 978         idx_f = open(filename, 'a+b')
 979         try:
 980             idx_f.write(packbin)
 981             idx_f.seek(0)
 982             idx_sum = Sha1()
 983             b = idx_f.read(8 + 4*256)
 984             idx_sum.update(b)
 985
 986             for b in chunkyreader(idx_f, 20 * self.count):
 987                 idx_sum.update(b)
 988
 989             for b in chunkyreader(idx_f):
 990                 idx_sum.update(b)
 991             idx_f.write(idx_sum.digest())
 992             fdatasync(idx_f.fileno())
 993         finally:
 994             idx_f.close()
 995
 996
 997 def list_refs(patterns=None, repo_dir=None,
 998               limit_to_heads=False, limit_to_tags=False):
 999     """Yield (refname, hash) tuples for all repository refs unless
1000     patterns are specified.  In that case, only include tuples for
1001     refs matching those patterns (cf. git-show-ref(1)).  The limits
1002     restrict the result items to refs/heads or refs/tags.  If both
1003     limits are specified, items from both sources will be included.
1004
1005     """
1006     argv = [b'git', b'show-ref']
1007     if limit_to_heads:
1008         argv.append(b'--heads')
1009     if limit_to_tags:
1010         argv.append(b'--tags')
1011     argv.append(b'--')
1012     if patterns:
1013         argv.extend(patterns)
1014     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
1015                          close_fds=True)
1016     out = p.stdout.read().strip()
1017     rv = p.wait()  # not fatal
1018     if rv:
1019         assert(not out)
1020     if out:
1021         for d in out.split(b'\n'):
1022             sha, name = d.split(b' ', 1)
1023             yield name, unhexlify(sha)
1024
1025
def read_ref(refname, repo_dir = None):
    """Return the binary id that the head matching refname points to.

    Returns None when no head matches; more than one match is treated
    as an error (assertion).
    """
    matches = list_refs(patterns=[refname], repo_dir=repo_dir,
                        limit_to_heads=True)
    # Two items are enough to tell "exactly one" from "ambiguous".
    found = tuple(islice(matches, 2))
    if not found:
        return None
    assert len(found) == 1
    _, oid = found[0]
    return oid
1035
1036
def rev_list_invocation(ref_or_refs, format=None):
    """Return the argv for a 'git rev-list' over one ref or several.

    ref_or_refs is either a single bytes ref or an iterable of them.
    When format is given it is passed as --pretty=format:<format>.
    Refs may not start with '-' so they cannot be taken as options.
    """
    refs = (ref_or_refs,) if isinstance(ref_or_refs, bytes) else ref_or_refs
    argv = [b'git', b'rev-list']
    if format:
        argv += [b'--pretty=format:' + format]
    for ref in refs:
        assert not ref.startswith(b'-')
        argv += [ref]
    argv += [b'--']
    return argv
1051
1052
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".

    Without a format, yield one hex hash at a time.  With a format,
    pass it to rev-list and, for each commit, call parse(git_stdout)
    with the stream positioned just after the rev-list "commit HASH"
    header line, yielding (oidx, parse(git_stdout)).  parse and format
    must be given together.  Raises GitError if rev-list exits with a
    non-zero status.

    """
    assert bool(parse) == bool(format)
    argv = rev_list_invocation(ref_or_refs, format=format)
    proc = subprocess.Popen(argv,
                            env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE,
                            close_fds=True)
    out = proc.stdout
    if format:
        # readline() is used (not iteration) so parse() can consume
        # the commit's body from the same stream in between headers.
        for line in iter(out.readline, b''):
            header = line.strip()
            if not header.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(header))
            oidx = header[7:]
            assert len(oidx) == 40
            yield oidx, parse(out)
    else:
        for line in out:
            yield line.strip()

    status = proc.wait()
    if status:
        raise GitError('git rev-list returned error %d' % status)
1085
1086
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    'committish' is first looked up as a head ref; failing that, a
    40-character value is treated as a hex object id and looked up in
    the pack indexes.

    Returns the binary (not hex) object id if it is found, None if
    'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Not valid hex: Python 2 raises TypeError, Python 3 raises
            # binascii.Error, which is a ValueError.
            return None

        if pL.exists(oid):
            return oid

    return None
1112
1113
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname at newval using 'git update-ref'.

    newval and oldval are binary ids; a false oldval is passed to git
    as an empty string.  Only refs under refs/heads/ or refs/tags/ are
    accepted.
    """
    expected = oldval or b''
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    argv = [b'git', b'update-ref', refname,
            hexlify(newval), hexlify(expected)]
    proc = subprocess.Popen(argv, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', proc)
1125
1126
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1)).

    When oldvalue is given it is passed on to git as the value the
    ref is expected to have.
    """
    assert refname.startswith(b'refs/')
    argv = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', proc)
1135
1136
def guess_repo(path=None):
    """Set the global "repodir" without requiring the repository to exist.

    An explicit path wins; otherwise an already-set repodir is kept,
    then BUP_DIR from the environment is tried, and finally ~/.bup.
    Usually, if you are interacting with a bup repository, you would
    call check_repo_or_die() rather than this function.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1151
1152
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the repository's parent directory does not
    exist, if the target exists but is not a directory, or if one of
    the git commands fails.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    steps = (
        ('git init', [b'git', b'--bare', b'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        ('git config', [b'git', b'config', b'pack.indexVersion', b'2']),
        # Enable the reflog
        ('git config', [b'git', b'config', b'core.logAllRefUpdates',
                        b'true']),
    )
    for label, argv in steps:
        # git's stdout is sent to our stderr.
        p = subprocess.Popen(argv, stdout=sys.stderr, env=_gitenv(),
                             close_fds=True)
        _git_wait(label, p)
1176
1177
def check_repo_or_die(path=None):
    """Exit unless a bup repository probably exists at the chosen path.

    Returns normally when <repo>/objects/pack is a directory.  Exits
    with status 15 when the repository path itself does not exist, and
    with status 14 in every other case.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1193
1194
def is_suitable_git(ver_str):
    """Classify the output of 'git --version'.

    ver_str is the bytes output, expected to start with
    b'git version '.  Returns 'suitable' for version 1.5.6 or later,
    'insufficient' for anything older (including 1.5.6 release
    candidates), and 'unrecognized' when the string cannot be
    interpreted as a git version.
    """
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    ver_str = ver_str[len(prefix):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    # Report an unparsable version to the caller instead of exiting the
    # process from inside a classifier.
    return 'unrecognized'
1214
# Cached result of the version check; None until it has been made.
_git_great = None

def require_suitable_git(ver_str=None):
    """Make sure the available git is recent enough, checking only once.

    Rely on ver_str when provided, rather than invoking the git in the
    path.  Setting BUP_GIT_VERSION_IS_FINE to yes, true or 1 skips the
    check.  Raises GitError when the version string is not recognized
    and exits with status 1 when the version is too old.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1243
1244
1245 class _AbortableIter:
1246     def __init__(self, it, onabort = None):
1247         self.it = it
1248         self.onabort = onabort
1249         self.done = None
1250
1251     def __iter__(self):
1252         return self
1253
1254     def __next__(self):
1255         try:
1256             return next(self.it)
1257         except StopIteration as e:
1258             self.done = True
1259             raise
1260         except:
1261             self.abort()
1262             raise
1263
1264     next = __next__
1265
1266     def abort(self):
1267         """Abort iteration and call the abortion callback, if needed."""
1268         if not self.done:
1269             self.done = True
1270             if self.onabort:
1271                 self.onabort()
1272
1273     def __del__(self):
1274         self.abort()
1275
1276
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    The 'git cat-file --batch' subprocess is started lazily by get()
    and restarted whenever it is found to have exited.  Only one get()
    may be in progress at a time.
    """
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the subprocess, if there is one.

        With wait set, also wait for the subprocess to exit and return
        its exit status.  Returns None when not waiting or when no
        subprocess had been started.
        """
        p = self.p
        if p:
            p.stdout.close()
            p.stdin.close()
        self.p = None
        self.inprogress = None
        # A pipe that was never started (or was already closed) has
        # nothing to wait for.
        if wait and p:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Close any current subprocess and start a fresh cat-file."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Raises GitError if cat-file closes its output unexpectedly or
        answers with a header that is not 'id type size'.

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # The request is a single line on cat-file's stdin, so the ref
        # may not contain line breaks or look like an option.
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if not hdr:
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        # If the data is not read to the end, the stream is left in an
        # unknown position, so aborting closes the subprocess; the next
        # get() then starts a new one.
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self.close)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _join(self, it):
        """Yield the blob data reachable from the object 'it' describes.

        'it' is an iterator as returned by get(): blobs are passed
        through, trees are expanded entry by entry, and commits are
        followed to their tree.
        """
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of a commit is 'tree <hex id>'.
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1377
1378
# CatPipe instances keyed by absolute repository path.
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating and caching it if needed.

    Without repo_dir the default repository is used.
    """
    global _cp, repodir
    key = os.path.abspath(repo_dir or repodir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1392
1393
def close_catpipes():
    """Close every cached CatPipe, waiting for each one, and forget it."""
    # FIXME: chain exceptions
    while _cp:
        _cp.popitem()[1].close(wait=True)
1399
1400
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Tag names are given without the leading 'refs/tags/'.
    """
    by_oid = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        # more than one tag can point at the same object
        by_oid.setdefault(oid, []).append(refname[10:])
    return by_oid
1411
1412
class MissingObject(KeyError):
    """KeyError for an object id that is not in the repository.

    The binary id is available as the oid attribute.
    """
    def __init__(self, oid):
        message = 'object %r is missing' % hexlify(oid)
        super(MissingObject, self).__init__(message)
        self.oid = oid
1417
1418
# Record yielded by walk_object() for each object it visits.
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, e.g. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1431
1432
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    oidx is a hex object id; the oid in each WalkItem is its binary
    form.  An object for which stop_at() returns true is neither
    yielded nor descended into.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (oidx, parent_path, chunk_path, mode).
    pending = [(oidx, [], [], None)]
    while len(pending):
        # pop() takes the most recently pushed entry first.
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        # The first item is the (oidx, type, size) header; the oidx is
        # None when the object does not exist.
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read: their content is
            # needed below to find what they refer to.
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            # Pushed last, so the commit's tree is visited before its
            # parents.
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the file's path
                    # stays fixed and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))