lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12
  13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  14 from bup.compat import (buffer,
  15                         byte_int, bytes_from_byte, bytes_from_uint,
  16                         environ,
  17                         items,
  18                         range,
  19                         reraise)
  20 from bup.io import path_msg
  21 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  22                          exo,
  23                          fdatasync,
  24                          log,
  25                          merge_dict,
  26                          merge_iter,
  27                          mmap_read, mmap_readwrite,
  28                          progress, qprogress, stat_if_exists,
  29                          unlink,
  30                          utc_offset_str)
  31
  32
# Verbosity flag; PackWriter._write() logs a '>' per object when it is set.
verbose = 0
repodir = None  # The default repository, once initialized

# Numeric type codes stored in pack object headers, keyed by git object
# type name, plus the reverse (code -> name) mapping used when decoding.
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}


# Lookup statistics, updated by the pack index search code in this module.
_total_searches = 0
_total_steps = 0
  42
  43
  44 class GitError(Exception):
  45     pass
  46
  47
  48 def _gitenv(repo_dir=None):
  49     if not repo_dir:
  50         repo_dir = repo()
  51     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  52
  53 def _git_wait(cmd, p):
  54     rv = p.wait()
  55     if rv != 0:
  56         raise GitError('%r returned %d' % (cmd, rv))
  57
  58 def _git_exo(cmd, **kwargs):
  59     kwargs['check'] = False
  60     result = exo(cmd, **kwargs)
  61     _, _, proc = result
  62     if proc.returncode != 0:
  63         raise GitError('%r returned %d' % (cmd, proc.returncode))
  64     return result
  65
  66 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  67     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  68     cmd = [b'git', b'config', b'--null']
  69     if cfg_file:
  70         cmd.extend([b'--file', cfg_file])
  71     if opttype == 'int':
  72         cmd.extend([b'--int'])
  73     elif opttype == 'bool':
  74         cmd.extend([b'--bool'])
  75     else:
  76         assert opttype is None
  77     cmd.extend([b'--get', option])
  78     env=None
  79     if repo_dir:
  80         env = _gitenv(repo_dir=repo_dir)
  81     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  82                          close_fds=True)
  83     # with --null, git writes out a trailing \0 after the value
  84     r = p.stdout.read()[:-1]
  85     rc = p.wait()
  86     if rc == 0:
  87         if opttype == 'int':
  88             return int(r)
  89         elif opttype == 'bool':
  90             # git converts to 'true' or 'false'
  91             return r == b'true'
  92         return r
  93     if rc != 1:
  94         raise GitError('%r returned %d' % (cmd, rc))
  95     return None
  96
  97
  98 def parse_tz_offset(s):
  99     """UTC offset in seconds."""
 100     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 101     if bytes_from_byte(s[0]) == b'-':
 102         return - tz_off
 103     return tz_off
 104
 105
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# A byte allowed at the start or end of a name or mail field: anything
# except space, . , : ; < > " ' NUL and newline.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# A byte allowed in the interior of a name or mail field.
_content_char = br'[^\0\n<>]'
# Either one or two start/end bytes, or a longer run that both begins
# and ends with a start/end byte.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Sign, two hour digits, two minute digits (minutes limited to 00-59).
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Whole-commit pattern: the tree line, zero or more parent lines, the
# author and committer lines, an optional mergetag, a blank line, and
# then everything else as the message.
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts each hash from the 'parents' group matched by _commit_rx.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 127
# Parsed form of a commit object, as built by parse_commit().
# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.  The *_offset
# fields are UTC offsets in seconds (see parse_tz_offset()).
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
 136
 137 def parse_commit(content):
 138     commit_match = re.match(_commit_rx, content)
 139     if not commit_match:
 140         raise Exception('cannot parse commit %r' % content)
 141     matches = commit_match.groupdict()
 142     return CommitInfo(tree=matches['tree'],
 143                       parents=re.findall(_parent_hash_rx, matches['parents']),
 144                       author_name=matches['author_name'],
 145                       author_mail=matches['author_mail'],
 146                       author_sec=int(matches['asec']),
 147                       author_offset=parse_tz_offset(matches['atz']),
 148                       committer_name=matches['committer_name'],
 149                       committer_mail=matches['committer_mail'],
 150                       committer_sec=int(matches['csec']),
 151                       committer_offset=parse_tz_offset(matches['ctz']),
 152                       message=matches['message'])
 153
 154
 155 def get_cat_data(cat_iterator, expected_type):
 156     _, kind, _ = next(cat_iterator)
 157     if kind != expected_type:
 158         raise Exception('expected %r, saw %r' % (expected_type, kind))
 159     return b''.join(cat_iterator)
 160
 161 def get_commit_items(id, cp):
 162     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 163
 164 def _local_git_date_str(epoch_sec):
 165     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 166
 167
 168 def _git_date_str(epoch_sec, tz_offset_sec):
 169     offs =  tz_offset_sec // 60
 170     return b'%d %s%02d%02d' \
 171         % (epoch_sec,
 172            b'+' if offs >= 0 else b'-',
 173            abs(offs) // 60,
 174            abs(offs) % 60)
 175
 176
 177 def repo(sub = b'', repo_dir=None):
 178     """Get the path to the git repository or one of its subdirectories."""
 179     repo_dir = repo_dir or repodir
 180     if not repo_dir:
 181         raise GitError('You should call check_repo_or_die()')
 182
 183     # If there's a .git subdirectory, then the actual repo is in there.
 184     gd = os.path.join(repo_dir, b'.git')
 185     if os.path.exists(gd):
 186         repo_dir = gd
 187
 188     return os.path.join(repo_dir, sub)
 189
 190
 191 _shorten_hash_rx = \
 192     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 193
 194 def shorten_hash(s):
 195     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 196
 197
 198 def repo_rel(path):
 199     full = os.path.abspath(path)
 200     fullrepo = os.path.abspath(repo(b''))
 201     if not fullrepo.endswith(b'/'):
 202         fullrepo += b'/'
 203     if full.startswith(fullrepo):
 204         path = full[len(fullrepo):]
 205     if path.startswith(b'index-cache/'):
 206         path = path[len(b'index-cache/'):]
 207     return shorten_hash(path)
 208
 209
 210 def auto_midx(objdir):
 211     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 212     try:
 213         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 214     except OSError as e:
 215         # make sure 'args' gets printed to help with debugging
 216         add_error('%r: exception: %s' % (args, e))
 217         raise
 218     if rv:
 219         add_error('%r: returned %d' % (args, rv))
 220
 221     args = [path.exe(), b'bloom', b'--dir', objdir]
 222     try:
 223         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 224     except OSError as e:
 225         # make sure 'args' gets printed to help with debugging
 226         add_error('%r: exception: %s' % (args, e))
 227         raise
 228     if rv:
 229         add_error('%r: returned %d' % (args, rv))
 230
 231
 232 def mangle_name(name, mode, gitmode):
 233     """Mangle a file name to present an abstract name for segmented files.
 234     Mangled file names will have the ".bup" extension added to them. If a
 235     file's name already ends with ".bup", a ".bupl" extension is added to
 236     disambiguate normal files from segmented ones.
 237     """
 238     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 239         assert(stat.S_ISDIR(gitmode))
 240         return name + b'.bup'
 241     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 242         return name + b'.bupl'
 243     else:
 244         return name
 245
 246
 247 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 248 def demangle_name(name, mode):
 249     """Remove name mangling from a file name, if necessary.
 250
 251     The return value is a tuple (demangled_filename,mode), where mode is one of
 252     the following:
 253
 254     * BUP_NORMAL  : files that should be read as-is from the repository
 255     * BUP_CHUNKED : files that were chunked and need to be reassembled
 256
 257     For more information on the name mangling algorithm, see mangle_name()
 258     """
 259     if name.endswith(b'.bupl'):
 260         return (name[:-5], BUP_NORMAL)
 261     elif name.endswith(b'.bup'):
 262         return (name[:-4], BUP_CHUNKED)
 263     elif name.endswith(b'.bupm'):
 264         return (name[:-5],
 265                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 266     else:
 267         return (name, BUP_NORMAL)
 268
 269
 270 def calc_hash(type, content):
 271     """Calculate some content's hash in the Git fashion."""
 272     header = b'%s %d\0' % (type, len(content))
 273     sum = Sha1(header)
 274     sum.update(content)
 275     return sum.digest()
 276
 277
 278 def shalist_item_sort_key(ent):
 279     (mode, name, id) = ent
 280     assert(mode+0 == mode)
 281     if stat.S_ISDIR(mode):
 282         return name + b'/'
 283     else:
 284         return name
 285
 286
 287 def tree_encode(shalist):
 288     """Generate a git tree object from (mode,name,hash) tuples."""
 289     shalist = sorted(shalist, key = shalist_item_sort_key)
 290     l = []
 291     for (mode,name,bin) in shalist:
 292         assert(mode)
 293         assert(mode+0 == mode)
 294         assert(name)
 295         assert(len(bin) == 20)
 296         s = b'%o %s\0%s' % (mode,name,bin)
 297         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 298         l.append(s)
 299     return b''.join(l)
 300
 301
 302 def tree_decode(buf):
 303     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 304     ofs = 0
 305     while ofs < len(buf):
 306         z = buf.find(b'\0', ofs)
 307         assert(z > ofs)
 308         spl = buf[ofs:z].split(b' ', 1)
 309         assert(len(spl) == 2)
 310         mode,name = spl
 311         sha = buf[z+1:z+1+20]
 312         ofs = z+1+20
 313         yield (int(mode, 8), name, sha)
 314
 315
 316 def _encode_packobj(type, content, compression_level=1):
 317     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 318         raise ValueError('invalid compression level %s' % compression_level)
 319     szout = b''
 320     sz = len(content)
 321     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 322     sz >>= 4
 323     while 1:
 324         if sz: szbits |= 0x80
 325         szout += bytes_from_uint(szbits)
 326         if not sz:
 327             break
 328         szbits = sz & 0x7f
 329         sz >>= 7
 330     z = zlib.compressobj(compression_level)
 331     yield szout
 332     yield z.compress(content)
 333     yield z.flush()
 334
 335
 336 def _decode_packobj(buf):
 337     assert(buf)
 338     c = byte_int(buf[0])
 339     type = _typermap[(c & 0x70) >> 4]
 340     sz = c & 0x0f
 341     shift = 4
 342     i = 0
 343     while c & 0x80:
 344         i += 1
 345         c = byte_int(buf[i])
 346         sz |= (c & 0x7f) << shift
 347         shift += 7
 348         if not (c & 0x80):
 349             break
 350     return (type, zlib.decompress(buf[i+1:]))
 351
 352
 353 class PackIdx:
 354     def __init__(self):
 355         assert(0)
 356
 357     def find_offset(self, hash):
 358         """Get the offset of an object inside the index file."""
 359         idx = self._idx_from_hash(hash)
 360         if idx != None:
 361             return self._ofs_from_idx(idx)
 362         return None
 363
 364     def exists(self, hash, want_source=False):
 365         """Return nonempty if the object exists in this index."""
 366         if hash and (self._idx_from_hash(hash) != None):
 367             return want_source and os.path.basename(self.name) or True
 368         return None
 369
 370     def _idx_from_hash(self, hash):
 371         global _total_searches, _total_steps
 372         _total_searches += 1
 373         assert(len(hash) == 20)
 374         b1 = byte_int(hash[0])
 375         start = self.fanout[b1-1] # range -1..254
 376         end = self.fanout[b1] # range 0..255
 377         want = hash
 378         _total_steps += 1  # lookup table is a step
 379         while start < end:
 380             _total_steps += 1
 381             mid = start + (end - start) // 2
 382             v = self._idx_to_hash(mid)
 383             if v < want:
 384                 start = mid+1
 385             elif v > want:
 386                 end = mid
 387             else: # got it!
 388                 return mid
 389         return None
 390
 391
 392 class PackIdxV1(PackIdx):
 393     """Object representation of a Git pack index (version 1) file."""
 394     def __init__(self, filename, f):
 395         self.name = filename
 396         self.idxnames = [self.name]
 397         self.map = mmap_read(f)
 398         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 399         self.fanout = array('L', struct.unpack('!256I', self.map))
 400         self.fanout.append(0)  # entry "-1"
 401         self.nsha = self.fanout[255]
 402         self.sha_ofs = 256 * 4
 403         # Avoid slicing shatable for individual hashes (very high overhead)
 404         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 405
 406     def __enter__(self):
 407         return self
 408
 409     def __exit__(self, type, value, traceback):
 410         self.close()
 411
 412     def __len__(self):
 413         return int(self.nsha)  # int() from long for python 2
 414
 415     def _ofs_from_idx(self, idx):
 416         if idx >= self.nsha or idx < 0:
 417             raise IndexError('invalid pack index index %d' % idx)
 418         ofs = self.sha_ofs + idx * 24
 419         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 420
 421     def _idx_to_hash(self, idx):
 422         if idx >= self.nsha or idx < 0:
 423             raise IndexError('invalid pack index index %d' % idx)
 424         ofs = self.sha_ofs + idx * 24 + 4
 425         return self.map[ofs : ofs + 20]
 426
 427     def __iter__(self):
 428         start = self.sha_ofs + 4
 429         for ofs in range(start, start + 24 * self.nsha, 24):
 430             yield self.map[ofs : ofs + 20]
 431
 432     def close(self):
 433         if self.map is not None:
 434             self.shatable = None
 435             self.map.close()
 436             self.map = None
 437
 438
 439 class PackIdxV2(PackIdx):
 440     """Object representation of a Git pack index (version 2) file."""
 441     def __init__(self, filename, f):
 442         self.name = filename
 443         self.idxnames = [self.name]
 444         self.map = mmap_read(f)
 445         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 446         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 447         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 448         self.fanout.append(0)
 449         self.nsha = self.fanout[255]
 450         self.sha_ofs = 8 + 256*4
 451         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 452         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 453         # Avoid slicing this for individual hashes (very high overhead)
 454         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 455
 456     def __enter__(self):
 457         return self
 458
 459     def __exit__(self, type, value, traceback):
 460         self.close()
 461
 462     def __len__(self):
 463         return int(self.nsha)  # int() from long for python 2
 464
 465     def _ofs_from_idx(self, idx):
 466         if idx >= self.nsha or idx < 0:
 467             raise IndexError('invalid pack index index %d' % idx)
 468         ofs_ofs = self.ofstable_ofs + idx * 4
 469         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 470         if ofs & 0x80000000:
 471             idx64 = ofs & 0x7fffffff
 472             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 473             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 474         return ofs
 475
 476     def _idx_to_hash(self, idx):
 477         if idx >= self.nsha or idx < 0:
 478             raise IndexError('invalid pack index index %d' % idx)
 479         ofs = self.sha_ofs + idx * 20
 480         return self.map[ofs : ofs + 20]
 481
 482     def __iter__(self):
 483         start = self.sha_ofs
 484         for ofs in range(start, start + 20 * self.nsha, 20):
 485             yield self.map[ofs : ofs + 20]
 486
 487     def close(self):
 488         if self.map is not None:
 489             self.shatable = None
 490             self.map.close()
 491             self.map = None
 492
 493
 494 _mpi_count = 0
 495 class PackIdxList:
 496     def __init__(self, dir, ignore_midx=False):
 497         global _mpi_count
 498         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 499         _mpi_count += 1
 500         self.dir = dir
 501         self.also = set()
 502         self.packs = []
 503         self.do_bloom = False
 504         self.bloom = None
 505         self.ignore_midx = ignore_midx
 506         self.refresh()
 507
 508     def __del__(self):
 509         global _mpi_count
 510         _mpi_count -= 1
 511         assert(_mpi_count == 0)
 512
 513     def __iter__(self):
 514         return iter(idxmerge(self.packs))
 515
 516     def __len__(self):
 517         return sum(len(pack) for pack in self.packs)
 518
 519     def exists(self, hash, want_source=False):
 520         """Return nonempty if the object exists in the index files."""
 521         global _total_searches
 522         _total_searches += 1
 523         if hash in self.also:
 524             return True
 525         if self.do_bloom and self.bloom:
 526             if self.bloom.exists(hash):
 527                 self.do_bloom = False
 528             else:
 529                 _total_searches -= 1  # was counted by bloom
 530                 return None
 531         for i in range(len(self.packs)):
 532             p = self.packs[i]
 533             _total_searches -= 1  # will be incremented by sub-pack
 534             ix = p.exists(hash, want_source=want_source)
 535             if ix:
 536                 # reorder so most recently used packs are searched first
 537                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 538                 return ix
 539         self.do_bloom = True
 540         return None
 541
 542     def refresh(self, skip_midx = False):
 543         """Refresh the index list.
 544         This method verifies if .midx files were superseded (e.g. all of its
 545         contents are in another, bigger .midx file) and removes the superseded
 546         files.
 547
 548         If skip_midx is True, all work on .midx files will be skipped and .midx
 549         files will be removed from the list.
 550
 551         The instance variable 'ignore_midx' can force this function to
 552         always act as if skip_midx was True.
 553         """
 554         if self.bloom is not None:
 555             self.bloom.close()
 556         self.bloom = None # Always reopen the bloom as it may have been relaced
 557         self.do_bloom = False
 558         skip_midx = skip_midx or self.ignore_midx
 559         d = dict((p.name, p) for p in self.packs
 560                  if not skip_midx or not isinstance(p, midx.PackMidx))
 561         if os.path.exists(self.dir):
 562             if not skip_midx:
 563                 midxl = []
 564                 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
 565                 # remove any *.midx files from our list that no longer exist
 566                 for ix in list(d.values()):
 567                     if not isinstance(ix, midx.PackMidx):
 568                         continue
 569                     if ix.name in midxes:
 570                         continue
 571                     # remove the midx
 572                     del d[ix.name]
 573                     ix.close()
 574                     self.packs.remove(ix)
 575                 for ix in self.packs:
 576                     if isinstance(ix, midx.PackMidx):
 577                         for name in ix.idxnames:
 578                             d[os.path.join(self.dir, name)] = ix
 579                 for full in midxes:
 580                     if not d.get(full):
 581                         mx = midx.PackMidx(full)
 582                         (mxd, mxf) = os.path.split(mx.name)
 583                         broken = False
 584                         for n in mx.idxnames:
 585                             if not os.path.exists(os.path.join(mxd, n)):
 586                                 log(('warning: index %s missing\n'
 587                                      '  used by %s\n')
 588                                     % (path_msg(n), path_msg(mxf)))
 589                                 broken = True
 590                         if broken:
 591                             mx.close()
 592                             del mx
 593                             unlink(full)
 594                         else:
 595                             midxl.append(mx)
 596                 midxl.sort(key=lambda ix:
 597                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 598                 for ix in midxl:
 599                     any_needed = False
 600                     for sub in ix.idxnames:
 601                         found = d.get(os.path.join(self.dir, sub))
 602                         if not found or isinstance(found, PackIdx):
 603                             # doesn't exist, or exists but not in a midx
 604                             any_needed = True
 605                             break
 606                     if any_needed:
 607                         d[ix.name] = ix
 608                         for name in ix.idxnames:
 609                             d[os.path.join(self.dir, name)] = ix
 610                     elif not ix.force_keep:
 611                         debug1('midx: removing redundant: %s\n'
 612                                % path_msg(os.path.basename(ix.name)))
 613                         ix.close()
 614                         unlink(ix.name)
 615             for full in glob.glob(os.path.join(self.dir, b'*.idx')):
 616                 if not d.get(full):
 617                     try:
 618                         ix = open_idx(full)
 619                     except GitError as e:
 620                         add_error(e)
 621                         continue
 622                     d[full] = ix
 623             bfull = os.path.join(self.dir, b'bup.bloom')
 624             if self.bloom is None and os.path.exists(bfull):
 625                 self.bloom = bloom.ShaBloom(bfull)
 626             self.packs = list(set(d.values()))
 627             self.packs.sort(reverse=True, key=lambda x: len(x))
 628             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 629                 self.do_bloom = True
 630             else:
 631                 self.bloom = None
 632         debug1('PackIdxList: using %d index%s.\n'
 633             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 634
 635     def add(self, hash):
 636         """Insert an additional object in the list."""
 637         self.also.add(hash)
 638
 639
 640 def open_idx(filename):
 641     if filename.endswith(b'.idx'):
 642         f = open(filename, 'rb')
 643         header = f.read(8)
 644         if header[0:4] == b'\377tOc':
 645             version = struct.unpack('!I', header[4:8])[0]
 646             if version == 2:
 647                 return PackIdxV2(filename, f)
 648             else:
 649                 raise GitError('%s: expected idx file version 2, got %d'
 650                                % (path_msg(filename), version))
 651         elif len(header) == 8 and header[0:4] < b'\377tOc':
 652             return PackIdxV1(filename, f)
 653         else:
 654             raise GitError('%s: unrecognized idx file header'
 655                            % path_msg(filename))
 656     elif filename.endswith(b'.midx'):
 657         return midx.PackMidx(filename)
 658     else:
 659         raise GitError('idx filenames must end with .idx or .midx')
 660
 661
 662 def idxmerge(idxlist, final_progress=True):
 663     """Generate a list of all the objects reachable in a PackIdxList."""
 664     def pfunc(count, total):
 665         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 666                   % (count*100.0/total, count, total))
 667     def pfinal(count, total):
 668         if final_progress:
 669             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 670                      % (100, total, total))
 671     return merge_iter(idxlist, 10024, pfunc, pfinal)
 672
 673
 674 def create_commit_blob(tree, parent,
 675                        author, adate_sec, adate_tz,
 676                        committer, cdate_sec, cdate_tz,
 677                        msg):
 678     if adate_tz is not None:
 679         adate_str = _git_date_str(adate_sec, adate_tz)
 680     else:
 681         adate_str = _local_git_date_str(adate_sec)
 682     if cdate_tz is not None:
 683         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 684     else:
 685         cdate_str = _local_git_date_str(cdate_sec)
 686     l = []
 687     if tree: l.append(b'tree %s' % hexlify(tree))
 688     if parent: l.append(b'parent %s' % hexlify(parent))
 689     if author: l.append(b'author %s %s' % (author, adate_str))
 690     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 691     l.append(b'')
 692     l.append(msg)
 693     return b'\n'.join(l)
 694
 695
 696 def _make_objcache():
 697     return PackIdxList(repo(b'objects/pack'))
 698
 699 # bup-gc assumes that it can disable all PackWriter activities
 700 # (bloom/midx/cache) via the constructor and close() arguments.
 701
 702 class PackWriter:
 703     """Writes Git objects inside a pack file."""
 704     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 705                  run_midx=True, on_pack_finish=None,
 706                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 707         self.repo_dir = repo_dir or repo()
 708         self.file = None
 709         self.parentfd = None
 710         self.count = 0
 711         self.outbytes = 0
 712         self.filename = None
 713         self.idx = None
 714         self.objcache_maker = objcache_maker
 715         self.objcache = None
 716         self.compression_level = compression_level
 717         self.run_midx=run_midx
 718         self.on_pack_finish = on_pack_finish
 719         if not max_pack_size:
 720             max_pack_size = git_config_get(b'pack.packSizeLimit',
 721                                            repo_dir=self.repo_dir,
 722                                            opttype='int')
 723             if not max_pack_size:
 724                 # larger packs slow down pruning
 725                 max_pack_size = 1000 * 1000 * 1000
 726         self.max_pack_size = max_pack_size
 727         # cache memory usage is about 83 bytes per object
 728         self.max_pack_objects = max_pack_objects if max_pack_objects \
 729                                 else max(1, self.max_pack_size // 5000)
 730
 731     def __del__(self):
 732         self.close()
 733
 734     def __enter__(self):
 735         return self
 736
 737     def __exit__(self, type, value, traceback):
 738         self.close()
 739
 740     def _open(self):
 741         if not self.file:
 742             objdir = dir = os.path.join(self.repo_dir, b'objects')
 743             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 744             try:
 745                 self.file = os.fdopen(fd, 'w+b')
 746             except:
 747                 os.close(fd)
 748                 raise
 749             try:
 750                 self.parentfd = os.open(objdir, os.O_RDONLY)
 751             except:
 752                 f = self.file
 753                 self.file = None
 754                 f.close()
 755                 raise
 756             assert name.endswith(b'.pack')
 757             self.filename = name[:-5]
 758             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 759             self.idx = PackIdxV2Writer()
 760
 761     def _raw_write(self, datalist, sha):
 762         self._open()
 763         f = self.file
 764         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 765         # the file never has a *partial* blob.  So let's make sure it's
 766         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 767         # to our hashsplit algorithm.)  f.write() does its own buffering,
 768         # but that's okay because we'll flush it in _end().
 769         oneblob = b''.join(datalist)
 770         try:
 771             f.write(oneblob)
 772         except IOError as e:
 773             reraise(GitError(e))
 774         nw = len(oneblob)
 775         crc = zlib.crc32(oneblob) & 0xffffffff
 776         self._update_idx(sha, crc, nw)
 777         self.outbytes += nw
 778         self.count += 1
 779         return nw, crc
 780
 781     def _update_idx(self, sha, crc, size):
 782         assert(sha)
 783         if self.idx:
 784             self.idx.add(sha, crc, self.file.tell() - size)
 785
 786     def _write(self, sha, type, content):
 787         if verbose:
 788             log('>')
 789         if not sha:
 790             sha = calc_hash(type, content)
 791         size, crc = self._raw_write(_encode_packobj(type, content,
 792                                                     self.compression_level),
 793                                     sha=sha)
 794         if self.outbytes >= self.max_pack_size \
 795            or self.count >= self.max_pack_objects:
 796             self.breakpoint()
 797         return sha
 798
 799     def breakpoint(self):
 800         """Clear byte and object counts and return the last processed id."""
 801         id = self._end(self.run_midx)
 802         self.outbytes = self.count = 0
 803         return id
 804
 805     def _require_objcache(self):
 806         if self.objcache is None and self.objcache_maker:
 807             self.objcache = self.objcache_maker()
 808         if self.objcache is None:
 809             raise GitError(
 810                     "PackWriter not opened or can't check exists w/o objcache")
 811
 812     def exists(self, id, want_source=False):
 813         """Return non-empty if an object is found in the object cache."""
 814         self._require_objcache()
 815         return self.objcache.exists(id, want_source=want_source)
 816
 817     def just_write(self, sha, type, content):
 818         """Write an object to the pack file without checking for duplication."""
 819         self._write(sha, type, content)
 820         # If nothing else, gc doesn't have/want an objcache
 821         if self.objcache is not None:
 822             self.objcache.add(sha)
 823
 824     def maybe_write(self, type, content):
 825         """Write an object to the pack file if not present and return its id."""
 826         sha = calc_hash(type, content)
 827         if not self.exists(sha):
 828             self._require_objcache()
 829             self.just_write(sha, type, content)
 830         return sha
 831
 832     def new_blob(self, blob):
 833         """Create a blob object in the pack with the supplied content."""
 834         return self.maybe_write(b'blob', blob)
 835
 836     def new_tree(self, shalist):
 837         """Create a tree object in the pack."""
 838         content = tree_encode(shalist)
 839         return self.maybe_write(b'tree', content)
 840
 841     def new_commit(self, tree, parent,
 842                    author, adate_sec, adate_tz,
 843                    committer, cdate_sec, cdate_tz,
 844                    msg):
 845         """Create a commit object in the pack.  The date_sec values must be
 846         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 847         content = create_commit_blob(tree, parent,
 848                                      author, adate_sec, adate_tz,
 849                                      committer, cdate_sec, cdate_tz,
 850                                      msg)
 851         return self.maybe_write(b'commit', content)
 852
 853     def abort(self):
 854         """Remove the pack file from disk."""
 855         f = self.file
 856         if f:
 857             pfd = self.parentfd
 858             self.file = None
 859             self.parentfd = None
 860             self.idx = None
 861             try:
 862                 try:
 863                     os.unlink(self.filename + b'.pack')
 864                 finally:
 865                     f.close()
 866             finally:
 867                 if pfd is not None:
 868                     os.close(pfd)
 869
    def _end(self, run_midx=True):
        """Finish the pack in progress and move it to its final name.

        Returns None if no pack file is open.  Otherwise patches the
        object count into the pack header, appends the pack checksum,
        syncs and closes the file, writes the matching .idx, renames
        both files to objects/pack/pack-<hex checksum>.{pack,idx} under
        self.repo_dir, and fsyncs and closes the parent directory fd.
        Then, if run_midx is true, auto_midx() is run on the pack
        directory, and on_pack_finish (if set) is called with the new
        name prefix.  Returns that prefix (the path without extension).
        """
        f = self.file
        if not f: return None
        # Forget the file first so the writer no longer refers to it,
        # whatever happens below.
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            # (offset 8 is the count field of the 12-byte header that
            # was written as a zero placeholder when the pack was opened)
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            # Re-read the file from the start; presumably chunkyreader
            # reads to EOF, so the digest written next lands at the end.
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        # The index is written next to the temporary pack and embeds
        # the pack checksum.
        idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' +  hexlify(packbin))
        # Remove a leftover .map file for the temporary name, if any.
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        # Sync the parent directory fd after the renames, and always
        # close it.
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix
 915
 916     def close(self, run_midx=True):
 917         """Close the pack file and move it to its definitive path."""
 918         return self._end(run_midx=run_midx)
 919
 920
 921 class PackIdxV2Writer:
 922     def __init__(self):
 923         self.idx = list(list() for i in range(256))
 924         self.count = 0
 925
 926     def add(self, sha, crc, offs):
 927         assert(sha)
 928         self.count += 1
 929         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 930
 931     def write(self, filename, packbin):
 932         ofs64_count = 0
 933         for section in self.idx:
 934             for entry in section:
 935                 if entry[2] >= 2**31:
 936                     ofs64_count += 1
 937
 938         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 939         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 940         idx_map = None
 941         idx_f = open(filename, 'w+b')
 942         try:
 943             idx_f.truncate(index_len)
 944             fdatasync(idx_f.fileno())
 945             idx_map = mmap_readwrite(idx_f, close=False)
 946             try:
 947                 count = _helpers.write_idx(filename, idx_map, self.idx,
 948                                            self.count)
 949                 assert(count == self.count)
 950                 idx_map.flush()
 951             finally:
 952                 idx_map.close()
 953         finally:
 954             idx_f.close()
 955
 956         idx_f = open(filename, 'a+b')
 957         try:
 958             idx_f.write(packbin)
 959             idx_f.seek(0)
 960             idx_sum = Sha1()
 961             b = idx_f.read(8 + 4*256)
 962             idx_sum.update(b)
 963
 964             for b in chunkyreader(idx_f, 20 * self.count):
 965                 idx_sum.update(b)
 966
 967             for b in chunkyreader(idx_f):
 968                 idx_sum.update(b)
 969             idx_f.write(idx_sum.digest())
 970             fdatasync(idx_f.fileno())
 971         finally:
 972             idx_f.close()
 973
 974
 975 def list_refs(patterns=None, repo_dir=None,
 976               limit_to_heads=False, limit_to_tags=False):
 977     """Yield (refname, hash) tuples for all repository refs unless
 978     patterns are specified.  In that case, only include tuples for
 979     refs matching those patterns (cf. git-show-ref(1)).  The limits
 980     restrict the result items to refs/heads or refs/tags.  If both
 981     limits are specified, items from both sources will be included.
 982
 983     """
 984     argv = [b'git', b'show-ref']
 985     if limit_to_heads:
 986         argv.append(b'--heads')
 987     if limit_to_tags:
 988         argv.append(b'--tags')
 989     argv.append(b'--')
 990     if patterns:
 991         argv.extend(patterns)
 992     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
 993                          close_fds=True)
 994     out = p.stdout.read().strip()
 995     rv = p.wait()  # not fatal
 996     if rv:
 997         assert(not out)
 998     if out:
 999         for d in out.split(b'\n'):
1000             sha, name = d.split(b' ', 1)
1001             yield name, unhexlify(sha)
1002
1003
def read_ref(refname, repo_dir = None):
    """Return the binary hash refname points to, or None if list_refs()
    (limited to heads) reports no match for it."""
    found = list_refs(patterns=[refname], repo_dir=repo_dir,
                      limit_to_heads=True)
    # Take at most two results: enough to detect an ambiguous name.
    matches = tuple(islice(found, 2))
    if not matches:
        return None
    assert len(matches) == 1
    return matches[0][1]
1013
1014
def rev_list_invocation(ref_or_refs, format=None):
    """Return the argv for a "git rev-list" over the given ref(s).

    ref_or_refs is either a single bytes ref or an iterable of them.
    If format is given it is passed as --pretty=format:FORMAT.  Refs
    that start with b'-' are rejected by assertion so they can't be
    taken for options.
    """
    single = isinstance(ref_or_refs, bytes)
    wanted = (ref_or_refs,) if single else ref_or_refs
    cmd = [b'git', b'rev-list']
    if format:
        cmd += [b'--pretty=format:' + format]
    for name in wanted:
        assert not name.startswith(b'-')
        cmd += [name]
    cmd += [b'--']
    return cmd
1029
1030
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".

    parse and format must be given together or not at all.  Without
    them, yield one hex hash at a time.  With them, pass format to
    rev-list and, for each commit, call parse(git_stdout) with the
    stream positioned just after the rev-list "commit HASH" header
    line, yielding (oidx, parse(git_stdout)).

    Raises GitError if git rev-list exits with a non-zero status once
    its output has been consumed.

    """
    assert bool(parse) == bool(format)
    argv = rev_list_invocation(ref_or_refs, format=format)
    proc = subprocess.Popen(argv,
                            env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE,
                            close_fds=True)
    out = proc.stdout
    if format:
        # readline() returns b'' at EOF, which ends the loop.
        for header in iter(out.readline, b''):
            stripped = header.strip()
            if not stripped.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(stripped))
            oidx = stripped[7:]
            assert len(oidx) == 40
            yield oidx, parse(out)
    else:
        for raw in out:
            yield raw.strip()

    status = proc.wait()
    if status:
        raise GitError('git rev-list returned error %d' % status)
1063
1064
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    'committish' is first looked up as a ref via read_ref(); failing
    that, if it is 40 characters long it is treated as a hex object id
    and looked up in the repository's pack indexes.

    Returns the hash in binary form if it is found, None if
    'committish' does not correspond to anything (including when it is
    40 characters long but not valid hex).
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Not hex.  Python 2 raises TypeError here; Python 3 raises
            # binascii.Error, which is a ValueError.
            return None

        if pL.exists(oid):
            return oid

    return None
1090
1091
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname at newval using "git update-ref".

    newval and oldval are binary hashes; they are passed to git in hex.
    A false oldval is passed as an empty old value.  refname must be
    under refs/heads/ or refs/tags/.  Failure of the git command is
    reported by _git_wait().
    """
    previous = oldval if oldval else b''
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    argv = [b'git', b'update-ref', refname,
            hexlify(newval), hexlify(previous)]
    proc = subprocess.Popen(argv, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', proc)
1103
1104
def delete_ref(refname, oldvalue=None):
    """Delete refname with "git update-ref -d" (see git update-ref(1)).

    If oldvalue is given it is appended to the command line as the
    expected old value.  refname must start with b'refs/'.
    """
    assert refname.startswith(b'refs/')
    argv = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', proc)
1113
1114
def guess_repo(path=None):
    """Set the global "repodir" to the repository path bup should use.

    An explicit path always wins.  Otherwise an already-set repodir is
    kept; failing that, the BUP_DIR environment variable is used, and
    finally ~/.bup.  Nothing is checked on disk, so this does not fail
    if the repository doesn't exist.  Usually, if you are interacting
    with a bup repository, you would call check_repo_or_die() rather
    than this function.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1129
1130
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The repository location is chosen by guess_repo(path).  Raises
    GitError if the location's parent directory does not exist or if
    the location exists but is not a directory.  After "git init", the
    pack index version is forced to 2 and the reflog is enabled; a
    failing git command is reported by _git_wait().
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    # (All argv elements are bytes, like every other git invocation here.)
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    # Enable the reflog
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
1154
1155
def check_repo_or_die(path=None):
    """Return if a bup repository probably exists; otherwise log and exit.

    The repository is considered present when <repo>/objects/pack is a
    directory.  Exits with status 15 when the repository path itself
    does not exist, and with status 14 when it exists but doesn't look
    like a repository.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1171
1172
def is_suitable_git(ver_str):
    """Classify the output of "git --version".

    ver_str is the raw bytes output.  Returns one of:

      'unrecognized' - the output isn't "git version <number>..."
      'insufficient' - the version is older than 1.5.6
      'suitable'     - anything else that starts with a number

    This function only classifies; it never exits the process.
    Deciding what to do about a bad version is left to the caller.
    """
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    ver_str = ver_str[len(prefix):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        # Everything before 1.5.6, including its release candidates.
        too_old = (br'1\.[012345]rc',
                   br'1\.[01234]\.',
                   br'1\.5\.[012345]($|\.)',
                   br'1\.5\.6-rc')
        if any(re.match(pattern, ver_str) for pattern in too_old):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    # Not a version number at all: report it instead of exiting
    # silently, so the caller can show the offending output.
    return 'unrecognized'
1192
# None until git's version has been accepted; True afterwards, so the
# check runs at most once per process.
_git_great = None

def require_suitable_git(ver_str=None):
    """Make sure the available git is new enough for bup.

    Rely on ver_str when provided, rather than invoking the git in the
    path.  The check is skipped entirely if it has already passed, or
    if BUP_GIT_VERSION_IS_FINE is set to yes, true, or 1 (in any case).

    Raises GitError if the version output is not recognized; logs an
    error and exits with status 1 if the version is too old.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _unused1, _unused2 = _git_exo([b'git', b'--version'])
    verdict = is_suitable_git(ver_str)
    if verdict == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if verdict == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    if verdict == 'suitable':
        _git_great = True
        return
    assert False
1221
1222
1223 class _AbortableIter:
1224     def __init__(self, it, onabort = None):
1225         self.it = it
1226         self.onabort = onabort
1227         self.done = None
1228
1229     def __iter__(self):
1230         return self
1231
1232     def __next__(self):
1233         try:
1234             return next(self.it)
1235         except StopIteration as e:
1236             self.done = True
1237             raise
1238         except:
1239             self.abort()
1240             raise
1241
1242     next = __next__
1243
1244     def abort(self):
1245         """Abort iteration and call the abortion callback, if needed."""
1246         if not self.done:
1247             self.done = True
1248             if self.onabort:
1249                 self.onabort()
1250
1251     def __del__(self):
1252         self.abort()
1253
1254
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    The 'git cat-file --batch' subprocess is started lazily by get()
    and restarted if it has exited.  Only one object may be read at a
    time: the generator returned by get() must be exhausted before
    get() is called again.
    """
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        # p is the subprocess (None when not running); inprogress is the
        # ref currently being read (None when idle).
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the subprocess, if there is one.

        If wait is true and a subprocess was running, wait for it and
        return its exit status.  Otherwise (including when there was
        nothing to close) return None.
        """
        p = self.p
        if p:
            p.stdout.close()
            p.stdin.close()
        self.p = None
        self.inprogress = None
        # Only wait when there actually was a process: a CatPipe that
        # was never started, or was already closed, has p set to None.
        if wait and p:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Close any existing subprocess and start a fresh one."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Raises GitError if the reply header from git is not of the
        expected (id, type, size) form.

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # The request is a single line on git's stdin, so the ref must
        # not contain line breaks or look like an option.
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        # If reading the object is cut short, the pipe is left in an
        # unknown state, so it is closed; the next get() restarts it.
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self.close)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _join(self, it):
        """Yield the blob content reachable from the object that `it`
        (a generator returned by get()) refers to."""
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of a commit is "tree <hex id>".
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1351
1352
# Cache of CatPipe objects, keyed by absolute repository path.
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating and caching it on first
    use.  Without repo_dir, the default repository is used."""
    key = os.path.abspath(repo_dir or repodir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1366
1367
def close_catpipes():
    """Remove every cached CatPipe from the cache, closing each one and
    waiting for it."""
    # FIXME: chain exceptions
    while _cp:
        pipe = _cp.popitem()[1]
        pipe.close(wait=True)
1373
1374
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Tag names are given without the leading refs/tags/.
    """
    prefix = b'refs/tags/'
    by_hash = {}
    for refname, oid in list_refs(repo_dir=repo_dir, limit_to_tags=True):
        assert refname.startswith(prefix)
        # more than one tag can point at the same hash
        by_hash.setdefault(oid, []).append(refname[len(prefix):])
    return by_hash
1385
1386
class MissingObject(KeyError):
    """KeyError for an object id that is missing; the binary id is kept
    in the oid attribute and shown in hex in the message."""
    def __init__(self, oid):
        message = 'object %r is missing' % hexlify(oid)
        KeyError.__init__(self, message)
        self.oid = oid
1391
1392
# One object visited by walk_object(): its binary id (oid), git object
# type, mode (None when not known), path, chunk_path, and data (None
# unless the data was requested).
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, given as a list of components.  If an
# item represents a fragment of a chunked file, the chunk_path will be
# the chunked subtree path for the chunk, i.e. ['', '2d3115e', ...].
# The top-level path for a chunked file will have a chunk_path of
# [''].  So some chunk subtree of the file '/foo/bar/baz' might look
# like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1405
1406
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    oidx is a hex object id.  An object whose type is not blob, commit,
    or tree causes an Exception to be raised.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (oidx, parent_path, chunk_path, mode); the starting
    # object has empty paths and no known mode.
    pending = [(oidx, [], [], None)]
    while len(pending):
        # Entries are popped from the end, so the most recently pushed
        # object is visited first.
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        # The first item from get_ref is the (oidx, type, size) header;
        # a false oidx there means the object is missing.
        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full, because their
            # content is needed below to find what they refer to.
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            # Parents are pushed before the tree, so the commit's tree
            # is popped (and walked) before its parents.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the file path stays
                    # the same and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # This entry is the top of a chunked file.
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))