lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12 from numbers import Integral
  13
  14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  15 from bup.compat import (buffer,
  16                         byte_int, bytes_from_byte, bytes_from_uint,
  17                         environ,
  18                         items,
  19                         range,
  20                         reraise)
  21 from bup.io import path_msg
  22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  23                          exo,
  24                          fdatasync,
  25                          hostname, localtime, log,
  26                          merge_dict,
  27                          merge_iter,
  28                          mmap_read, mmap_readwrite,
  29                          parse_num,
  30                          progress, qprogress, stat_if_exists,
  31                          unlink,
  32                          utc_offset_str)
  33 from bup.pwdgrp import username, userfullname
  34
  35
# Module-wide verbosity flag; PackWriter._write logs a '>' for each
# object written when this is non-zero.
verbose = 0
repodir = None  # The default repository, once initialized

# Object type name -> numeric type code stored in pack object headers
# (see _encode_packobj).
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
# Reverse of _typemap: numeric type code -> object type name
# (see _decode_packobj).
_typermap = {v: k for k, v in items(_typemap)}


# Running counters updated by the index lookup code
# (PackIdx._idx_from_hash and PackIdxList.exists).
_total_searches = 0
_total_steps = 0
  45
  46
  47 class GitError(Exception):
  48     pass
  49
  50
  51 def _gitenv(repo_dir=None):
  52     if not repo_dir:
  53         repo_dir = repo()
  54     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  55
  56 def _git_wait(cmd, p):
  57     rv = p.wait()
  58     if rv != 0:
  59         raise GitError('%r returned %d' % (cmd, rv))
  60
  61 def _git_exo(cmd, **kwargs):
  62     kwargs['check'] = False
  63     result = exo(cmd, **kwargs)
  64     _, _, proc = result
  65     if proc.returncode != 0:
  66         raise GitError('%r returned %d' % (cmd, proc.returncode))
  67     return result
  68
  69 def git_config_get(option, repo_dir=None, opttype=None):
  70     cmd = [b'git', b'config', b'--null']
  71     if opttype == 'int':
  72         cmd.extend([b'--int'])
  73     elif opttype == 'bool':
  74         cmd.extend([b'--bool'])
  75     else:
  76         assert opttype is None
  77     cmd.extend([b'--get', option])
  78     env=None
  79     if repo_dir:
  80         env = _gitenv(repo_dir=repo_dir)
  81     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  82                          close_fds=True)
  83     # with --null, git writes out a trailing \0 after the value
  84     r = p.stdout.read()[:-1]
  85     rc = p.wait()
  86     if rc == 0:
  87         if opttype == 'int':
  88             return int(r)
  89         elif opttype == 'bool':
  90             # git converts to 'true' or 'false'
  91             return r == b'true'
  92         return r
  93     if rc != 1:
  94         raise GitError('%r returned %d' % (cmd, rc))
  95     return None
  96
  97
  98 def parse_tz_offset(s):
  99     """UTC offset in seconds."""
 100     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 101     if bytes_from_byte(s[0]) == b'-':
 102         return - tz_off
 103     return tz_off
 104
 105
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
# A character allowed at the start or end of a name/mail token.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# A character allowed in the interior of a name/mail token.
_content_char = br'[^\0\n<>]'
# A name/mail token: either one or two start/end characters, or a
# start/end character, any number of content characters, and a final
# start/end character.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone offset: sign, two hour digits, two minute digits (first 0-5).
_tz_rx = br'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Whole-commit pattern: tree line, zero or more parent lines, author and
# committer lines, an optional mergetag, a blank line, then the message.
# The named groups are consumed by parse_commit().
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts the hex hash from each line of the 'parents' group above.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 127
# Parsed representation of a commit object, as produced by parse_commit().
# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
# tree and the entries of parents hold the hex digests as matched in the
# commit text; author_offset and committer_offset are UTC offsets in
# seconds (see parse_tz_offset()).
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
 136
 137 def parse_commit(content):
 138     commit_match = re.match(_commit_rx, content)
 139     if not commit_match:
 140         raise Exception('cannot parse commit %r' % content)
 141     matches = commit_match.groupdict()
 142     return CommitInfo(tree=matches['tree'],
 143                       parents=re.findall(_parent_hash_rx, matches['parents']),
 144                       author_name=matches['author_name'],
 145                       author_mail=matches['author_mail'],
 146                       author_sec=int(matches['asec']),
 147                       author_offset=parse_tz_offset(matches['atz']),
 148                       committer_name=matches['committer_name'],
 149                       committer_mail=matches['committer_mail'],
 150                       committer_sec=int(matches['csec']),
 151                       committer_offset=parse_tz_offset(matches['ctz']),
 152                       message=matches['message'])
 153
 154
 155 def get_cat_data(cat_iterator, expected_type):
 156     _, kind, _ = next(cat_iterator)
 157     if kind != expected_type:
 158         raise Exception('expected %r, saw %r' % (expected_type, kind))
 159     return b''.join(cat_iterator)
 160
 161 def get_commit_items(id, cp):
 162     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 163
 164 def _local_git_date_str(epoch_sec):
 165     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 166
 167
 168 def _git_date_str(epoch_sec, tz_offset_sec):
 169     offs =  tz_offset_sec // 60
 170     return b'%d %s%02d%02d' \
 171         % (epoch_sec,
 172            b'+' if offs >= 0 else b'-',
 173            abs(offs) // 60,
 174            abs(offs) % 60)
 175
 176
 177 def repo(sub = b'', repo_dir=None):
 178     """Get the path to the git repository or one of its subdirectories."""
 179     repo_dir = repo_dir or repodir
 180     if not repo_dir:
 181         raise GitError('You should call check_repo_or_die()')
 182
 183     # If there's a .git subdirectory, then the actual repo is in there.
 184     gd = os.path.join(repo_dir, b'.git')
 185     if os.path.exists(gd):
 186         repo_dir = gd
 187
 188     return os.path.join(repo_dir, sub)
 189
 190
 191 _shorten_hash_rx = \
 192     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 193
 194 def shorten_hash(s):
 195     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 196
 197
 198 def repo_rel(path):
 199     full = os.path.abspath(path)
 200     fullrepo = os.path.abspath(repo(b''))
 201     if not fullrepo.endswith(b'/'):
 202         fullrepo += b'/'
 203     if full.startswith(fullrepo):
 204         path = full[len(fullrepo):]
 205     if path.startswith(b'index-cache/'):
 206         path = path[len(b'index-cache/'):]
 207     return shorten_hash(path)
 208
 209
 210 def auto_midx(objdir):
 211     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 212     try:
 213         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 214     except OSError as e:
 215         # make sure 'args' gets printed to help with debugging
 216         add_error('%r: exception: %s' % (args, e))
 217         raise
 218     if rv:
 219         add_error('%r: returned %d' % (args, rv))
 220
 221     args = [path.exe(), b'bloom', b'--dir', objdir]
 222     try:
 223         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 224     except OSError as e:
 225         # make sure 'args' gets printed to help with debugging
 226         add_error('%r: exception: %s' % (args, e))
 227         raise
 228     if rv:
 229         add_error('%r: returned %d' % (args, rv))
 230
 231
 232 def mangle_name(name, mode, gitmode):
 233     """Mangle a file name to present an abstract name for segmented files.
 234     Mangled file names will have the ".bup" extension added to them. If a
 235     file's name already ends with ".bup", a ".bupl" extension is added to
 236     disambiguate normal files from segmented ones.
 237     """
 238     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 239         assert(stat.S_ISDIR(gitmode))
 240         return name + b'.bup'
 241     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 242         return name + b'.bupl'
 243     else:
 244         return name
 245
 246
 247 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 248 def demangle_name(name, mode):
 249     """Remove name mangling from a file name, if necessary.
 250
 251     The return value is a tuple (demangled_filename,mode), where mode is one of
 252     the following:
 253
 254     * BUP_NORMAL  : files that should be read as-is from the repository
 255     * BUP_CHUNKED : files that were chunked and need to be reassembled
 256
 257     For more information on the name mangling algorithm, see mangle_name()
 258     """
 259     if name.endswith(b'.bupl'):
 260         return (name[:-5], BUP_NORMAL)
 261     elif name.endswith(b'.bup'):
 262         return (name[:-4], BUP_CHUNKED)
 263     elif name.endswith(b'.bupm'):
 264         return (name[:-5],
 265                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 266     else:
 267         return (name, BUP_NORMAL)
 268
 269
 270 def calc_hash(type, content):
 271     """Calculate some content's hash in the Git fashion."""
 272     header = b'%s %d\0' % (type, len(content))
 273     sum = Sha1(header)
 274     sum.update(content)
 275     return sum.digest()
 276
 277
 278 def shalist_item_sort_key(ent):
 279     (mode, name, id) = ent
 280     assert(mode+0 == mode)
 281     if stat.S_ISDIR(mode):
 282         return name + b'/'
 283     else:
 284         return name
 285
 286
 287 def tree_encode(shalist):
 288     """Generate a git tree object from (mode,name,hash) tuples."""
 289     shalist = sorted(shalist, key = shalist_item_sort_key)
 290     l = []
 291     for (mode,name,bin) in shalist:
 292         assert(mode)
 293         assert(mode+0 == mode)
 294         assert(name)
 295         assert(len(bin) == 20)
 296         s = b'%o %s\0%s' % (mode,name,bin)
 297         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 298         l.append(s)
 299     return b''.join(l)
 300
 301
 302 def tree_decode(buf):
 303     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 304     ofs = 0
 305     while ofs < len(buf):
 306         z = buf.find(b'\0', ofs)
 307         assert(z > ofs)
 308         spl = buf[ofs:z].split(b' ', 1)
 309         assert(len(spl) == 2)
 310         mode,name = spl
 311         sha = buf[z+1:z+1+20]
 312         ofs = z+1+20
 313         yield (int(mode, 8), name, sha)
 314
 315
 316 def _encode_packobj(type, content, compression_level=1):
 317     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 318         raise ValueError('invalid compression level %s' % compression_level)
 319     szout = b''
 320     sz = len(content)
 321     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 322     sz >>= 4
 323     while 1:
 324         if sz: szbits |= 0x80
 325         szout += bytes_from_uint(szbits)
 326         if not sz:
 327             break
 328         szbits = sz & 0x7f
 329         sz >>= 7
 330     z = zlib.compressobj(compression_level)
 331     yield szout
 332     yield z.compress(content)
 333     yield z.flush()
 334
 335
 336 def _decode_packobj(buf):
 337     assert(buf)
 338     c = byte_int(buf[0])
 339     type = _typermap[(c & 0x70) >> 4]
 340     sz = c & 0x0f
 341     shift = 4
 342     i = 0
 343     while c & 0x80:
 344         i += 1
 345         c = byte_int(buf[i])
 346         sz |= (c & 0x7f) << shift
 347         shift += 7
 348         if not (c & 0x80):
 349             break
 350     return (type, zlib.decompress(buf[i+1:]))
 351
 352
 353 class PackIdx:
 354     def __init__(self):
 355         assert(0)
 356
 357     def find_offset(self, hash):
 358         """Get the offset of an object inside the index file."""
 359         idx = self._idx_from_hash(hash)
 360         if idx != None:
 361             return self._ofs_from_idx(idx)
 362         return None
 363
 364     def exists(self, hash, want_source=False):
 365         """Return nonempty if the object exists in this index."""
 366         if hash and (self._idx_from_hash(hash) != None):
 367             return want_source and os.path.basename(self.name) or True
 368         return None
 369
 370     def _idx_from_hash(self, hash):
 371         global _total_searches, _total_steps
 372         _total_searches += 1
 373         assert(len(hash) == 20)
 374         b1 = byte_int(hash[0])
 375         start = self.fanout[b1-1] # range -1..254
 376         end = self.fanout[b1] # range 0..255
 377         want = hash
 378         _total_steps += 1  # lookup table is a step
 379         while start < end:
 380             _total_steps += 1
 381             mid = start + (end - start) // 2
 382             v = self._idx_to_hash(mid)
 383             if v < want:
 384                 start = mid+1
 385             elif v > want:
 386                 end = mid
 387             else: # got it!
 388                 return mid
 389         return None
 390
 391
 392 class PackIdxV1(PackIdx):
 393     """Object representation of a Git pack index (version 1) file."""
 394     def __init__(self, filename, f):
 395         self.name = filename
 396         self.idxnames = [self.name]
 397         self.map = mmap_read(f)
 398         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 399         self.fanout = array('L', struct.unpack('!256I', self.map))
 400         self.fanout.append(0)  # entry "-1"
 401         self.nsha = self.fanout[255]
 402         self.sha_ofs = 256 * 4
 403         # Avoid slicing shatable for individual hashes (very high overhead)
 404         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 405
 406     def __enter__(self):
 407         return self
 408
 409     def __exit__(self, type, value, traceback):
 410         self.close()
 411
 412     def __len__(self):
 413         return int(self.nsha)  # int() from long for python 2
 414
 415     def _ofs_from_idx(self, idx):
 416         if idx >= self.nsha or idx < 0:
 417             raise IndexError('invalid pack index index %d' % idx)
 418         ofs = self.sha_ofs + idx * 24
 419         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 420
 421     def _idx_to_hash(self, idx):
 422         if idx >= self.nsha or idx < 0:
 423             raise IndexError('invalid pack index index %d' % idx)
 424         ofs = self.sha_ofs + idx * 24 + 4
 425         return self.map[ofs : ofs + 20]
 426
 427     def __iter__(self):
 428         start = self.sha_ofs + 4
 429         for ofs in range(start, start + 24 * self.nsha, 24):
 430             yield self.map[ofs : ofs + 20]
 431
 432     def close(self):
 433         if self.map is not None:
 434             self.shatable = None
 435             self.map.close()
 436             self.map = None
 437
 438
 439 class PackIdxV2(PackIdx):
 440     """Object representation of a Git pack index (version 2) file."""
 441     def __init__(self, filename, f):
 442         self.name = filename
 443         self.idxnames = [self.name]
 444         self.map = mmap_read(f)
 445         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 446         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 447         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 448         self.fanout.append(0)
 449         self.nsha = self.fanout[255]
 450         self.sha_ofs = 8 + 256*4
 451         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 452         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 453         # Avoid slicing this for individual hashes (very high overhead)
 454         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 455
 456     def __enter__(self):
 457         return self
 458
 459     def __exit__(self, type, value, traceback):
 460         self.close()
 461
 462     def __len__(self):
 463         return int(self.nsha)  # int() from long for python 2
 464
 465     def _ofs_from_idx(self, idx):
 466         if idx >= self.nsha or idx < 0:
 467             raise IndexError('invalid pack index index %d' % idx)
 468         ofs_ofs = self.ofstable_ofs + idx * 4
 469         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 470         if ofs & 0x80000000:
 471             idx64 = ofs & 0x7fffffff
 472             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 473             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 474         return ofs
 475
 476     def _idx_to_hash(self, idx):
 477         if idx >= self.nsha or idx < 0:
 478             raise IndexError('invalid pack index index %d' % idx)
 479         ofs = self.sha_ofs + idx * 20
 480         return self.map[ofs : ofs + 20]
 481
 482     def __iter__(self):
 483         start = self.sha_ofs
 484         for ofs in range(start, start + 20 * self.nsha, 20):
 485             yield self.map[ofs : ofs + 20]
 486
 487     def close(self):
 488         if self.map is not None:
 489             self.shatable = None
 490             self.map.close()
 491             self.map = None
 492
 493
# Number of live PackIdxList instances; the constructor asserts that
# there is never more than one.
_mpi_count = 0
class PackIdxList:
    """The set of index objects found in one pack directory.

    Holds the opened .idx/.midx objects (self.packs), an optional bloom
    filter used to short-circuit lookups, and a set of extra hashes
    (self.also) registered through add().
    """
    def __init__(self, dir, ignore_midx=False):
        """Load the indexes in dir; ignore_midx disables all .midx use."""
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the merged contents of all loaded indexes."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all loaded indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        # Hashes registered via add() are reported without any lookup.
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Possible hit: stop consulting the bloom until a full
                # search misses again (see the end of this method).
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # Not found in any index: re-enable the bloom check.
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        # d maps a file name to the index object that covers it; when
        # skipping midxes, previously loaded midx objects are left out.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                # Map every idx name covered by a surviving midx to it.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open midx files we have not seen yet; one that refers
                # to a missing idx is considered broken and is deleted.
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the largest midx first, newest first on ties.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            # Open every .idx file that nothing in d covers yet.
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # Several names may map to the same midx object, so dedupe;
            # then search the largest indexes first.
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            # Only use the bloom when it is valid and holds at least as
            # many entries as the loaded indexes.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 638
 639
 640 def open_idx(filename):
 641     if filename.endswith(b'.idx'):
 642         f = open(filename, 'rb')
 643         header = f.read(8)
 644         if header[0:4] == b'\377tOc':
 645             version = struct.unpack('!I', header[4:8])[0]
 646             if version == 2:
 647                 return PackIdxV2(filename, f)
 648             else:
 649                 raise GitError('%s: expected idx file version 2, got %d'
 650                                % (path_msg(filename), version))
 651         elif len(header) == 8 and header[0:4] < b'\377tOc':
 652             return PackIdxV1(filename, f)
 653         else:
 654             raise GitError('%s: unrecognized idx file header'
 655                            % path_msg(filename))
 656     elif filename.endswith(b'.midx'):
 657         return midx.PackMidx(filename)
 658     else:
 659         raise GitError('idx filenames must end with .idx or .midx')
 660
 661
 662 def idxmerge(idxlist, final_progress=True):
 663     """Generate a list of all the objects reachable in a PackIdxList."""
 664     def pfunc(count, total):
 665         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 666                   % (count*100.0/total, count, total))
 667     def pfinal(count, total):
 668         if final_progress:
 669             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 670                      % (100, total, total))
 671     return merge_iter(idxlist, 10024, pfunc, pfinal)
 672
 673
 674 def create_commit_blob(tree, parent,
 675                        author, adate_sec, adate_tz,
 676                        committer, cdate_sec, cdate_tz,
 677                        msg):
 678     if adate_tz is not None:
 679         adate_str = _git_date_str(adate_sec, adate_tz)
 680     else:
 681         adate_str = _local_git_date_str(adate_sec)
 682     if cdate_tz is not None:
 683         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 684     else:
 685         cdate_str = _local_git_date_str(cdate_sec)
 686     l = []
 687     if tree: l.append(b'tree %s' % hexlify(tree))
 688     if parent: l.append(b'parent %s' % hexlify(parent))
 689     if author: l.append(b'author %s %s' % (author, adate_str))
 690     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 691     l.append(b'')
 692     l.append(msg)
 693     return b'\n'.join(l)
 694
 695
 696 def _make_objcache():
 697     return PackIdxList(repo(b'objects/pack'))
 698
 699 # bup-gc assumes that it can disable all PackWriter activities
 700 # (bloom/midx/cache) via the constructor and close() arguments.
 701
 702 class PackWriter:
 703     """Writes Git objects inside a pack file."""
 704     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 705                  run_midx=True, on_pack_finish=None,
 706                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 707         self.repo_dir = repo_dir or repo()
 708         self.file = None
 709         self.parentfd = None
 710         self.count = 0
 711         self.outbytes = 0
 712         self.filename = None
 713         self.idx = None
 714         self.objcache_maker = objcache_maker
 715         self.objcache = None
 716         self.compression_level = compression_level
 717         self.run_midx=run_midx
 718         self.on_pack_finish = on_pack_finish
 719         if not max_pack_size:
 720             max_pack_size = git_config_get(b'pack.packSizeLimit',
 721                                            repo_dir=self.repo_dir)
 722             if max_pack_size is not None:
 723                 max_pack_size = parse_num(max_pack_size)
 724             if not max_pack_size:
 725                 # larger packs slow down pruning
 726                 max_pack_size = 1000 * 1000 * 1000
 727         self.max_pack_size = max_pack_size
 728         # cache memory usage is about 83 bytes per object
 729         self.max_pack_objects = max_pack_objects if max_pack_objects \
 730                                 else max(1, self.max_pack_size // 5000)
 731
 732     def __del__(self):
 733         self.close()
 734
 735     def __enter__(self):
 736         return self
 737
 738     def __exit__(self, type, value, traceback):
 739         self.close()
 740
 741     def _open(self):
 742         if not self.file:
 743             objdir = dir = os.path.join(self.repo_dir, b'objects')
 744             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 745             try:
 746                 self.file = os.fdopen(fd, 'w+b')
 747             except:
 748                 os.close(fd)
 749                 raise
 750             try:
 751                 self.parentfd = os.open(objdir, os.O_RDONLY)
 752             except:
 753                 f = self.file
 754                 self.file = None
 755                 f.close()
 756                 raise
 757             assert name.endswith(b'.pack')
 758             self.filename = name[:-5]
 759             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 760             self.idx = PackIdxV2Writer()
 761
 762     def _raw_write(self, datalist, sha):
 763         self._open()
 764         f = self.file
 765         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 766         # the file never has a *partial* blob.  So let's make sure it's
 767         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 768         # to our hashsplit algorithm.)  f.write() does its own buffering,
 769         # but that's okay because we'll flush it in _end().
 770         oneblob = b''.join(datalist)
 771         try:
 772             f.write(oneblob)
 773         except IOError as e:
 774             reraise(GitError(e))
 775         nw = len(oneblob)
 776         crc = zlib.crc32(oneblob) & 0xffffffff
 777         self._update_idx(sha, crc, nw)
 778         self.outbytes += nw
 779         self.count += 1
 780         return nw, crc
 781
 782     def _update_idx(self, sha, crc, size):
 783         assert(sha)
 784         if self.idx:
 785             self.idx.add(sha, crc, self.file.tell() - size)
 786
 787     def _write(self, sha, type, content):
 788         if verbose:
 789             log('>')
 790         if not sha:
 791             sha = calc_hash(type, content)
 792         size, crc = self._raw_write(_encode_packobj(type, content,
 793                                                     self.compression_level),
 794                                     sha=sha)
 795         if self.outbytes >= self.max_pack_size \
 796            or self.count >= self.max_pack_objects:
 797             self.breakpoint()
 798         return sha
 799
 800     def breakpoint(self):
 801         """Clear byte and object counts and return the last processed id."""
 802         id = self._end(self.run_midx)
 803         self.outbytes = self.count = 0
 804         return id
 805
 806     def _require_objcache(self):
 807         if self.objcache is None and self.objcache_maker:
 808             self.objcache = self.objcache_maker()
 809         if self.objcache is None:
 810             raise GitError(
 811                     "PackWriter not opened or can't check exists w/o objcache")
 812
 813     def exists(self, id, want_source=False):
 814         """Return non-empty if an object is found in the object cache."""
 815         self._require_objcache()
 816         return self.objcache.exists(id, want_source=want_source)
 817
 818     def just_write(self, sha, type, content):
 819         """Write an object to the pack file without checking for duplication."""
 820         self._write(sha, type, content)
 821         # If nothing else, gc doesn't have/want an objcache
 822         if self.objcache is not None:
 823             self.objcache.add(sha)
 824
 825     def maybe_write(self, type, content):
 826         """Write an object to the pack file if not present and return its id."""
 827         sha = calc_hash(type, content)
 828         if not self.exists(sha):
 829             self._require_objcache()
 830             self.just_write(sha, type, content)
 831         return sha
 832
 833     def new_blob(self, blob):
 834         """Create a blob object in the pack with the supplied content."""
 835         return self.maybe_write(b'blob', blob)
 836
 837     def new_tree(self, shalist):
 838         """Create a tree object in the pack."""
 839         content = tree_encode(shalist)
 840         return self.maybe_write(b'tree', content)
 841
 842     def new_commit(self, tree, parent,
 843                    author, adate_sec, adate_tz,
 844                    committer, cdate_sec, cdate_tz,
 845                    msg):
 846         """Create a commit object in the pack.  The date_sec values must be
 847         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 848         content = create_commit_blob(tree, parent,
 849                                      author, adate_sec, adate_tz,
 850                                      committer, cdate_sec, cdate_tz,
 851                                      msg)
 852         return self.maybe_write(b'commit', content)
 853
 854     def abort(self):
 855         """Remove the pack file from disk."""
 856         f = self.file
 857         if f:
 858             pfd = self.parentfd
 859             self.file = None
 860             self.parentfd = None
 861             self.idx = None
 862             try:
 863                 try:
 864                     os.unlink(self.filename + b'.pack')
 865                 finally:
 866                     f.close()
 867             finally:
 868                 if pfd is not None:
 869                     os.close(pfd)
 870
 871     def _end(self, run_midx=True):
 872         f = self.file
 873         if not f: return None
 874         self.file = None
 875         try:
 876             self.objcache = None
 877             idx = self.idx
 878             self.idx = None
 879
 880             # update object count
 881             f.seek(8)
 882             cp = struct.pack('!i', self.count)
 883             assert(len(cp) == 4)
 884             f.write(cp)
 885
 886             # calculate the pack sha1sum
 887             f.seek(0)
 888             sum = Sha1()
 889             for b in chunkyreader(f):
 890                 sum.update(b)
 891             packbin = sum.digest()
 892             f.write(packbin)
 893             fdatasync(f.fileno())
 894         finally:
 895             f.close()
 896
 897         idx.write(self.filename + b'.idx', packbin)
 898         nameprefix = os.path.join(self.repo_dir,
 899                                   b'objects/pack/pack-' +  hexlify(packbin))
 900         if os.path.exists(self.filename + b'.map'):
 901             os.unlink(self.filename + b'.map')
 902         os.rename(self.filename + b'.pack', nameprefix + b'.pack')
 903         os.rename(self.filename + b'.idx', nameprefix + b'.idx')
 904         try:
 905             os.fsync(self.parentfd)
 906         finally:
 907             os.close(self.parentfd)
 908
 909         if run_midx:
 910             auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
 911
 912         if self.on_pack_finish:
 913             self.on_pack_finish(nameprefix)
 914
 915         return nameprefix
 916
 917     def close(self, run_midx=True):
 918         """Close the pack file and move it to its definitive path."""
 919         return self._end(run_midx=run_midx)
 920
 921
 922 class PackIdxV2Writer:
 923     def __init__(self):
 924         self.idx = list(list() for i in range(256))
 925         self.count = 0
 926
 927     def add(self, sha, crc, offs):
 928         assert(sha)
 929         self.count += 1
 930         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 931
 932     def write(self, filename, packbin):
 933         ofs64_count = 0
 934         for section in self.idx:
 935             for entry in section:
 936                 if entry[2] >= 2**31:
 937                     ofs64_count += 1
 938
 939         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 940         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 941         idx_map = None
 942         idx_f = open(filename, 'w+b')
 943         try:
 944             idx_f.truncate(index_len)
 945             fdatasync(idx_f.fileno())
 946             idx_map = mmap_readwrite(idx_f, close=False)
 947             try:
 948                 count = _helpers.write_idx(filename, idx_map, self.idx,
 949                                            self.count)
 950                 assert(count == self.count)
 951                 idx_map.flush()
 952             finally:
 953                 idx_map.close()
 954         finally:
 955             idx_f.close()
 956
 957         idx_f = open(filename, 'a+b')
 958         try:
 959             idx_f.write(packbin)
 960             idx_f.seek(0)
 961             idx_sum = Sha1()
 962             b = idx_f.read(8 + 4*256)
 963             idx_sum.update(b)
 964
 965             for b in chunkyreader(idx_f, 20 * self.count):
 966                 idx_sum.update(b)
 967
 968             for b in chunkyreader(idx_f):
 969                 idx_sum.update(b)
 970             idx_f.write(idx_sum.digest())
 971             fdatasync(idx_f.fileno())
 972         finally:
 973             idx_f.close()
 974
 975
 976 def list_refs(patterns=None, repo_dir=None,
 977               limit_to_heads=False, limit_to_tags=False):
 978     """Yield (refname, hash) tuples for all repository refs unless
 979     patterns are specified.  In that case, only include tuples for
 980     refs matching those patterns (cf. git-show-ref(1)).  The limits
 981     restrict the result items to refs/heads or refs/tags.  If both
 982     limits are specified, items from both sources will be included.
 983
 984     """
 985     argv = [b'git', b'show-ref']
 986     if limit_to_heads:
 987         argv.append(b'--heads')
 988     if limit_to_tags:
 989         argv.append(b'--tags')
 990     argv.append(b'--')
 991     if patterns:
 992         argv.extend(patterns)
 993     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
 994                          close_fds=True)
 995     out = p.stdout.read().strip()
 996     rv = p.wait()  # not fatal
 997     if rv:
 998         assert(not out)
 999     if out:
1000         for d in out.split(b'\n'):
1001             sha, name = d.split(b' ', 1)
1002             yield name, unhexlify(sha)
1003
1004
def read_ref(refname, repo_dir = None):
    """Return the binary id that the head matching refname points to.

    Returns None when list_refs() finds no match; asserts that at most
    one ref matched.
    """
    found = list_refs(patterns=[refname], repo_dir=repo_dir,
                      limit_to_heads=True)
    # Pull at most two entries so an ambiguous name can be detected.
    matches = tuple(islice(found, 2))
    if not matches:
        return None
    assert(len(matches) == 1)
    _, oid = matches[0]
    return oid
1014
1015
def rev_list_invocation(ref_or_refs, format=None):
    """Return the argv (a list of bytes) for a 'git rev-list' call.

    ref_or_refs is either a single bytes ref or an iterable of them;
    format, when given, is passed as --pretty=format:<format>.  Refs
    beginning with '-' are rejected by assertion so they can't be taken
    for options.
    """
    single = isinstance(ref_or_refs, bytes)
    refs = (ref_or_refs,) if single else ref_or_refs
    argv = [b'git', b'rev-list']
    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    return argv + [b'--']
1030
1031
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".

    Without a format, yield one stripped output line (a hex hash) at a
    time.  With a format, pass it to rev-list and, for each commit, call
    parse(git_stdout) with the stream positioned just after the rev-list
    "commit HASH" header line, yielding (oidx, parse(git_stdout)).
    parse and format must be given together.  Raises GitError if
    rev-list exits with a non-zero status.

    """
    assert bool(parse) == bool(format)
    proc = subprocess.Popen(rev_list_invocation(ref_or_refs, format=format),
                            env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE,
                            close_fds=True)
    out = proc.stdout
    if format:
        # readline() returns an empty value at EOF, which ends the loop.
        for header in iter(out.readline, b''):
            oidx = header.strip()
            if not oidx.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(oidx))
            oidx = oidx[7:]
            assert len(oidx) == 40
            yield oidx, parse(out)
    else:
        for line in out:
            yield line.strip()

    status = proc.wait()
    if status:
        raise GitError('git rev-list returned error %d' % status)
1064
1065
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    committish is first looked up as a head via read_ref(); failing that,
    a 40-character value is treated as a hex object id and looked up in
    the pack indexes.

    Returns the binary (unhexlified) hash if it is found, None if
    'committish' does not correspond to anything, including when a
    40-character value is not valid hex.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Python 2 raises TypeError for non-hex input; Python 3 raises
            # binascii.Error, which is a ValueError subclass.
            return None

        if pL.exists(oid):
            return oid

    return None
1091
1092
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname at newval using 'git update-ref'.

    newval and oldval are binary ids; a falsy oldval is passed to git as
    an empty string.  Only refs under refs/heads/ or refs/tags/ are
    accepted (checked by assertion).
    """
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    old_hex = hexlify(oldval or b'')
    argv = [b'git', b'update-ref', refname, hexlify(newval), old_hex]
    proc = subprocess.Popen(argv, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', proc)
1104
1105
def delete_ref(refname, oldvalue=None):
    """Delete refname with 'git update-ref -d' (see git update-ref(1)).

    When oldvalue is given it is appended to the command line as the
    expected current value.  refname must start with refs/.
    """
    assert refname.startswith(b'refs/')
    argv = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', proc)
1114
1115
def guess_repo(path=None):
    """Choose the default repository and store it in the global "repodir".

    An explicit path always wins.  Otherwise an already-set repodir is
    kept, and failing that BUP_DIR from the environment or ~/.bup is
    used.  This does not fail if the repository doesn't exist; code that
    needs an existing repository normally calls check_repo_or_die()
    instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1130
1131
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The repository location is chosen by guess_repo(path).  Raises
    GitError if the parent directory does not exist or if the target
    exists but is not a directory.  After 'git --bare init', the pack
    index version is pinned to 2 and the reflog is enabled.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.  Every argv
    # element is bytes, matching the other git invocations in this file.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    # Enable the reflog
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
1155
1156
def check_repo_or_die(path=None):
    """Exit the process unless a bup repository probably exists.

    A repository is accepted when <repo>/objects/pack is a directory.
    Otherwise an error is logged and the process exits with status 15
    if the repository path itself is missing, or 14 if something is
    there but isn't a repository.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st and stat.S_ISDIR(pack_st.st_mode):
        return
    if not pack_st and not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1172
1173
def is_suitable_git(ver_str):
    """Classify 'git --version' output (bytes).

    Returns 'unrecognized' when ver_str doesn't start with
    'git version ', 'insufficient' for versions older than 1.5.6
    (including 1.5.6 release candidates), and 'suitable' otherwise.
    If the text after the prefix doesn't begin with a digit, the process
    exits with status 13.
    """
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    version = ver_str[len(prefix):]
    if version.startswith(b'0.'):
        return 'insufficient'
    if version.startswith(b'1.'):
        # Any match means the 1.x release predates 1.5.6.
        too_old = (br'1\.[012345]rc',
                   br'1\.[01234]\.',
                   br'1\.5\.[012345]($|\.)',
                   br'1\.5\.6-rc')
        if any(re.match(pattern, version) for pattern in too_old):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', version):
        return 'suitable'
    sys.exit(13)
1193
# Becomes True once the installed git has been accepted, so the check
# only runs once per process.
_git_great = None

def require_suitable_git(ver_str=None):
    """Verify that git is new enough, caching a positive result.

    Rely on ver_str when provided, rather than invoking the git in the
    path.  The check is skipped entirely when BUP_GIT_VERSION_IS_FINE is
    set to yes, true, or 1.  Raises GitError if the version output is
    unrecognized; logs an error and exits with status 1 if the version
    is insufficient.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1222
1223
class _AbortableIter:
    """Iterator wrapper that runs a callback if iteration ends abnormally.

    onabort is invoked at most once: when the wrapped iterator raises
    anything other than StopIteration, when abort() is called before
    the iterator is exhausted, or when the wrapper is finalized before
    exhaustion.
    """
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            item = next(self.it)
        except StopIteration:
            # Normal exhaustion: nothing to abort any more.
            self.done = True
            raise
        except:
            self.abort()
            raise
        return item

    next = __next__

    def abort(self):
        """Mark the iteration finished, firing onabort if it wasn't yet."""
        if self.done:
            return
        self.done = True
        callback = self.onabort
        if callback:
            callback()

    def __del__(self):
        self.abort()
1254
1255
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    The 'git cat-file --batch' subprocess is started lazily by get() and
    restarted if it has exited.  Only one get() may be in progress at a
    time; self.inprogress holds the ref currently being read.
    """
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the subprocess, if there is one.

        With wait=True, also wait for the subprocess to exit and return
        its exit status.  Returns None when wait is false or when no
        subprocess was ever started.
        """
        p = self.p
        if p:
            p.stdout.close()
            p.stdin.close()
        self.p = None
        self.inprogress = None
        # A CatPipe that never served a get() has no process to wait for.
        if wait and p:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Close any existing subprocess and start a fresh cat-file --batch."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Raises GitError if the header line from git is not of the form
        "<40-char id> <type> <size>".  ref must not contain newlines or
        carriage returns, or start with '-'.

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        # If reading the object's data is cut short, close the pipe so the
        # unread remainder can't be mistaken for the next reply.
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self.close)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _join(self, it):
        """Yield the blob data reachable from the object streamed by it.

        it must be an iterator as returned by get().  Blobs yield their
        own data, trees recurse into every entry, and commits recurse
        into their tree.  Raises GitError for any other object type.
        """
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of the commit is expected to be "tree <hex id>".
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1352
1353
# CatPipe instances, keyed by absolute repository path.
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating and caching it if needed.

    Without repo_dir, the global repodir (or repo()) is used.
    """
    global _cp, repodir
    if not repo_dir:
        repo_dir = repodir or repo()
    key = os.path.abspath(repo_dir)
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1367
1368
def close_catpipes():
    """Close every cached CatPipe, waiting for each, and empty the cache."""
    # FIXME: chain exceptions
    while _cp:
        pipe = _cp.popitem()[1]
        pipe.close(wait=True)
1374
1375
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Tag names have their refs/tags/ prefix removed; the keys are the
    hashes as yielded by list_refs().
    """
    by_oid = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        # more than one tag can point at the same object
        by_oid.setdefault(oid, []).append(refname[10:])
    return by_oid
1386
1387
class MissingObject(KeyError):
    """KeyError raised for an object id that isn't in the repository.

    The binary id is available as the oid attribute.
    """
    def __init__(self, oid):
        self.oid = oid
        message = 'object %r is missing' % hexlify(oid)
        KeyError.__init__(self, message)
1392
1393
# Record yielded by walk_object() for each object it visits.
#
# path is the mangled path.  When an item is a fragment of a chunked
# file, chunk_path is the path of that fragment within the file's
# chunk subtree, i.e. ['', '2d3115e', ...]; the top-level item for a
# chunked file has a chunk_path of [''].  So some chunk subtree of the
# file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
WalkItem = namedtuple('WalkItem', 'oid type mode path chunk_path data')
1406
1407
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    oidx is a hex object id.  Each yielded WalkItem carries the binary
    id in oid.  Objects are visited in stack (last pushed, first
    visited) order.  Raises Exception if get_ref reports a type other
    than blob, commit, or tree.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (hex id, path, chunk_path, mode); the starting object
    # has no known mode.
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        # The first value from get_ref is the (oidx, type, size) header;
        # a false oidx there means the object is missing.
        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full because their
            # content is parsed below to find further objects.
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            # Parents go on the stack first and the tree last, so the tree
            # is the next thing popped.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the file's path stays
                    # fixed and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top of a chunked file's subtree.
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))