lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12 from numbers import Integral
  13
  14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  15 from bup.compat import (buffer,
  16                         byte_int, bytes_from_byte, bytes_from_uint,
  17                         environ,
  18                         items,
  19                         range,
  20                         reraise)
  21 from bup.io import path_msg
  22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  23                          exo,
  24                          fdatasync,
  25                          hostname, localtime, log,
  26                          merge_dict,
  27                          merge_iter,
  28                          mmap_read, mmap_readwrite,
  29                          parse_num,
  30                          progress, qprogress, stat_if_exists,
  31                          unlink,
  32                          utc_offset_str)
  33 from bup.pwdgrp import username, userfullname
  34
  35
  36 verbose = 0
  37 repodir = None  # The default repository, once initialized
  38
  39 _typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
  40 _typermap = {v: k for k, v in items(_typemap)}
  41
  42
  43 _total_searches = 0
  44 _total_steps = 0
  45
  46
  47 class GitError(Exception):
  48     pass
  49
  50
  51 def _gitenv(repo_dir=None):
  52     if not repo_dir:
  53         repo_dir = repo()
  54     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  55
  56 def _git_wait(cmd, p):
  57     rv = p.wait()
  58     if rv != 0:
  59         raise GitError('%r returned %d' % (cmd, rv))
  60
  61 def _git_exo(cmd, **kwargs):
  62     kwargs['check'] = False
  63     result = exo(cmd, **kwargs)
  64     _, _, proc = result
  65     if proc.returncode != 0:
  66         raise GitError('%r returned %d' % (cmd, proc.returncode))
  67     return result
  68
  69 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  70     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  71     cmd = [b'git', b'config', b'--null']
  72     if cfg_file:
  73         cmd.extend([b'--file', cfg_file])
  74     if opttype == 'int':
  75         cmd.extend([b'--int'])
  76     elif opttype == 'bool':
  77         cmd.extend([b'--bool'])
  78     else:
  79         assert opttype is None
  80     cmd.extend([b'--get', option])
  81     env=None
  82     if repo_dir:
  83         env = _gitenv(repo_dir=repo_dir)
  84     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  85                          close_fds=True)
  86     # with --null, git writes out a trailing \0 after the value
  87     r = p.stdout.read()[:-1]
  88     rc = p.wait()
  89     if rc == 0:
  90         if opttype == 'int':
  91             return int(r)
  92         elif opttype == 'bool':
  93             # git converts to 'true' or 'false'
  94             return r == b'true'
  95         return r
  96     if rc != 1:
  97         raise GitError('%r returned %d' % (cmd, rc))
  98     return None
  99
 100
 101 def parse_tz_offset(s):
 102     """UTC offset in seconds."""
 103     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 104     if bytes_from_byte(s[0]) == b'-':
 105         return - tz_off
 106     return tz_off
 107
 108
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Character allowed as the first or last character of a name/mail string.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# Character allowed in the interior of a name/mail string.
_content_char = br'[^\0\n<>]'
# A name/mail string: either one or two start/end characters, or a
# start/end character, any run of content characters, and a final
# start/end character.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone offset: sign, two hour digits, two minute digits (00-59).
_tz_rx = br'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Whole-commit pattern.  Named groups: tree, parents (zero or more parent
# lines), author_name/author_mail/asec/atz, committer_name/committer_mail/
# csec/ctz, an optional mergetag, and, after a blank line, message (the
# rest of the content, newlines included).
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts the hash from each parent line of the 'parents' group.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 130
 131 # Note that the author_sec and committer_sec values are (UTC) epoch
 132 # seconds, and for now the mergetag is not included.
 133 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 134                                        'author_name', 'author_mail',
 135                                        'author_sec', 'author_offset',
 136                                        'committer_name', 'committer_mail',
 137                                        'committer_sec', 'committer_offset',
 138                                        'message'])
 139
 140 def parse_commit(content):
 141     commit_match = re.match(_commit_rx, content)
 142     if not commit_match:
 143         raise Exception('cannot parse commit %r' % content)
 144     matches = commit_match.groupdict()
 145     return CommitInfo(tree=matches['tree'],
 146                       parents=re.findall(_parent_hash_rx, matches['parents']),
 147                       author_name=matches['author_name'],
 148                       author_mail=matches['author_mail'],
 149                       author_sec=int(matches['asec']),
 150                       author_offset=parse_tz_offset(matches['atz']),
 151                       committer_name=matches['committer_name'],
 152                       committer_mail=matches['committer_mail'],
 153                       committer_sec=int(matches['csec']),
 154                       committer_offset=parse_tz_offset(matches['ctz']),
 155                       message=matches['message'])
 156
 157
 158 def get_cat_data(cat_iterator, expected_type):
 159     _, kind, _ = next(cat_iterator)
 160     if kind != expected_type:
 161         raise Exception('expected %r, saw %r' % (expected_type, kind))
 162     return b''.join(cat_iterator)
 163
 164 def get_commit_items(id, cp):
 165     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 166
 167 def _local_git_date_str(epoch_sec):
 168     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 169
 170
 171 def _git_date_str(epoch_sec, tz_offset_sec):
 172     offs =  tz_offset_sec // 60
 173     return b'%d %s%02d%02d' \
 174         % (epoch_sec,
 175            b'+' if offs >= 0 else b'-',
 176            abs(offs) // 60,
 177            abs(offs) % 60)
 178
 179
 180 def repo(sub = b'', repo_dir=None):
 181     """Get the path to the git repository or one of its subdirectories."""
 182     repo_dir = repo_dir or repodir
 183     if not repo_dir:
 184         raise GitError('You should call check_repo_or_die()')
 185
 186     # If there's a .git subdirectory, then the actual repo is in there.
 187     gd = os.path.join(repo_dir, b'.git')
 188     if os.path.exists(gd):
 189         repo_dir = gd
 190
 191     return os.path.join(repo_dir, sub)
 192
 193
 194 _shorten_hash_rx = \
 195     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 196
 197 def shorten_hash(s):
 198     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 199
 200
 201 def repo_rel(path):
 202     full = os.path.abspath(path)
 203     fullrepo = os.path.abspath(repo(b''))
 204     if not fullrepo.endswith(b'/'):
 205         fullrepo += b'/'
 206     if full.startswith(fullrepo):
 207         path = full[len(fullrepo):]
 208     if path.startswith(b'index-cache/'):
 209         path = path[len(b'index-cache/'):]
 210     return shorten_hash(path)
 211
 212
 213 def auto_midx(objdir):
 214     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 215     try:
 216         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 217     except OSError as e:
 218         # make sure 'args' gets printed to help with debugging
 219         add_error('%r: exception: %s' % (args, e))
 220         raise
 221     if rv:
 222         add_error('%r: returned %d' % (args, rv))
 223
 224     args = [path.exe(), b'bloom', b'--dir', objdir]
 225     try:
 226         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 227     except OSError as e:
 228         # make sure 'args' gets printed to help with debugging
 229         add_error('%r: exception: %s' % (args, e))
 230         raise
 231     if rv:
 232         add_error('%r: returned %d' % (args, rv))
 233
 234
 235 def mangle_name(name, mode, gitmode):
 236     """Mangle a file name to present an abstract name for segmented files.
 237     Mangled file names will have the ".bup" extension added to them. If a
 238     file's name already ends with ".bup", a ".bupl" extension is added to
 239     disambiguate normal files from segmented ones.
 240     """
 241     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 242         assert(stat.S_ISDIR(gitmode))
 243         return name + b'.bup'
 244     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 245         return name + b'.bupl'
 246     else:
 247         return name
 248
 249
 250 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 251 def demangle_name(name, mode):
 252     """Remove name mangling from a file name, if necessary.
 253
 254     The return value is a tuple (demangled_filename,mode), where mode is one of
 255     the following:
 256
 257     * BUP_NORMAL  : files that should be read as-is from the repository
 258     * BUP_CHUNKED : files that were chunked and need to be reassembled
 259
 260     For more information on the name mangling algorithm, see mangle_name()
 261     """
 262     if name.endswith(b'.bupl'):
 263         return (name[:-5], BUP_NORMAL)
 264     elif name.endswith(b'.bup'):
 265         return (name[:-4], BUP_CHUNKED)
 266     elif name.endswith(b'.bupm'):
 267         return (name[:-5],
 268                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 269     else:
 270         return (name, BUP_NORMAL)
 271
 272
 273 def calc_hash(type, content):
 274     """Calculate some content's hash in the Git fashion."""
 275     header = b'%s %d\0' % (type, len(content))
 276     sum = Sha1(header)
 277     sum.update(content)
 278     return sum.digest()
 279
 280
 281 def shalist_item_sort_key(ent):
 282     (mode, name, id) = ent
 283     assert(mode+0 == mode)
 284     if stat.S_ISDIR(mode):
 285         return name + b'/'
 286     else:
 287         return name
 288
 289
 290 def tree_encode(shalist):
 291     """Generate a git tree object from (mode,name,hash) tuples."""
 292     shalist = sorted(shalist, key = shalist_item_sort_key)
 293     l = []
 294     for (mode,name,bin) in shalist:
 295         assert(mode)
 296         assert(mode+0 == mode)
 297         assert(name)
 298         assert(len(bin) == 20)
 299         s = b'%o %s\0%s' % (mode,name,bin)
 300         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 301         l.append(s)
 302     return b''.join(l)
 303
 304
 305 def tree_decode(buf):
 306     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 307     ofs = 0
 308     while ofs < len(buf):
 309         z = buf.find(b'\0', ofs)
 310         assert(z > ofs)
 311         spl = buf[ofs:z].split(b' ', 1)
 312         assert(len(spl) == 2)
 313         mode,name = spl
 314         sha = buf[z+1:z+1+20]
 315         ofs = z+1+20
 316         yield (int(mode, 8), name, sha)
 317
 318
 319 def _encode_packobj(type, content, compression_level=1):
 320     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 321         raise ValueError('invalid compression level %s' % compression_level)
 322     szout = b''
 323     sz = len(content)
 324     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 325     sz >>= 4
 326     while 1:
 327         if sz: szbits |= 0x80
 328         szout += bytes_from_uint(szbits)
 329         if not sz:
 330             break
 331         szbits = sz & 0x7f
 332         sz >>= 7
 333     z = zlib.compressobj(compression_level)
 334     yield szout
 335     yield z.compress(content)
 336     yield z.flush()
 337
 338
 339 def _decode_packobj(buf):
 340     assert(buf)
 341     c = byte_int(buf[0])
 342     type = _typermap[(c & 0x70) >> 4]
 343     sz = c & 0x0f
 344     shift = 4
 345     i = 0
 346     while c & 0x80:
 347         i += 1
 348         c = byte_int(buf[i])
 349         sz |= (c & 0x7f) << shift
 350         shift += 7
 351         if not (c & 0x80):
 352             break
 353     return (type, zlib.decompress(buf[i+1:]))
 354
 355
 356 class PackIdx:
 357     def __init__(self):
 358         assert(0)
 359
 360     def find_offset(self, hash):
 361         """Get the offset of an object inside the index file."""
 362         idx = self._idx_from_hash(hash)
 363         if idx != None:
 364             return self._ofs_from_idx(idx)
 365         return None
 366
 367     def exists(self, hash, want_source=False):
 368         """Return nonempty if the object exists in this index."""
 369         if hash and (self._idx_from_hash(hash) != None):
 370             return want_source and os.path.basename(self.name) or True
 371         return None
 372
 373     def _idx_from_hash(self, hash):
 374         global _total_searches, _total_steps
 375         _total_searches += 1
 376         assert(len(hash) == 20)
 377         b1 = byte_int(hash[0])
 378         start = self.fanout[b1-1] # range -1..254
 379         end = self.fanout[b1] # range 0..255
 380         want = hash
 381         _total_steps += 1  # lookup table is a step
 382         while start < end:
 383             _total_steps += 1
 384             mid = start + (end - start) // 2
 385             v = self._idx_to_hash(mid)
 386             if v < want:
 387                 start = mid+1
 388             elif v > want:
 389                 end = mid
 390             else: # got it!
 391                 return mid
 392         return None
 393
 394
 395 class PackIdxV1(PackIdx):
 396     """Object representation of a Git pack index (version 1) file."""
 397     def __init__(self, filename, f):
 398         self.name = filename
 399         self.idxnames = [self.name]
 400         self.map = mmap_read(f)
 401         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 402         self.fanout = array('L', struct.unpack('!256I', self.map))
 403         self.fanout.append(0)  # entry "-1"
 404         self.nsha = self.fanout[255]
 405         self.sha_ofs = 256 * 4
 406         # Avoid slicing shatable for individual hashes (very high overhead)
 407         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 408
 409     def __enter__(self):
 410         return self
 411
 412     def __exit__(self, type, value, traceback):
 413         self.close()
 414
 415     def __len__(self):
 416         return int(self.nsha)  # int() from long for python 2
 417
 418     def _ofs_from_idx(self, idx):
 419         if idx >= self.nsha or idx < 0:
 420             raise IndexError('invalid pack index index %d' % idx)
 421         ofs = self.sha_ofs + idx * 24
 422         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 423
 424     def _idx_to_hash(self, idx):
 425         if idx >= self.nsha or idx < 0:
 426             raise IndexError('invalid pack index index %d' % idx)
 427         ofs = self.sha_ofs + idx * 24 + 4
 428         return self.map[ofs : ofs + 20]
 429
 430     def __iter__(self):
 431         start = self.sha_ofs + 4
 432         for ofs in range(start, start + 24 * self.nsha, 24):
 433             yield self.map[ofs : ofs + 20]
 434
 435     def close(self):
 436         if self.map is not None:
 437             self.shatable = None
 438             self.map.close()
 439             self.map = None
 440
 441
 442 class PackIdxV2(PackIdx):
 443     """Object representation of a Git pack index (version 2) file."""
 444     def __init__(self, filename, f):
 445         self.name = filename
 446         self.idxnames = [self.name]
 447         self.map = mmap_read(f)
 448         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 449         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 450         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 451         self.fanout.append(0)
 452         self.nsha = self.fanout[255]
 453         self.sha_ofs = 8 + 256*4
 454         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 455         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 456         # Avoid slicing this for individual hashes (very high overhead)
 457         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 458
 459     def __enter__(self):
 460         return self
 461
 462     def __exit__(self, type, value, traceback):
 463         self.close()
 464
 465     def __len__(self):
 466         return int(self.nsha)  # int() from long for python 2
 467
 468     def _ofs_from_idx(self, idx):
 469         if idx >= self.nsha or idx < 0:
 470             raise IndexError('invalid pack index index %d' % idx)
 471         ofs_ofs = self.ofstable_ofs + idx * 4
 472         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 473         if ofs & 0x80000000:
 474             idx64 = ofs & 0x7fffffff
 475             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 476             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 477         return ofs
 478
 479     def _idx_to_hash(self, idx):
 480         if idx >= self.nsha or idx < 0:
 481             raise IndexError('invalid pack index index %d' % idx)
 482         ofs = self.sha_ofs + idx * 20
 483         return self.map[ofs : ofs + 20]
 484
 485     def __iter__(self):
 486         start = self.sha_ofs
 487         for ofs in range(start, start + 20 * self.nsha, 20):
 488             yield self.map[ofs : ofs + 20]
 489
 490     def close(self):
 491         if self.map is not None:
 492             self.shatable = None
 493             self.map.close()
 494             self.map = None
 495
 496
 497 _mpi_count = 0
 498 class PackIdxList:
 499     def __init__(self, dir, ignore_midx=False):
 500         global _mpi_count
 501         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 502         _mpi_count += 1
 503         self.dir = dir
 504         self.also = set()
 505         self.packs = []
 506         self.do_bloom = False
 507         self.bloom = None
 508         self.ignore_midx = ignore_midx
 509         self.refresh()
 510
 511     def __del__(self):
 512         global _mpi_count
 513         _mpi_count -= 1
 514         assert(_mpi_count == 0)
 515
 516     def __iter__(self):
 517         return iter(idxmerge(self.packs))
 518
 519     def __len__(self):
 520         return sum(len(pack) for pack in self.packs)
 521
 522     def exists(self, hash, want_source=False):
 523         """Return nonempty if the object exists in the index files."""
 524         global _total_searches
 525         _total_searches += 1
 526         if hash in self.also:
 527             return True
 528         if self.do_bloom and self.bloom:
 529             if self.bloom.exists(hash):
 530                 self.do_bloom = False
 531             else:
 532                 _total_searches -= 1  # was counted by bloom
 533                 return None
 534         for i in range(len(self.packs)):
 535             p = self.packs[i]
 536             _total_searches -= 1  # will be incremented by sub-pack
 537             ix = p.exists(hash, want_source=want_source)
 538             if ix:
 539                 # reorder so most recently used packs are searched first
 540                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 541                 return ix
 542         self.do_bloom = True
 543         return None
 544
 545     def refresh(self, skip_midx = False):
 546         """Refresh the index list.
 547         This method verifies if .midx files were superseded (e.g. all of its
 548         contents are in another, bigger .midx file) and removes the superseded
 549         files.
 550
 551         If skip_midx is True, all work on .midx files will be skipped and .midx
 552         files will be removed from the list.
 553
 554         The instance variable 'ignore_midx' can force this function to
 555         always act as if skip_midx was True.
 556         """
 557         if self.bloom is not None:
 558             self.bloom.close()
 559         self.bloom = None # Always reopen the bloom as it may have been relaced
 560         self.do_bloom = False
 561         skip_midx = skip_midx or self.ignore_midx
 562         d = dict((p.name, p) for p in self.packs
 563                  if not skip_midx or not isinstance(p, midx.PackMidx))
 564         if os.path.exists(self.dir):
 565             if not skip_midx:
 566                 midxl = []
 567                 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
 568                 # remove any *.midx files from our list that no longer exist
 569                 for ix in list(d.values()):
 570                     if not isinstance(ix, midx.PackMidx):
 571                         continue
 572                     if ix.name in midxes:
 573                         continue
 574                     # remove the midx
 575                     del d[ix.name]
 576                     ix.close()
 577                     self.packs.remove(ix)
 578                 for ix in self.packs:
 579                     if isinstance(ix, midx.PackMidx):
 580                         for name in ix.idxnames:
 581                             d[os.path.join(self.dir, name)] = ix
 582                 for full in midxes:
 583                     if not d.get(full):
 584                         mx = midx.PackMidx(full)
 585                         (mxd, mxf) = os.path.split(mx.name)
 586                         broken = False
 587                         for n in mx.idxnames:
 588                             if not os.path.exists(os.path.join(mxd, n)):
 589                                 log(('warning: index %s missing\n'
 590                                      '  used by %s\n')
 591                                     % (path_msg(n), path_msg(mxf)))
 592                                 broken = True
 593                         if broken:
 594                             mx.close()
 595                             del mx
 596                             unlink(full)
 597                         else:
 598                             midxl.append(mx)
 599                 midxl.sort(key=lambda ix:
 600                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 601                 for ix in midxl:
 602                     any_needed = False
 603                     for sub in ix.idxnames:
 604                         found = d.get(os.path.join(self.dir, sub))
 605                         if not found or isinstance(found, PackIdx):
 606                             # doesn't exist, or exists but not in a midx
 607                             any_needed = True
 608                             break
 609                     if any_needed:
 610                         d[ix.name] = ix
 611                         for name in ix.idxnames:
 612                             d[os.path.join(self.dir, name)] = ix
 613                     elif not ix.force_keep:
 614                         debug1('midx: removing redundant: %s\n'
 615                                % path_msg(os.path.basename(ix.name)))
 616                         ix.close()
 617                         unlink(ix.name)
 618             for full in glob.glob(os.path.join(self.dir, b'*.idx')):
 619                 if not d.get(full):
 620                     try:
 621                         ix = open_idx(full)
 622                     except GitError as e:
 623                         add_error(e)
 624                         continue
 625                     d[full] = ix
 626             bfull = os.path.join(self.dir, b'bup.bloom')
 627             if self.bloom is None and os.path.exists(bfull):
 628                 self.bloom = bloom.ShaBloom(bfull)
 629             self.packs = list(set(d.values()))
 630             self.packs.sort(reverse=True, key=lambda x: len(x))
 631             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 632                 self.do_bloom = True
 633             else:
 634                 self.bloom = None
 635         debug1('PackIdxList: using %d index%s.\n'
 636             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 637
 638     def add(self, hash):
 639         """Insert an additional object in the list."""
 640         self.also.add(hash)
 641
 642
 643 def open_idx(filename):
 644     if filename.endswith(b'.idx'):
 645         f = open(filename, 'rb')
 646         header = f.read(8)
 647         if header[0:4] == b'\377tOc':
 648             version = struct.unpack('!I', header[4:8])[0]
 649             if version == 2:
 650                 return PackIdxV2(filename, f)
 651             else:
 652                 raise GitError('%s: expected idx file version 2, got %d'
 653                                % (path_msg(filename), version))
 654         elif len(header) == 8 and header[0:4] < b'\377tOc':
 655             return PackIdxV1(filename, f)
 656         else:
 657             raise GitError('%s: unrecognized idx file header'
 658                            % path_msg(filename))
 659     elif filename.endswith(b'.midx'):
 660         return midx.PackMidx(filename)
 661     else:
 662         raise GitError('idx filenames must end with .idx or .midx')
 663
 664
 665 def idxmerge(idxlist, final_progress=True):
 666     """Generate a list of all the objects reachable in a PackIdxList."""
 667     def pfunc(count, total):
 668         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 669                   % (count*100.0/total, count, total))
 670     def pfinal(count, total):
 671         if final_progress:
 672             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 673                      % (100, total, total))
 674     return merge_iter(idxlist, 10024, pfunc, pfinal)
 675
 676
 677 def create_commit_blob(tree, parent,
 678                        author, adate_sec, adate_tz,
 679                        committer, cdate_sec, cdate_tz,
 680                        msg):
 681     if adate_tz is not None:
 682         adate_str = _git_date_str(adate_sec, adate_tz)
 683     else:
 684         adate_str = _local_git_date_str(adate_sec)
 685     if cdate_tz is not None:
 686         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 687     else:
 688         cdate_str = _local_git_date_str(cdate_sec)
 689     l = []
 690     if tree: l.append(b'tree %s' % hexlify(tree))
 691     if parent: l.append(b'parent %s' % hexlify(parent))
 692     if author: l.append(b'author %s %s' % (author, adate_str))
 693     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 694     l.append(b'')
 695     l.append(msg)
 696     return b'\n'.join(l)
 697
 698
 699 def _make_objcache():
 700     return PackIdxList(repo(b'objects/pack'))
 701
 702 # bup-gc assumes that it can disable all PackWriter activities
 703 # (bloom/midx/cache) via the constructor and close() arguments.
 704
 705 class PackWriter:
 706     """Writes Git objects inside a pack file."""
 707     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 708                  run_midx=True, on_pack_finish=None,
 709                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 710         self.repo_dir = repo_dir or repo()
 711         self.file = None
 712         self.parentfd = None
 713         self.count = 0
 714         self.outbytes = 0
 715         self.filename = None
 716         self.idx = None
 717         self.objcache_maker = objcache_maker
 718         self.objcache = None
 719         self.compression_level = compression_level
 720         self.run_midx=run_midx
 721         self.on_pack_finish = on_pack_finish
 722         if not max_pack_size:
 723             max_pack_size = git_config_get(b'pack.packSizeLimit',
 724                                            repo_dir=self.repo_dir,
 725                                            opttype='int')
 726             if not max_pack_size:
 727                 # larger packs slow down pruning
 728                 max_pack_size = 1000 * 1000 * 1000
 729         self.max_pack_size = max_pack_size
 730         # cache memory usage is about 83 bytes per object
 731         self.max_pack_objects = max_pack_objects if max_pack_objects \
 732                                 else max(1, self.max_pack_size // 5000)
 733
 734     def __del__(self):
 735         self.close()
 736
 737     def __enter__(self):
 738         return self
 739
 740     def __exit__(self, type, value, traceback):
 741         self.close()
 742
 743     def _open(self):
 744         if not self.file:
 745             objdir = dir = os.path.join(self.repo_dir, b'objects')
 746             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 747             try:
 748                 self.file = os.fdopen(fd, 'w+b')
 749             except:
 750                 os.close(fd)
 751                 raise
 752             try:
 753                 self.parentfd = os.open(objdir, os.O_RDONLY)
 754             except:
 755                 f = self.file
 756                 self.file = None
 757                 f.close()
 758                 raise
 759             assert name.endswith(b'.pack')
 760             self.filename = name[:-5]
 761             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 762             self.idx = PackIdxV2Writer()
 763
 764     def _raw_write(self, datalist, sha):
 765         self._open()
 766         f = self.file
 767         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 768         # the file never has a *partial* blob.  So let's make sure it's
 769         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 770         # to our hashsplit algorithm.)  f.write() does its own buffering,
 771         # but that's okay because we'll flush it in _end().
 772         oneblob = b''.join(datalist)
 773         try:
 774             f.write(oneblob)
 775         except IOError as e:
 776             reraise(GitError(e))
 777         nw = len(oneblob)
 778         crc = zlib.crc32(oneblob) & 0xffffffff
 779         self._update_idx(sha, crc, nw)
 780         self.outbytes += nw
 781         self.count += 1
 782         return nw, crc
 783
 784     def _update_idx(self, sha, crc, size):
 785         assert(sha)
 786         if self.idx:
 787             self.idx.add(sha, crc, self.file.tell() - size)
 788
 789     def _write(self, sha, type, content):
 790         if verbose:
 791             log('>')
 792         if not sha:
 793             sha = calc_hash(type, content)
 794         size, crc = self._raw_write(_encode_packobj(type, content,
 795                                                     self.compression_level),
 796                                     sha=sha)
 797         if self.outbytes >= self.max_pack_size \
 798            or self.count >= self.max_pack_objects:
 799             self.breakpoint()
 800         return sha
 801
 802     def breakpoint(self):
 803         """Clear byte and object counts and return the last processed id."""
 804         id = self._end(self.run_midx)
 805         self.outbytes = self.count = 0
 806         return id
 807
 808     def _require_objcache(self):
 809         if self.objcache is None and self.objcache_maker:
 810             self.objcache = self.objcache_maker()
 811         if self.objcache is None:
 812             raise GitError(
 813                     "PackWriter not opened or can't check exists w/o objcache")
 814
 815     def exists(self, id, want_source=False):
 816         """Return non-empty if an object is found in the object cache."""
 817         self._require_objcache()
 818         return self.objcache.exists(id, want_source=want_source)
 819
 820     def just_write(self, sha, type, content):
 821         """Write an object to the pack file without checking for duplication."""
 822         self._write(sha, type, content)
 823         # If nothing else, gc doesn't have/want an objcache
 824         if self.objcache is not None:
 825             self.objcache.add(sha)
 826
 827     def maybe_write(self, type, content):
 828         """Write an object to the pack file if not present and return its id."""
 829         sha = calc_hash(type, content)
 830         if not self.exists(sha):
 831             self._require_objcache()
 832             self.just_write(sha, type, content)
 833         return sha
 834
 835     def new_blob(self, blob):
 836         """Create a blob object in the pack with the supplied content."""
 837         return self.maybe_write(b'blob', blob)
 838
 839     def new_tree(self, shalist):
 840         """Create a tree object in the pack."""
 841         content = tree_encode(shalist)
 842         return self.maybe_write(b'tree', content)
 843
 844     def new_commit(self, tree, parent,
 845                    author, adate_sec, adate_tz,
 846                    committer, cdate_sec, cdate_tz,
 847                    msg):
 848         """Create a commit object in the pack.  The date_sec values must be
 849         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 850         content = create_commit_blob(tree, parent,
 851                                      author, adate_sec, adate_tz,
 852                                      committer, cdate_sec, cdate_tz,
 853                                      msg)
 854         return self.maybe_write(b'commit', content)
 855
 856     def abort(self):
 857         """Remove the pack file from disk."""
 858         f = self.file
 859         if f:
 860             pfd = self.parentfd
 861             self.file = None
 862             self.parentfd = None
 863             self.idx = None
 864             try:
 865                 try:
 866                     os.unlink(self.filename + b'.pack')
 867                 finally:
 868                     f.close()
 869             finally:
 870                 if pfd is not None:
 871                     os.close(pfd)
 872
 873     def _end(self, run_midx=True):
 874         f = self.file
 875         if not f: return None
 876         self.file = None
 877         try:
 878             self.objcache = None
 879             idx = self.idx
 880             self.idx = None
 881
 882             # update object count
 883             f.seek(8)
 884             cp = struct.pack('!i', self.count)
 885             assert(len(cp) == 4)
 886             f.write(cp)
 887
 888             # calculate the pack sha1sum
 889             f.seek(0)
 890             sum = Sha1()
 891             for b in chunkyreader(f):
 892                 sum.update(b)
 893             packbin = sum.digest()
 894             f.write(packbin)
 895             fdatasync(f.fileno())
 896         finally:
 897             f.close()
 898
 899         idx.write(self.filename + b'.idx', packbin)
 900         nameprefix = os.path.join(self.repo_dir,
 901                                   b'objects/pack/pack-' +  hexlify(packbin))
 902         if os.path.exists(self.filename + b'.map'):
 903             os.unlink(self.filename + b'.map')
 904         os.rename(self.filename + b'.pack', nameprefix + b'.pack')
 905         os.rename(self.filename + b'.idx', nameprefix + b'.idx')
 906         try:
 907             os.fsync(self.parentfd)
 908         finally:
 909             os.close(self.parentfd)
 910
 911         if run_midx:
 912             auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
 913
 914         if self.on_pack_finish:
 915             self.on_pack_finish(nameprefix)
 916
 917         return nameprefix
 918
 919     def close(self, run_midx=True):
 920         """Close the pack file and move it to its definitive path."""
 921         return self._end(run_midx=run_midx)
 922
 923
 924 class PackIdxV2Writer:
 925     def __init__(self):
 926         self.idx = list(list() for i in range(256))
 927         self.count = 0
 928
 929     def add(self, sha, crc, offs):
 930         assert(sha)
 931         self.count += 1
 932         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 933
 934     def write(self, filename, packbin):
 935         ofs64_count = 0
 936         for section in self.idx:
 937             for entry in section:
 938                 if entry[2] >= 2**31:
 939                     ofs64_count += 1
 940
 941         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 942         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 943         idx_map = None
 944         idx_f = open(filename, 'w+b')
 945         try:
 946             idx_f.truncate(index_len)
 947             fdatasync(idx_f.fileno())
 948             idx_map = mmap_readwrite(idx_f, close=False)
 949             try:
 950                 count = _helpers.write_idx(filename, idx_map, self.idx,
 951                                            self.count)
 952                 assert(count == self.count)
 953                 idx_map.flush()
 954             finally:
 955                 idx_map.close()
 956         finally:
 957             idx_f.close()
 958
 959         idx_f = open(filename, 'a+b')
 960         try:
 961             idx_f.write(packbin)
 962             idx_f.seek(0)
 963             idx_sum = Sha1()
 964             b = idx_f.read(8 + 4*256)
 965             idx_sum.update(b)
 966
 967             for b in chunkyreader(idx_f, 20 * self.count):
 968                 idx_sum.update(b)
 969
 970             for b in chunkyreader(idx_f):
 971                 idx_sum.update(b)
 972             idx_f.write(idx_sum.digest())
 973             fdatasync(idx_f.fileno())
 974         finally:
 975             idx_f.close()
 976
 977
 978 def list_refs(patterns=None, repo_dir=None,
 979               limit_to_heads=False, limit_to_tags=False):
 980     """Yield (refname, hash) tuples for all repository refs unless
 981     patterns are specified.  In that case, only include tuples for
 982     refs matching those patterns (cf. git-show-ref(1)).  The limits
 983     restrict the result items to refs/heads or refs/tags.  If both
 984     limits are specified, items from both sources will be included.
 985
 986     """
 987     argv = [b'git', b'show-ref']
 988     if limit_to_heads:
 989         argv.append(b'--heads')
 990     if limit_to_tags:
 991         argv.append(b'--tags')
 992     argv.append(b'--')
 993     if patterns:
 994         argv.extend(patterns)
 995     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
 996                          close_fds=True)
 997     out = p.stdout.read().strip()
 998     rv = p.wait()  # not fatal
 999     if rv:
1000         assert(not out)
1001     if out:
1002         for d in out.split(b'\n'):
1003             sha, name = d.split(b' ', 1)
1004             yield name, unhexlify(sha)
1005
1006
def read_ref(refname, repo_dir = None):
    """Return the hash that list_refs() reports for refname among the
    heads, or None if nothing matches.  More than one match is
    treated as an error (assertion)."""
    found = list_refs(patterns=[refname], repo_dir=repo_dir,
                      limit_to_heads=True)
    # Two items are enough to tell whether the match is unique.
    matches = tuple(islice(found, 2))
    if not matches:
        return None
    assert len(matches) == 1
    return matches[0][1]
1016
1017
def rev_list_invocation(ref_or_refs, format=None):
    """Return the "git rev-list" argv for one ref (bytes) or for an
    iterable of refs, with an optional --pretty=format: argument.
    Refs must not start with a dash."""
    refs = (ref_or_refs,) if isinstance(ref_or_refs, bytes) else ref_or_refs
    cmd = [b'git', b'rev-list']
    if format:
        cmd += [b'--pretty=format:' + format]
    for candidate in refs:
        # Never let a ref be mistaken for an option.
        assert not candidate.startswith(b'-')
        cmd.append(candidate)
    cmd.append(b'--')
    return cmd
1032
1033
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".  Without
    a format, yield one hex hash at a time.  With a format (which
    requires parse as well), pass it to rev-list, and for each commit
    call parse(git_stdout) with the stream positioned just after the
    rev-list "commit HASH" header line, yielding (oidx,
    parse(git_stdout)).  Raise GitError if rev-list exits nonzero.

    """
    assert bool(parse) == bool(format)
    proc = subprocess.Popen(rev_list_invocation(ref_or_refs,
                                                format=format),
                            env=_gitenv(repo_dir),
                            stdout = subprocess.PIPE,
                            close_fds=True)
    out = proc.stdout
    if format:
        while True:
            header = out.readline()
            if not header:
                break
            header = header.strip()
            if not header.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(header))
            oidx = header[7:]
            assert len(oidx) == 40
            # parse() consumes this commit's formatted output.
            yield oidx, parse(out)
    else:
        for line in out:
            yield line.strip()

    status = proc.wait()
    if status:
        raise GitError('git rev-list returned error %d' % status)
1066
1067
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    The name is first tried as a ref via read_ref(); failing that, a
    40-character name is treated as a hex object id and looked up in
    the pack indexes.

    Returns the binary hash if it is found, None if 'committish' does
    not correspond to anything (including a 40-character name that is
    not valid hex).
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Not hex: Python 2 raises TypeError here, Python 3 raises
            # binascii.Error, which is a ValueError.
            return None

        if pL.exists(oid):
            return oid

    return None
1093
1094
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname (under refs/heads/ or refs/tags/) at newval via
    "git update-ref", passing oldval as the expected old value.  A
    falsy oldval is passed as an empty string."""
    expected = oldval if oldval else b''
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    cmd = [b'git', b'update-ref', refname,
           hexlify(newval), hexlify(expected)]
    proc = subprocess.Popen(cmd, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', proc)
1106
1107
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference via "git update-ref -d" (see
    git update-ref(1)); oldvalue, when given, is appended as the
    final argument."""
    assert refname.startswith(b'refs/')
    cmd = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        cmd.append(oldvalue)
    proc = subprocess.Popen(cmd, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', proc)
1116
1117
def guess_repo(path=None):
    """Set the global "repodir" without checking that it exists.

    An explicit path wins; otherwise an already-set repodir is kept;
    otherwise BUP_DIR from the environment is used, falling back to
    ~/.bup.  This makes bup look for an existing repository without
    failing if there is none.  Code that needs a repository would
    usually call check_repo_or_die() instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1132
1133
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The repository location is chosen by guess_repo(path).  Raises
    GitError if the parent directory doesn't exist or if the location
    exists but isn't a directory.  After "git --bare init", two
    settings are forced: pack.indexVersion=2 and
    core.logAllRefUpdates=true.

    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    steps = (
        ('git init', [b'git', b'--bare', b'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        # (All argv items are bytes, like every other git invocation.)
        ('git config', [b'git', b'config', b'pack.indexVersion', b'2']),
        # Enable the reflog
        ('git config', [b'git', b'config', b'core.logAllRefUpdates',
                        b'true']),
    )
    for label, argv in steps:
        p = subprocess.Popen(argv, stdout=sys.stderr, env=_gitenv(),
                             close_fds=True)
        _git_wait(label, p)
1157
1158
def check_repo_or_die(path=None):
    """Return if a bup repository probably exists, otherwise log an
    error and exit: status 15 when the repository path doesn't exist
    at all, status 14 when it isn't a repository."""
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1174
1175
def is_suitable_git(ver_str):
    """Classify "git --version" output.

    Returns 'unrecognized' when ver_str lacks the "git version "
    prefix, 'insufficient' for versions before 1.5.6 (including
    1.5.6 release candidates), and 'suitable' otherwise.  Exits with
    status 13 when what follows the prefix doesn't start with a digit.

    """
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    version = ver_str[len(prefix):]
    if version.startswith(b'0.'):
        return 'insufficient'
    if version.startswith(b'1.'):
        too_old = (br'1\.[012345]rc',
                   br'1\.[01234]\.',
                   br'1\.5\.[012345]($|\.)',
                   br'1\.5\.6-rc')
        if any(re.match(pattern, version) for pattern in too_old):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', version):
        return 'suitable'
    sys.exit(13)
1195
# Set to True once the git version has been accepted, so that the
# check runs at most once.
_git_great = None

def require_suitable_git(ver_str=None):
    """Make sure the version of git is suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.  The check is skipped entirely when BUP_GIT_VERSION_IS_FINE
    is set to yes, true or 1.  Raises GitError when the version output
    is not recognized; logs an error and exits with status 1 when the
    version is too old.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1224
1225
1226 class _AbortableIter:
1227     def __init__(self, it, onabort = None):
1228         self.it = it
1229         self.onabort = onabort
1230         self.done = None
1231
1232     def __iter__(self):
1233         return self
1234
1235     def __next__(self):
1236         try:
1237             return next(self.it)
1238         except StopIteration as e:
1239             self.done = True
1240             raise
1241         except:
1242             self.abort()
1243             raise
1244
1245     next = __next__
1246
1247     def abort(self):
1248         """Abort iteration and call the abortion callback, if needed."""
1249         if not self.done:
1250             self.done = True
1251             if self.onabort:
1252                 self.onabort()
1253
1254     def __del__(self):
1255         self.abort()
1256
1257
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    A 'git cat-file --batch' subprocess is started on demand by get()
    and reused for later requests.  Only one request may be in
    progress at a time.

    """
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        # p is the running subprocess (or None); inprogress is the ref
        # currently being read (or None).
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the subprocess, if there is one.

        With wait set, also wait for the subprocess and return its
        exit status.  Returns None when wait is false or when there
        was no subprocess to close.

        """
        p = self.p
        self.p = None
        self.inprogress = None
        if not p:
            # Never started, or already closed: nothing to wait for.
            return None
        try:
            p.stdout.close()
        finally:
            p.stdin.close()
        if wait:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Close any existing subprocess and start a new one."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Raises GitError if the subprocess answers with anything other
        than an "id type size" header.  If reading the data fails, the
        subprocess is closed so that the next get() starts a new one.

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert self.p
        poll_result = self.p.poll()
        assert poll_result is None
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert not self.inprogress
        # The request is a single line, and must not look like an option.
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self.close)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object's data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _join(self, it):
        """Yield the blob data reachable from the object that it (an
        iterator as returned by get()) describes."""
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            # Read the whole tree first; the recursive join() below
            # needs the pipe for its own requests.
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1354
1355
# CatPipe instances, keyed by absolute repository path.
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir (by default the current
    repository), creating and remembering one if there is none yet."""
    global _cp, repodir
    key = os.path.abspath(repo_dir if repo_dir else (repodir or repo()))
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1369
1370
def close_catpipes():
    """Forget every cached CatPipe, closing each one and waiting for
    it."""
    # FIXME: chain exceptions
    while _cp:
        _cp.popitem()[1].close(wait=True)
1376
1377
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    by_hash = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        # More than one tag can point at the same hash.
        by_hash.setdefault(oid, []).append(refname[10:])
    return by_hash
1388
1389
class MissingObject(KeyError):
    """Raised for an object id that could not be found; the binary id
    is available as the oid attribute."""

    def __init__(self, oid):
        message = 'object %r is missing' % hexlify(oid)
        super(MissingObject, self).__init__(message)
        self.oid = oid
1394
1395
# One object encountered by walk_object(): its binary id, its git
# object type, the mode it was reached with (None for the starting
# object), its path, its chunk path, and its content (or None).
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1408
1409
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    Commits and trees are always read, since their content is needed
    to find their children, but it is only placed in the data field
    when include_data is set.  Objects are visited in the order they
    come off a stack, i.e. the most recently discovered one first.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (hex id, path, chunk path, mode).
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        # The first item is the (oidx, type, size) header; the id is
        # None when the object doesn't exist.
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            # Parents inherit the commit's own path, chunk path and mode.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            # Pushed last, so the commit's tree is visited before its
            # parents.
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the path stays
                    # put and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    # Paths keep the mangled name.
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file.
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))