1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from collections import namedtuple
9 from itertools import islice
10 from numbers import Integral
11 from os import environ
13 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
14 from bup.compat import range
15 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
18 hostname, localtime, log, merge_iter,
19 mmap_read, mmap_readwrite,
21 progress, qprogress, shstr, stat_if_exists,
22 unlink, username, userfullname,
repodir = None  # The default repository, once initialized

# Mapping from git object type name to the numeric type id used in pack
# files, and the reverse mapping (see _encode_packobj/_decode_packobj).
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
# Base exception for all git/repository failures raised by this module.
# NOTE(review): the class body is elided in this excerpt.
class GitError(Exception):
def _git_wait(cmd, p):
    # Wait for subprocess p and raise GitError on a nonzero exit status.
    # NOTE(review): the line computing rv (presumably rv = p.wait()) is
    # elided in this excerpt -- confirm against the full source.
    raise GitError('%s returned %d' % (shstr(cmd), rv))
def _git_capture(argv):
    # Run argv in the repository environment and capture its stdout.
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
    # NOTE(review): the stdout read and return are elided in this excerpt.
    _git_wait(repr(argv), p)
def _git_exo(cmd, **kwargs):
    # Run cmd via helpers.exo without check, then raise GitError ourselves
    # on failure so the error message names the command.
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    # NOTE(review): the unpacking of result that defines 'proc' (and the
    # final return) is elided in this excerpt.
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
def git_config_get(option, repo_dir=None):
    # Look up a git config value via `git config --get` in repo_dir.
    cmd = ('git', 'config', '--get', option)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         preexec_fn=_gitenv(repo_dir=repo_dir))
    # NOTE(review): the stdout read, the rc computation, and the success
    # return are elided in this excerpt; only the failure path is visible.
    raise GitError('%s returned %d' % (cmd, rc))
def parse_tz_offset(s):
    """Return a git timezone string like '+0130' or '-0500' as a UTC
    offset in seconds (positive east of UTC, negative west)."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    # The leading sign character determines the direction of the offset;
    # as previously written the computed value was never returned.
    if s[0] == '-':
        return - tz_off
    return tz_off
80 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
81 # Make sure that's authoritative.
82 _start_end_char = r'[^ .,:;<>"\'\0\n]'
83 _content_char = r'[^\0\n<>]'
84 _safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
86 _start_end_char, _content_char, _start_end_char)
87 _tz_rx = r'[-+]\d\d[0-5]\d'
88 _parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
89 # Assumes every following line starting with a space is part of the
90 # mergetag. Is there a formal commit blob spec?
91 _mergetag_rx = r'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
92 _commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
93 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
94 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
96 (?P<message>(?:.|\n)*)''' % (_parent_rx,
97 _safe_str_rx, _safe_str_rx, _tz_rx,
98 _safe_str_rx, _safe_str_rx, _tz_rx,
100 _parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
102 # Note that the author_sec and committer_sec values are (UTC) epoch
103 # seconds, and for now the mergetag is not included.
104 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
105 'author_name', 'author_mail',
106 'author_sec', 'author_offset',
107 'committer_name', 'committer_mail',
108 'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse a git commit blob and return a CommitInfo namedtuple.

    The tz offsets are converted to seconds via parse_tz_offset().
    Raises Exception if content does not match _commit_rx.
    """
    commit_match = re.match(_commit_rx, content)
    # Only raise when the match actually failed; as previously written the
    # guard was missing and every call raised unconditionally.
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume a cat-pipe style iterator whose first item is an
    (oidx, type, size) tuple; verify the type and return the remaining
    chunks joined into one string."""
    oidx_unused, kind, size_unused = next(cat_iterator)
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    chunks = list(cat_iterator)
    return ''.join(chunks)
def get_commit_items(id, cp):
    """Fetch commit 'id' through cat-pipe 'cp' and return its parsed
    CommitInfo."""
    raw = get_cat_data(cp.get(id), 'commit')
    return parse_commit(raw)
def _local_git_date_str(epoch_sec):
    """Render epoch_sec as a git date string ('<sec> <tzoffset>') using the
    local timezone offset for that moment."""
    offset = utc_offset_str(epoch_sec)
    return '%d %s' % (epoch_sec, offset)
142 def _git_date_str(epoch_sec, tz_offset_sec):
143 offs = tz_offset_sec // 60
144 return '%d %s%02d%02d' \
146 '+' if offs >= 0 else '-',
def repo(sub = '', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    # NOTE(review): the `if not repo_dir:` guard above this raise is elided
    # in this excerpt.
    raise GitError('You should call check_repo_or_die()')
    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, '.git')
    if os.path.exists(gd):
        # NOTE(review): body elided (presumably redirects into gd).
    return os.path.join(repo_dir, sub)
166 return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
171 full = os.path.abspath(path)
172 fullrepo = os.path.abspath(repo(''))
173 if not fullrepo.endswith('/'):
175 if full.startswith(fullrepo):
176 path = full[len(fullrepo):]
177 if path.startswith('index-cache/'):
178 path = path[len('index-cache/'):]
179 return shorten_hash(path)
183 paths = [repo('objects/pack')]
184 paths += glob.glob(repo('index-cache/*/.'))
def auto_midx(objdir):
    # Best-effort maintenance: regenerate the .midx and bloom filter for
    # objdir by invoking the bup CLI; failures are recorded via add_error
    # rather than raised.
    # NOTE(review): the try/except scaffolding around both calls is elided
    # in this excerpt ('e' below comes from the elided except clauses).
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
210 def mangle_name(name, mode, gitmode):
211 """Mangle a file name to present an abstract name for segmented files.
212 Mangled file names will have the ".bup" extension added to them. If a
213 file's name already ends with ".bup", a ".bupl" extension is added to
214 disambiguate normal files from segmented ones.
216 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
217 assert(stat.S_ISDIR(gitmode))
219 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
220 return name + '.bupl'
(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith('.bupm'):
        # Metadata entries: directories carry chunked-file metadata.
        # (The truncated return statement for this branch is restored.)
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    else:
        return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion: sha1 over the
    '<type> <size>\\0' header followed by the content, returned as a
    binary digest."""
    header = '%s %d\0' % (type, len(content))
    # The digest computation/return was missing; as written the function
    # built the header and returned None.
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
def shalist_item_sort_key(ent):
    """Sort key for (mode, name, id) tree entries: git sorts directories
    as if their name had a trailing '/'."""
    (mode, name, id) = ent
    assert(mode+0 == mode)  # mode must be an integer
    if stat.S_ISDIR(mode):
        # The return statements were missing; directories sort with a
        # trailing slash appended, per git tree-sorting rules.
        return name + '/'
    else:
        return name
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    # The accumulator and final join were missing from this definition.
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = '%o %s\0%s' % (mode,name,bin)
        assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return ''.join(l)
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf.

    Each entry is '<octal mode> <name>\\0' followed by a 20-byte binary sha.
    """
    # The offset initialization and advance were missing from this
    # definition, which made the loop reference an undefined name.
    ofs = 0
    while ofs < len(buf):
        z = buf.find('\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(' ', 1)
        assert(len(spl) == 2)
        mode,name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    # Yield the pack-object encoding of content: a variable-length size
    # header (low 4 bits of size plus the numeric type id, then 7 bits per
    # continuation byte) followed by the zlib-compressed content.
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    # NOTE(review): the szout accumulator, sz initialization, and the
    # header-emitting loop body are elided in this excerpt.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    if sz: szbits |= 0x80  # continuation bit: more size bytes follow
    z = zlib.compressobj(compression_level)
    yield z.compress(content)
def _encode_looseobj(type, content, compression_level=1):
    """Yield the zlib-compressed git loose-object encoding of content:
    the '<type> <size>\\0' header followed by the raw content, compressed
    as a single stream."""
    z = zlib.compressobj(compression_level)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    # Without flush() the tail of the deflate stream is never emitted and
    # the output cannot be decompressed.
    yield z.flush()
def _decode_looseobj(buf):
    # Inverse of _encode_looseobj: decompress buf and split the
    # '<type> <size>\0' header from the content.
    s = zlib.decompress(buf)
    # NOTE(review): the header parsing that defines type, sz, and content
    # is elided in this excerpt.
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    # Inverse of _encode_packobj: parse the variable-length size header,
    # then decompress the remaining bytes.
    # NOTE(review): the initial byte read and the loop decoding the size
    # continuation bytes are elided in this excerpt.
    type = _typermap[(c & 0x70) >> 4]
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
356 def find_offset(self, hash):
357 """Get the offset of an object inside the index file."""
358 idx = self._idx_from_hash(hash)
360 return self._ofs_from_idx(idx)
363 def exists(self, hash, want_source=False):
364 """Return nonempty if the object exists in this index."""
365 if hash and (self._idx_from_hash(hash) != None):
366 return want_source and os.path.basename(self.name) or True
370 return int(self.fanout[255])
    def _idx_from_hash(self, hash):
        # Binary-search this index for a 20-byte binary sha; the fanout
        # table narrows the search to shas sharing the first byte.
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        # NOTE(review): the computation of b1 (the first hash byte) and the
        # search-loop scaffolding are elided in this excerpt.
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1  # lookup table is a step
        # Python 2 integer division ('/'); '//' would be needed under py3.
        mid = start + (end-start)/2
        v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    # V1 layout: 256-entry fanout table of 4-byte counts, then 24-byte
    # records of (4-byte offset, 20-byte sha).  Uses the Python 2 only
    # buffer() type for zero-copy views of the mmap.
    def __init__(self, filename, f):
        # NOTE(review): self.name/self.sha_ofs assignments are elided in
        # this excerpt.
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        # First 4 bytes of each 24-byte record are the pack offset.
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        # Remaining 20 bytes of each record are the binary sha.
        return str(self.shatable[idx*24+4 : idx*24+24])

        # NOTE(review): the enclosing `def __iter__(self):` line is elided
        # in this excerpt; this loop yields each stored sha in order.
        for i in range(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    # V2 layout: 8-byte magic/version, 256-entry fanout, nsha 20-byte shas,
    # nsha 4-byte CRCs, nsha 4-byte offsets, then 8-byte offsets for packs
    # larger than 2 GiB (entries with the high bit set).
    def __init__(self, filename, f):
        # NOTE(review): self.name assignment is elided in this excerpt.
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        # NOTE(review): the closing length argument of this buffer() call
        # is elided in this excerpt.
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        # NOTE(review): the high-bit test selecting the 64-bit table and
        # the final return are elided in this excerpt.
        idx64 = ofs & 0x7fffffff
        ofs = struct.unpack('!Q',
                            str(buffer(self.ofs64table, idx64*8, 8)))[0]

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

        # NOTE(review): the enclosing `def __iter__(self):` line is elided
        # in this excerpt; this loop yields each stored sha in order.
        for i in range(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
    def __init__(self, dir):
        # PackIdxList aggregates all .idx/.midx files in 'dir'.
        # NOTE(review): several attribute initializations (dir, also,
        # packs, bloom) are elided in this excerpt.
        assert(_mpi_count == 0)  # these things suck tons of VM; don't waste it
        self.do_bloom = False
469 assert(_mpi_count == 0)
472 return iter(idxmerge(self.packs))
475 return sum(len(pack) for pack in self.packs)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        # NOTE(review): this method is heavily elided in this excerpt; the
        # bloom-filter fast path and the per-pack loop bodies (including
        # the binding of 'p') are incomplete below.
        if hash in self.also:
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            _total_searches -= 1  # was counted by bloom
        for i in xrange(len(self.packs)):
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            # reorder so most recently used packs are searched first
            self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        # NOTE(review): several lines of this method are elided in this
        # excerpt (loop scaffolding, midxl accumulation, unlink calls).
        self.bloom = None # Always reopen the bloom as it may have been relaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # Map of idx filename -> index object currently loaded.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            for ix in self.packs:
                if isinstance(ix, midx.PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            for full in glob.glob(os.path.join(self.dir,'*.midx')):
                mx = midx.PackMidx(full)
                (mxd, mxf) = os.path.split(mx.name)
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n' +
                             '  used by %s\n') % (n, mxf))
            # Prefer larger, newer midx files.
            midxl.sort(key=lambda ix:
                       (-len(ix), -xstat.stat(ix.name).st_mtime))
            for sub in ix.idxnames:
                found = d.get(os.path.join(self.dir, sub))
                if not found or isinstance(found, PackIdx):
                    # doesn't exist, or exists but not in a midx
                for name in ix.idxnames:
                    d[os.path.join(self.dir, name)] = ix
                elif not ix.force_keep:
                    debug1('midx: removing redundant: %s\n'
                           % os.path.basename(ix.name))
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
            except GitError as e:
        bfull = os.path.join(self.dir, 'bup.bloom')
        if self.bloom is None and os.path.exists(bfull):
            self.bloom = bloom.ShaBloom(bfull)
        self.packs = list(set(d.values()))
        self.packs.sort(reverse=True, key=lambda x: len(x))
        if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
580 """Insert an additional object in the list."""
def open_idx(filename):
    """Open a pack index by filename and return the matching object:
    PackIdxV1/PackIdxV2 for .idx files, midx.PackMidx for .midx files.

    Raises GitError for unsupported versions or unrecognized names.
    """
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        # The header read and version branches were truncated in this
        # definition; a v2 index starts with the '\377tOc' magic.
        header = f.read(8)
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            # No magic: a v1 index begins directly with the fanout table.
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList.

    Progress is reported while reading; the final "done" line is only
    printed when final_progress is true.
    """
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # The guard on final_progress was missing, which made the
        # parameter ineffective.
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default PackWriter objcache factory: an index over everything in
    the repository's objects/pack directory."""
    pack_dir = repo('objects/pack')
    return PackIdxList(pack_dir)
620 # bup-gc assumes that it can disable all PackWriter activities
621 # (bloom/midx/cache) via the constructor and close() arguments.
624 """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        # NOTE(review): several attribute initializations (count, outbytes,
        # file, idx, objcache, ...) are elided in this excerpt.
        self.repo_dir = repo_dir or repo()
        self.objcache_maker = objcache_maker
        self.compression_level = compression_level
        self.run_midx=run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            # Fall back to the git config limit, then a fixed default.
            max_pack_size = git_config_get('pack.packSizeLimit',
                                           repo_dir=self.repo_dir)
            if max_pack_size is not None:
                max_pack_size = parse_num(max_pack_size)
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)
659 def __exit__(self, type, value, traceback):
664 objdir = dir = os.path.join(self.repo_dir, 'objects')
665 fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
667 self.file = os.fdopen(fd, 'w+b')
672 self.parentfd = os.open(objdir, os.O_RDONLY)
678 assert(name.endswith('.pack'))
679 self.filename = name[:-5]
680 self.file.write('PACK\0\0\0\2\0\0\0\0')
681 self.idx = list(list() for i in xrange(256))
    def _raw_write(self, datalist, sha):
        # Write one already-encoded object to the pack file and record it
        # in the in-memory idx.
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        # NOTE(review): the try/except around the write (which defines
        # 'e' and 'nw') is elided; this is Python 2 re-raise syntax.
        raise GitError, e, sys.exc_info()[2]
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
    def _update_idx(self, sha, crc, size):
        # Record (sha, crc, offset) in the bucket selected by the sha's
        # first byte; the offset is where the object started in the pack.
        # NOTE(review): two leading lines of this method are elided in
        # this excerpt.
        self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
    def _write(self, sha, type, content):
        # Encode and write one object, then roll over to a new pack when
        # size/object limits are exceeded.
        # NOTE(review): leading lines, the closing argument of the
        # _raw_write call, the breakpoint() call, and the return are
        # elided in this excerpt.
        sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
721 def breakpoint(self):
722 """Clear byte and object counts and return the last processed id."""
723 id = self._end(self.run_midx)
724 self.outbytes = self.count = 0
727 def _require_objcache(self):
728 if self.objcache is None and self.objcache_maker:
729 self.objcache = self.objcache_maker()
730 if self.objcache is None:
732 "PackWriter not opened or can't check exists w/o objcache")
734 def exists(self, id, want_source=False):
735 """Return non-empty if an object is found in the object cache."""
736 self._require_objcache()
737 return self.objcache.exists(id, want_source=want_source)
739 def just_write(self, sha, type, content):
740 """Write an object to the pack file without checking for duplication."""
741 self._write(sha, type, content)
742 # If nothing else, gc doesn't have/want an objcache
743 if self.objcache is not None:
744 self.objcache.add(sha)
746 def maybe_write(self, type, content):
747 """Write an object to the pack file if not present and return its id."""
748 sha = calc_hash(type, content)
749 if not self.exists(sha):
750 self._require_objcache()
751 self.just_write(sha, type, content)
754 def new_blob(self, blob):
755 """Create a blob object in the pack with the supplied content."""
756 return self.maybe_write('blob', blob)
758 def new_tree(self, shalist):
759 """Create a tree object in the pack."""
760 content = tree_encode(shalist)
761 return self.maybe_write('tree', content)
    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
        # NOTE(review): the trailing parameter(s) of the signature (the
        # commit message), the tz-None branches, and the 'l = []'
        # initialization are elided in this excerpt.
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        adate_str = _git_date_str(adate_sec, adate_tz)
        adate_str = _local_git_date_str(adate_sec)
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
        cdate_str = _local_git_date_str(cdate_sec)
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, adate_str))
        if committer: l.append('committer %s %s' % (committer, cdate_str))
        return self.maybe_write('commit', '\n'.join(l))
787 """Remove the pack file from disk."""
796 os.unlink(self.filename + '.pack')
    def _end(self, run_midx=True):
        # Finalize the current pack: patch the object count into the
        # header, append the pack sha1, write the .idx, and rename both
        # into their content-addressed names.
        # NOTE(review): many lines are elided in this excerpt (the binding
        # of f/idx, seeks, the Sha1 accumulator, writes, and the return).
        if not f: return None

        # update object count
        cp = struct.pack('!i', self.count)

        # calculate the pack sha1sum
        for b in chunkyreader(f):
        packbin = sum.digest()
        fdatasync(f.fileno())

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  'objects/pack/pack-' + obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        os.fsync(self.parentfd)
        os.close(self.parentfd)
        auto_midx(os.path.join(self.repo_dir, 'objects/pack'))
        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)
849 def close(self, run_midx=True):
850 """Close the pack file and move it to its definitive path."""
851 return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        # Write a version-2 pack index for the accumulated idx buckets,
        # delegating the bulk of the work to the C helper.
        # NOTE(review): many lines are elided in this excerpt (the
        # ofs64_count accumulation, the seek calls, the idx_sum
        # accumulator, and the return of namebase).
        for entry in section:
            if entry[2] >= 2**31:

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        idx_f.truncate(index_len)
        fdatasync(idx_f.fileno())
        idx_map = mmap_readwrite(idx_f, close=False)
        count = _helpers.write_idx(filename, idx_map, idx, self.count)
        assert(count == self.count)
        idx_f = open(filename, 'a+b')
        b = idx_f.read(8 + 4*256)
        # The object-list sha (over the sorted shas only) names the pack.
        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            obj_list_sum.update(b)
        namebase = obj_list_sum.hexdigest()
        for b in chunkyreader(idx_f):
        idx_f.write(idx_sum.digest())
        fdatasync(idx_f.fileno())
def _gitenv(repo_dir = None):
    # Used as a subprocess preexec_fn factory: points GIT_DIR at repo_dir
    # so child git commands operate on the bup repository.
    # NOTE(review): the repo_dir default handling and the returned closure
    # are elided in this excerpt.
    os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.
    """
    # NOTE(review): the conditional guards around the argv.append/extend
    # calls and the empty-output check are elided in this excerpt.
    argv = ['git', 'show-ref']
    argv.append('--heads')
    argv.append('--tags')
    argv.extend(patterns)
    p = subprocess.Popen(argv,
                         preexec_fn = _gitenv(repo_dir),
                         stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    for d in out.split('\n'):
        (sha, name) = d.split(' ', 1)
        # Python 2 hex decode; yields the binary oid.
        yield (name, sha.decode('hex'))
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Take at most two matches so we can detect ambiguity.
    l = tuple(islice(refs, 2))
    # NOTE(review): the length checks and the return are elided in this
    # excerpt.
def rev_list_invocation(ref_or_refs, count=None, format=None):
    # Build the argv for a `git rev-list` call; accepts one ref (a string)
    # or an iterable of refs.
    # NOTE(review): the else-branches (refs = ref_or_refs, the count-None
    # case), the format guard, the per-ref loop, and the return are elided
    # in this excerpt.
    if isinstance(ref_or_refs, compat.str_type):
        refs = (ref_or_refs,)
    argv = ['git', 'rev-list']
    if isinstance(count, Integral):
        argv.extend(['-n', str(count)])
        raise ValueError('unexpected count argument %r' % count)
    argv.append('--pretty=format:' + format)
    # Refuse refs that would be parsed as options.
    assert not ref.startswith('-')
def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.
    """
    # NOTE(review): the Popen call is missing its closing arguments, and
    # the loop bodies that bind 's' are elided in this excerpt.
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
                         preexec_fn = _gitenv(repo_dir),
                         stdout = subprocess.PIPE)
    for line in p.stdout:
        line = p.stdout.readline()
    if not s.startswith('commit '):
        raise Exception('unexpected line ' + s)
    yield s, parse(p.stdout)
    line = p.stdout.readline()
    rv = p.wait()  # not fatal
    # Python 2 raise-with-message syntax.
    raise GitError, 'git rev-list returned error %d' % rv
def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs.  For now, every unique
    string in refs must resolve to a different commit or this
    function will fail.

    Returns a list of author_sec (UTC epoch seconds) values, one per ref.
    """
    # The accumulator, loop header, and return were missing, leaving
    # 'ref' and 'result' undefined.
    result = []
    for ref in refs:
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
    return result
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # NOTE(review): the guards around the ref-hit return, the pack-index
    # existence check, and the final return are elided in this excerpt.
    head = read_ref(committish, repo_dir=repo_dir)
    debug2("resolved from ref: commit = %s\n" % head.encode('hex'))

    pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
    if len(committish) == 40:
        hash = committish.decode('hex')
def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    # NOTE(review): two leading lines of this function are elided in this
    # excerpt. Only heads and tags may be updated through here.
    assert(refname.startswith('refs/heads/') \
           or refname.startswith('refs/tags/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv(repo_dir))
    _git_wait('git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert(refname.startswith('refs/'))
    argv = ['git', 'update-ref', '-d', refname]
    # Passing the old value makes the delete conditional on it.
    if oldvalue:
        argv.append(oldvalue)
    p = subprocess.Popen(argv, preexec_fn = _gitenv())
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # NOTE(review): the `global repodir` declaration and the guards
    # selecting between path, BUP_DIR, and ~/.bup are elided here.
    repodir = os.environ.get('BUP_DIR')
    repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    # NOTE(review): the guess_repo(path) call preceding this is elided.
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv())
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv())
    _git_wait('git config', p)
    p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
                         stdout=sys.stderr, preexec_fn = _gitenv())
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    # NOTE(review): this function is heavily elided in this excerpt; the
    # binding of 'top', the success return, and the sys.exit calls are
    # missing. A repo "probably exists" when objects/pack is a directory.
    pst = stat_if_exists(top + '/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
    top_st = stat_if_exists(top)
    log('error: repository %r does not exist (see "bup help init")\n'
    log('error: %r is not a repository\n' % top)
def is_suitable_git(ver_str):
    """Classify a `git --version` output (bytes) as 'suitable',
    'insufficient' (older than 1.5.6), or 'unrecognized'."""
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        # 1.5.6 and later 1.x releases are fine.
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        # Any other numeric major version (2.x+) is fine.
        return 'suitable'
    # NOTE(review): the original fall-through was elided in this excerpt;
    # treat anything unparseable as unrecognized so the caller reports it.
    return 'unrecognized'
def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    """
    # NOTE(review): the `global _git_great` declaration, the early-return
    # bodies, the sys.exit on insufficient versions, and the final
    # assignment/assert are elided in this excerpt.
    if _git_great is not None:
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
    ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
    if status == 'suitable':
class _AbortableIter:
    # Wraps an iterator so that abandoning it early can trigger a cleanup
    # callback (used by CatPipe to restart its subprocess).
    # NOTE(review): most of this class (self.it/self.done setup, __iter__,
    # the next() scaffolding, and the abort/__del__ bodies) is elided in
    # this excerpt.
    def __init__(self, it, onabort = None):
        self.onabort = onabort

        return next(self.it)
        except StopIteration as e:

        """Abort iteration and call the abortion callback, if needed."""
1198 """Link to 'git cat-file' that is used to retrieve blob data."""
1199 def __init__(self, repo_dir = None):
1200 require_suitable_git()
1201 self.repo_dir = repo_dir
1202 self.p = self.inprogress = None
1206 self.p.stdout.close()
1207 self.p.stdin.close()
1209 self.inprogress = None
1213 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1214 stdin=subprocess.PIPE,
1215 stdout=subprocess.PIPE,
1218 preexec_fn = _gitenv(self.repo_dir))
1221 """Yield (oidx, type, size), followed by the data referred to by ref.
1222 If ref does not exist, only yield (None, None, None).
1225 if not self.p or self.p.poll() != None:
1228 poll_result = self.p.poll()
1229 assert(poll_result == None)
1231 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1232 assert(not self.inprogress)
1233 assert(ref.find('\n') < 0)
1234 assert(ref.find('\r') < 0)
1235 assert(not ref.startswith('-'))
1236 self.inprogress = ref
1237 self.p.stdin.write('%s\n' % ref)
1238 self.p.stdin.flush()
1239 hdr = self.p.stdout.readline()
1240 if hdr.endswith(' missing\n'):
1241 self.inprogress = None
1242 yield None, None, None
1244 info = hdr.split(' ')
1245 if len(info) != 3 or len(info[0]) != 40:
1246 raise GitError('expected object (id, type, size), got %r' % info)
1247 oidx, typ, size = info
1249 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1250 onabort=self._abort)
1252 yield oidx, typ, size
1255 readline_result = self.p.stdout.readline()
1256 assert(readline_result == '\n')
1257 self.inprogress = None
1258 except Exception as e:
    def _join(self, it):
        # Recursively yield the blob contents reachable from one object:
        # blobs yield directly, trees and commits recurse via self.join.
        # NOTE(review): the type-dispatch structure (the blob branch, the
        # `elif typ == 'tree':` line, and the inner yields) is elided in
        # this excerpt.
        _, typ, _ = next(it)
        treefile = ''.join(it)
        for (mode, name, sha) in tree_decode(treefile):
            for blob in self.join(sha.encode('hex')):
        elif typ == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
        raise GitError('invalid object type %r: expected blob/tree/commit'
1282 """Generate a list of the content of all blobs that can be reached
1283 from an object. The hash given in 'id' must point to a blob, a tree
1284 or a commit. The content of all blobs that can be seen from trees or
1285 commits will be added to the list.
1288 for d in self._join(self.get(id)):
1290 except StopIteration:
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    # NOTE(review): the guards around the default assignment, the cache
    # miss branch storing into _cp, and the return are elided here.
    repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    cp = CatPipe(repo_dir)
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    # NOTE(review): the tags dict initialization, the name-stripping line
    # that defines 'name', the setdefault branch, and the return are
    # elided in this excerpt.
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(n.startswith('refs/tags/'))
        tags[c].append(name)  # more than one tag can point at 'c'
class MissingObject(KeyError):
    """Raised by walk_object() when a referenced object is not in the
    repository; .oid is the binary object id."""
    def __init__(self, oid):
        # Store the oid so callers can recover which object was missing;
        # this assignment was elided/absent in the previous text.
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % oid.encode('hex'))
WalkItem = namedtuple('WalkItem', ('oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'))
# 'path' holds the mangled path components.  When an item is a fragment
# of a chunked file, 'chunk_path' holds the chunked-subtree path of that
# fragment, e.g. ['', '2d3115e', ...]; the top-level item of a chunked
# file has chunk_path == [''].  So a chunk subtree of '/foo/bar/baz'
# might look like:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
1341 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1342 """Yield everything reachable from oidx via get_ref (which must behave
1343 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1344 returns true. Throw MissingObject if a hash encountered is
1345 missing from the repository, and don't read or return blob content
1346 in the data field unless include_data is set.
1349 # Maintain the pending stack on the heap to avoid stack overflow
1350 pending = [(oidx, [], [], None)]
1352 oidx, parent_path, chunk_path, mode = pending.pop()
1353 oid = oidx.decode('hex')
1354 if stop_at and stop_at(oidx):
1357 if (not include_data) and mode and stat.S_ISREG(mode):
1358 # If the object is a "regular file", then it's a leaf in
1359 # the graph, so we can skip reading the data if the caller
1360 # hasn't requested it.
1361 yield WalkItem(oid=oid, type='blob',
1362 chunk_path=chunk_path, path=parent_path,
1367 item_it = get_ref(oidx)
1368 get_oidx, typ, _ = next(item_it)
1370 raise MissingObject(oidx.decode('hex'))
1371 if typ not in ('blob', 'commit', 'tree'):
1372 raise Exception('unexpected repository object type %r' % typ)
1374 # FIXME: set the mode based on the type when the mode is None
1375 if typ == 'blob' and not include_data:
1376 # Dump data until we can ask cat_pipe not to fetch it
1377 for ignored in item_it:
1381 data = ''.join(item_it)
1383 yield WalkItem(oid=oid, type=typ,
1384 chunk_path=chunk_path, path=parent_path,
1386 data=(data if include_data else None))
1389 commit_items = parse_commit(data)
1390 for pid in commit_items.parents:
1391 pending.append((pid, parent_path, chunk_path, mode))
1392 pending.append((commit_items.tree, parent_path, chunk_path,
1393 hashsplit.GIT_MODE_TREE))
1395 for mode, name, ent_id in tree_decode(data):
1396 demangled, bup_type = demangle_name(name, mode)
1398 sub_path = parent_path
1399 sub_chunk_path = chunk_path + [name]
1401 sub_path = parent_path + [name]
1402 if bup_type == BUP_CHUNKED:
1403 sub_chunk_path = ['']
1405 sub_chunk_path = chunk_path
1406 pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,