1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from collections import namedtuple
10 from itertools import islice
11 from numbers import Integral
13 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
14 from bup.compat import range
15 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
17 hostname, localtime, log,
20 mmap_read, mmap_readwrite,
22 progress, qprogress, shstr, stat_if_exists,
25 from bup.pwdgrp import username, userfullname
repodir = None # The default repository, once initialized

# Numeric object-type codes used in git pack files, and the reverse mapping.
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
class GitError(Exception):
    """Raised for git/repository related failures throughout this module."""
def _gitenv(repo_dir=None):
    # Environment for running git against repo_dir: a copy of os.environ
    # with GIT_DIR pointing at the (absolute) repository path.
    return merge_dict(os.environ, {'GIT_DIR': os.path.abspath(repo_dir)})
def _git_wait(cmd, p):
    # Reap subprocess p; a nonzero exit status raises GitError naming cmd.
    raise GitError('%s returned %d' % (shstr(cmd), rv))
def _git_capture(argv):
    # Run argv against the default repository and capture its stdout;
    # _git_wait checks the exit status.
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, env=_gitenv())
    _git_wait(repr(argv), p)
def git_config_get(option, repo_dir=None):
    # Read a single git config value via 'git config --get'.
    cmd = ('git', 'config', '--get', option)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         env=_gitenv(repo_dir=repo_dir))
    # Unexpected exit statuses are fatal.
    raise GitError('%s returned %d' % (cmd, rc))
def parse_tz_offset(s):
    """UTC offset in seconds."""
    # s is a '[-+]hhmm' style offset (cf. _tz_rx): hours and minutes
    # combined into seconds.
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Characters allowed at the ends of / inside an author or committer name.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
_content_char = r'[^\0\n<>]'
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    _start_end_char, _content_char, _start_end_char)
# Timezone offset like '+hhmm'/'-hhmm'.
_tz_rx = r'[-+]\d\d[0-5]\d'
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = r'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Matches a whole commit object: tree, parent lines, author/committer
# lines, an optional mergetag, then the free-form message.
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse a git commit blob into a CommitInfo tuple (see _commit_rx)."""
    commit_match = re.match(_commit_rx, content)
    raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume a cat-pipe style iterator and return its joined data.

    The first item yielded must be an (oidx, type, size) header whose type
    equals expected_type; the remaining items are concatenated and returned.
    """
    header = next(cat_iterator)
    kind = header[1]
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    chunks = [chunk for chunk in cat_iterator]
    return ''.join(chunks)
def get_commit_items(id, cp):
    """Fetch the object named by id through cat-pipe cp and parse it as a
    commit, returning a CommitInfo tuple."""
    commit_data = get_cat_data(cp.get(id), 'commit')
    return parse_commit(commit_data)
def _local_git_date_str(epoch_sec):
    """Render epoch_sec as a git date string using the local UTC offset."""
    offset = utc_offset_str(epoch_sec)
    return '%d %s' % (epoch_sec, offset)
def _git_date_str(epoch_sec, tz_offset_sec):
    # Render epoch seconds plus an explicit offset (in seconds) as git's
    # '<seconds> [+-]hhmm' date string.
    offs = tz_offset_sec // 60
    return '%d %s%02d%02d' \
        '+' if offs >= 0 else '-',


def repo(sub = '', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, '.git')
    if os.path.exists(gd):
    return os.path.join(repo_dir, sub)
    # Abbreviate any embedded 40-hex-digit ids down to 7 digits for display.
    return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',

    # Express path relative to the repository root (stripping any
    # index-cache prefix) before abbreviating hashes.
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(''))
    if not fullrepo.endswith('/'):
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith('index-cache/'):
        path = path[len('index-cache/'):]
    return shorten_hash(path)

    # Candidate index directories: the repo's pack dir plus every
    # index-cache subdirectory.
    paths = [repo('objects/pack')]
    paths += glob.glob(repo('index-cache/*/.'))
def auto_midx(objdir):
    # Regenerate the .midx and bloom files for objdir via the bup
    # subcommands; failures are recorded with add_error() rather than raised.
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    # A "regular file" whose git representation is a tree is a chunked file.
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith('.bupm'):
        # Metadata entries: CHUNKED when stored as a tree, NORMAL otherwise.
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes the '<type> <size>\0' header followed by the content.
    header = '%s %d\0' % (type, len(content))

def shalist_item_sort_key(ent):
    # Sort key matching git's tree ordering (directories sort with a
    # trailing '/').
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    # Entries must be sorted git-style before encoding.
    shalist = sorted(shalist, key = shalist_item_sort_key)
    for (mode,name,bin) in shalist:
        assert(mode+0 == mode)
        assert(len(bin) == 20)
        # Each entry is '<octal mode> <name>\0<20-byte binary sha>'.
        s = '%o %s\0%s' % (mode,name,bin)
        assert(s[0] != '0') # 0-padded octal is not acceptable in a git tree

def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    while ofs < len(buf):
        z = buf.find('\0', ofs)
        # '<mode> <name>' precedes the NUL; 20 binary sha bytes follow it.
        spl = buf[ofs:z].split(' ', 1)
        assert(len(spl) == 2)
        sha = buf[z+1:z+1+20]
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    # Yield a pack-object header (size + type bits) followed by the
    # zlib-compressed content.
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    # Low 4 size bits combined with the numeric type code (see _typemap).
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
        if sz: szbits |= 0x80
    z = zlib.compressobj(compression_level)
    yield z.compress(content)

def _encode_looseobj(type, content, compression_level=1):
    # Loose objects are '<type> <size>\0' + content, zlib-compressed.
    z = zlib.compressobj(compression_level)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)

def _decode_looseobj(buf):
    # Inverse of _encode_looseobj: decompress, then split header from body.
    s = zlib.decompress(buf)
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)

def _decode_packobj(buf):
    # Inverse of _encode_packobj: parse the variable-length header, then
    # decompress the remainder.
    type = _typermap[(c & 0x70) >> 4]
        sz |= (c & 0x7f) << shift
            return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        return self._ofs_from_idx(idx)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            # Either this index's filename (the "source") or just True.
            return want_source and os.path.basename(self.name) or True
    def _idx_from_hash(self, hash):
        # Binary-search the sorted sha table for hash, with the initial
        # range narrowed via the 256-entry fanout table.
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
            # NOTE(review): '/' relies on Python 2 integer division here.
            mid = start + (end-start)/2
            v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # v1 layout: 256-entry fanout table, then (ofs32, sha) pairs of
        # 24 bytes each.
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack('!256I', self.map))
        self.fanout.append(0) # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        # Offset is the first 4 bytes of the 24-byte entry.
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        # The 20-byte sha follows the 4-byte offset in each entry.
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # v2 magic ('\377tOc') plus version number 2.
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # After the sha table: crc table (4B each), 32-bit offset table,
        # then the 64-bit offset overflow table.
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
            # High bit set: real offset lives in the 64-bit table.
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]
    def __init__(self, dir, ignore_midx=False):
        # These lists mmap many index files; only one instance is expected
        # to exist at a time (tracked via _mpi_count).
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        self.do_bloom = False
        self.ignore_midx = ignore_midx

        assert(_mpi_count == 0)

        # Iteration and len() delegate to the merged set of pack indexes.
        return iter(idxmerge(self.packs))

        return sum(len(pack) for pack in self.packs)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        if hash in self.also:
        if self.do_bloom and self.bloom:
            # A negative bloom answer is definitive; a positive one means
            # we must fall through and search the real indexes.
            if self.bloom.exists(hash):
                self.do_bloom = False
                _total_searches -= 1 # was counted by bloom
        for i in xrange(len(self.packs)):
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been relaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
                # Keep the .idx files already covered by known midx files.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     ' used by %s\n') % (n, mxf))
                # Prefer larger, then newer, midx files.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                    except GitError as e:
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            # Only trust the bloom filter if it covers every known object.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

        """Insert an additional object in the list."""
def open_idx(filename):
    # Open a pack index (.idx, v1 or v2) or a midx file, dispatching on
    # the filename extension and the idx header magic.
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
                return PackIdxV2(filename, f)
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            return PackIdxV1(filename, f)
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return midx.PackMidx(filename)
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    # Default PackWriter objcache: an index list over this repo's packs.
    pack_dir = repo('objects/pack')
    return PackIdxList(pack_dir)
# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.

    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.repo_dir = repo_dir or repo()
        self.objcache_maker = objcache_maker
        self.compression_level = compression_level
        self.run_midx=run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            # Fall back to git's configured pack size limit, if any.
            max_pack_size = git_config_get('pack.packSizeLimit',
                                           repo_dir=self.repo_dir)
            if max_pack_size is not None:
                max_pack_size = parse_num(max_pack_size)
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)
    def __exit__(self, type, value, traceback):

        # Create the temporary pack file in the repo's objects directory
        # and write the pack header ('PACK', version 2, count patched later).
        objdir = dir = os.path.join(self.repo_dir, 'objects')
        fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
        self.file = os.fdopen(fd, 'w+b')
        # Keep a directory handle so renames can be fsynced in _end().
        self.parentfd = os.open(objdir, os.O_RDONLY)
        assert(name.endswith('.pack'))
        self.filename = name[:-5]
        self.file.write('PACK\0\0\0\2\0\0\0\0')
        # Per-first-byte buckets of (sha, crc, offset), matching idx fanout.
        self.idx = list(list() for i in xrange(256))
    def _raw_write(self, datalist, sha):
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
            # Re-raise write failures as GitError, preserving the traceback.
            raise GitError, e, sys.exc_info()[2]
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)

    def _update_idx(self, sha, crc, size):
        # Record (sha, crc, start offset), bucketed by the sha's first byte.
        self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
    def _write(self, sha, type, content):
        # Encode and append one object to the pack; roll over to a new
        # pack (breakpoint) when size/object limits are reached.
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0

    def _require_objcache(self):
        # Lazily create the object cache; not having one is fatal for
        # duplicate checking.
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
                "PackWriter not opened or can't check exists w/o objcache")
749 def exists(self, id, want_source=False):
750 """Return non-empty if an object is found in the object cache."""
751 self._require_objcache()
752 return self.objcache.exists(id, want_source=want_source)
754 def just_write(self, sha, type, content):
755 """Write an object to the pack file without checking for duplication."""
756 self._write(sha, type, content)
757 # If nothing else, gc doesn't have/want an objcache
758 if self.objcache is not None:
759 self.objcache.add(sha)
    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            # Make sure the objcache exists before recording the new object.
            self._require_objcache()
            self.just_write(sha, type, content)
769 def new_blob(self, blob):
770 """Create a blob object in the pack with the supplied content."""
771 return self.maybe_write('blob', blob)
773 def new_tree(self, shalist):
774 """Create a tree object in the pack."""
775 content = tree_encode(shalist)
776 return self.maybe_write('tree', content)
    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
            adate_str = _git_date_str(adate_sec, adate_tz)
            adate_str = _local_git_date_str(adate_sec)
            cdate_str = _git_date_str(cdate_sec, cdate_tz)
            cdate_str = _local_git_date_str(cdate_sec)
        # Assemble the commit body line by line; ids are hex-encoded.
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, adate_str))
        if committer: l.append('committer %s %s' % (committer, cdate_str))
        return self.maybe_write('commit', '\n'.join(l))
802 """Remove the pack file from disk."""
811 os.unlink(self.filename + '.pack')
818 def _end(self, run_midx=True):
820 if not f: return None
827 # update object count
829 cp = struct.pack('!i', self.count)
833 # calculate the pack sha1sum
836 for b in chunkyreader(f):
838 packbin = sum.digest()
840 fdatasync(f.fileno())
844 obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
845 nameprefix = os.path.join(self.repo_dir,
846 'objects/pack/pack-' + obj_list_sha)
847 if os.path.exists(self.filename + '.map'):
848 os.unlink(self.filename + '.map')
849 os.rename(self.filename + '.pack', nameprefix + '.pack')
850 os.rename(self.filename + '.idx', nameprefix + '.idx')
852 os.fsync(self.parentfd)
854 os.close(self.parentfd)
857 auto_midx(os.path.join(self.repo_dir, 'objects/pack'))
859 if self.on_pack_finish:
860 self.on_pack_finish(nameprefix)
864 def close(self, run_midx=True):
865 """Close the pack file and move it to its definitive path."""
866 return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        # Count entries whose offset needs the 64-bit overflow table.
            for entry in section:
                if entry[2] >= 2**31:
        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        idx_f.truncate(index_len)
        fdatasync(idx_f.fileno())
        # The C helper fills the mmap'd index in place.
        idx_map = mmap_readwrite(idx_f, close=False)
            count = _helpers.write_idx(filename, idx_map, idx, self.count)
            assert(count == self.count)
        # Re-read the file to compute the trailing checksum and the name
        # (sha of the object list) for the final pack filename.
        idx_f = open(filename, 'a+b')
            b = idx_f.read(8 + 4*256)
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()
            for b in chunkyreader(idx_f):
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.
    """
    argv = ['git', 'show-ref']
        argv.append('--heads')
        argv.append('--tags')
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
        # Each output line is '<40-hex sha> <refname>'.
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Pull at most two matches so ambiguity can be detected.
    l = tuple(islice(refs, 2))
def rev_list_invocation(ref_or_refs, count=None, format=None):
    # Build the 'git rev-list' argv for a single ref or a sequence of refs.
    if isinstance(ref_or_refs, compat.str_type):
        refs = (ref_or_refs,)
    argv = ['git', 'rev-list']
    if isinstance(count, Integral):
        argv.extend(['-n', str(count)])
        raise ValueError('unexpected count argument %r' % count)
        argv.append('--pretty=format:' + format)
        # Refs must never be mistaken for command-line options.
        assert not ref.startswith('-')
def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.
    """
    # parse and format must be supplied together.
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
                         env=_gitenv(repo_dir),
                         stdout = subprocess.PIPE)
        for line in p.stdout:
        line = p.stdout.readline()
            if not s.startswith('commit '):
                raise Exception('unexpected line ' + s)
            yield s, parse(p.stdout)
            line = p.stdout.readline()
    rv = p.wait() # not fatal
        raise GitError, 'git rev-list returned error %d' % rv
def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs. For now, every unique
    string in refs must resolve to a different commit or this
    function will fail."""
        # author_sec is the (UTC) epoch-seconds author timestamp.
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # First try committish as a ref name...
    head = read_ref(committish, repo_dir=repo_dir)
        debug2("resolved from ref: commit = %s\n" % head.encode('hex'))

    # ...then as a raw 40-digit hex id present in the pack indexes.
    pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
    if len(committish) == 40:
        hash = committish.decode('hex')
def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    # Only heads and tags may be updated through this helper.
    assert(refname.startswith('refs/heads/') \
           or refname.startswith('refs/tags/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         env=_gitenv(repo_dir))
    _git_wait('git update-ref', p)


def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert(refname.startswith('refs/'))
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen(['git', 'update-ref', '-d', refname] + oldvalue,
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
        # Fall back to BUP_DIR, then ~/.bup, when no explicit path is given.
        repodir = os.environ.get('BUP_DIR')
            repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    d = repo() # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
    p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    # A directory of packed objects is the signature of a bup/git repo.
    pst = stat_if_exists(top + '/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
    top_st = stat_if_exists(top)
        log('error: repository %r does not exist (see "bup help init")\n'
        log('error: %r is not a repository\n' % top)
1124 """Get Git's version and ensure a usable version is installed.
1126 The returned version is formatted as an ordered tuple with each position
1127 representing a digit in the version tag. For example, the following tuple
1128 would represent version 1.6.6.9:
1130 ('1', '6', '6', '9')
1134 p = subprocess.Popen(['git', '--version'],
1135 stdout=subprocess.PIPE)
1136 gvs = p.stdout.read()
1137 _git_wait('git --version', p)
1138 m = re.match(r'git version (\S+.\S+)', gvs)
1140 raise GitError('git --version weird output: %r' % gvs)
1141 _ver = tuple(m.group(1).split('.'))
1142 needed = ('1','5', '3', '1')
1144 raise GitError('git version %s or higher is required; you have %s'
1145 % ('.'.join(needed), '.'.join(_ver)))
class _AbortableIter:
    # Wraps an iterator so iteration can be cleanly aborted (used to
    # abandon a CatPipe object stream mid-read).
    def __init__(self, it, onabort = None):
        self.onabort = onabort
            return next(self.it)
        except StopIteration as e:
        """Abort iteration and call the abortion callback, if needed."""
1180 """Link to 'git cat-file' that is used to retrieve blob data."""
1181 def __init__(self, repo_dir = None):
1182 self.repo_dir = repo_dir
1183 wanted = ('1','5','6')
1185 log('error: git version must be at least 1.5.6\n')
1187 self.p = self.inprogress = None
1191 self.p.stdout.close()
1192 self.p.stdin.close()
1194 self.inprogress = None
1198 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1199 stdin=subprocess.PIPE,
1200 stdout=subprocess.PIPE,
1203 env=_gitenv(self.repo_dir))
1206 """Yield (oidx, type, size), followed by the data referred to by ref.
1207 If ref does not exist, only yield (None, None, None).
1210 if not self.p or self.p.poll() != None:
1213 poll_result = self.p.poll()
1214 assert(poll_result == None)
1216 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1217 assert(not self.inprogress)
1218 assert(ref.find('\n') < 0)
1219 assert(ref.find('\r') < 0)
1220 assert(not ref.startswith('-'))
1221 self.inprogress = ref
1222 self.p.stdin.write('%s\n' % ref)
1223 self.p.stdin.flush()
1224 hdr = self.p.stdout.readline()
1225 if hdr.endswith(' missing\n'):
1226 self.inprogress = None
1227 yield None, None, None
1229 info = hdr.split(' ')
1230 if len(info) != 3 or len(info[0]) != 40:
1231 raise GitError('expected object (id, type, size), got %r' % info)
1232 oidx, typ, size = info
1234 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1235 onabort=self._abort)
1237 yield oidx, typ, size
1240 readline_result = self.p.stdout.readline()
1241 assert(readline_result == '\n')
1242 self.inprogress = None
1243 except Exception as e:
1247 def _join(self, it):
1248 _, typ, _ = next(it)
1253 treefile = ''.join(it)
1254 for (mode, name, sha) in tree_decode(treefile):
1255 for blob in self.join(sha.encode('hex')):
1257 elif typ == 'commit':
1258 treeline = ''.join(it).split('\n')[0]
1259 assert(treeline.startswith('tree '))
1260 for blob in self.join(treeline[5:]):
1263 raise GitError('invalid object type %r: expected blob/tree/commit'
1267 """Generate a list of the content of all blobs that can be reached
1268 from an object. The hash given in 'id' must point to a blob, a tree
1269 or a commit. The content of all blobs that can be seen from trees or
1270 commits will be added to the list.
1273 for d in self._join(self.get(id)):
1275 except StopIteration:
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    # CatPipes are cached per absolute repository path in _cp.
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
        cp = CatPipe(repo_dir)


def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(n.startswith('refs/tags/'))
        tags[c].append(name) # more than one tag can point at 'c'
class MissingObject(KeyError):
    # Raised when a hash encountered during a walk isn't in the repository.
    def __init__(self, oid):
        KeyError.__init__(self, 'object %r is missing' % oid.encode('hex'))


WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
# chunked file will have a chunk_path of ['']. So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
# item.path = ['foo', 'bar', 'baz.bup']
# item.chunk_path = ['', '2d3115e', '016b097']
# item.type = 'tree'
1326 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1327 """Yield everything reachable from oidx via get_ref (which must behave
1328 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1329 returns true. Throw MissingObject if a hash encountered is
1330 missing from the repository, and don't read or return blob content
1331 in the data field unless include_data is set.
1334 # Maintain the pending stack on the heap to avoid stack overflow
1335 pending = [(oidx, [], [], None)]
1337 oidx, parent_path, chunk_path, mode = pending.pop()
1338 oid = oidx.decode('hex')
1339 if stop_at and stop_at(oidx):
1342 if (not include_data) and mode and stat.S_ISREG(mode):
1343 # If the object is a "regular file", then it's a leaf in
1344 # the graph, so we can skip reading the data if the caller
1345 # hasn't requested it.
1346 yield WalkItem(oid=oid, type='blob',
1347 chunk_path=chunk_path, path=parent_path,
1352 item_it = get_ref(oidx)
1353 get_oidx, typ, _ = next(item_it)
1355 raise MissingObject(oidx.decode('hex'))
1356 if typ not in ('blob', 'commit', 'tree'):
1357 raise Exception('unexpected repository object type %r' % typ)
1359 # FIXME: set the mode based on the type when the mode is None
1360 if typ == 'blob' and not include_data:
1361 # Dump data until we can ask cat_pipe not to fetch it
1362 for ignored in item_it:
1366 data = ''.join(item_it)
1368 yield WalkItem(oid=oid, type=typ,
1369 chunk_path=chunk_path, path=parent_path,
1371 data=(data if include_data else None))
1374 commit_items = parse_commit(data)
1375 for pid in commit_items.parents:
1376 pending.append((pid, parent_path, chunk_path, mode))
1377 pending.append((commit_items.tree, parent_path, chunk_path,
1378 hashsplit.GIT_MODE_TREE))
1380 for mode, name, ent_id in tree_decode(data):
1381 demangled, bup_type = demangle_name(name, mode)
1383 sub_path = parent_path
1384 sub_chunk_path = chunk_path + [name]
1386 sub_path = parent_path + [name]
1387 if bup_type == BUP_CHUNKED:
1388 sub_chunk_path = ['']
1390 sub_chunk_path = chunk_path
1391 pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,