lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import
   7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   8 from collections import namedtuple
   9 from itertools import islice
  10 from numbers import Integral
  11
  12 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  13 from bup.compat import range
  14 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  15                          fdatasync,
  16                          hostname, localtime, log,
  17                          merge_dict,
  18                          merge_iter,
  19                          mmap_read, mmap_readwrite,
  20                          parse_num,
  21                          progress, qprogress, shstr, stat_if_exists,
  22                          unlink, username, userfullname,
  23                          utc_offset_str)
  24
  25 verbose = 0
  26 ignore_midx = 0
  27 repodir = None  # The default repository, once initialized
  28
  29 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  30 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  31
  32 _total_searches = 0
  33 _total_steps = 0
  34
  35
  36 class GitError(Exception):
  37     pass
  38
  39
  40 def _gitenv(repo_dir=None):
  41     if not repo_dir:
  42         repo_dir = repo()
  43     return merge_dict(os.environ, {'GIT_DIR': os.path.abspath(repo_dir)})
  44
  45 def _git_wait(cmd, p):
  46     rv = p.wait()
  47     if rv != 0:
  48         raise GitError('%s returned %d' % (shstr(cmd), rv))
  49
  50 def _git_capture(argv):
  51     p = subprocess.Popen(argv, stdout=subprocess.PIPE, env=_gitenv())
  52     r = p.stdout.read()
  53     _git_wait(repr(argv), p)
  54     return r
  55
  56 def git_config_get(option, repo_dir=None):
  57     cmd = ('git', 'config', '--get', option)
  58     p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
  59                          env=_gitenv(repo_dir=repo_dir))
  60     r = p.stdout.read()
  61     rc = p.wait()
  62     if rc == 0:
  63         return r
  64     if rc != 1:
  65         raise GitError('%s returned %d' % (cmd, rc))
  66     return None
  67
  68
  69 def parse_tz_offset(s):
  70     """UTC offset in seconds."""
  71     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  72     if s[0] == '-':
  73         return - tz_off
  74     return tz_off
  75
  76
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Building blocks for the commit parser below.
# A "safe string" (used for names and mail addresses) is either one or two
# start/end characters, or a start/end character, any run of content
# characters, and a final start/end character.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
_content_char = r'[^\0\n<>]'
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Sign, two hour digits, two minute digits (minutes 00-59).
_tz_rx = r'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = r'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Whole-commit pattern: tree line, zero or more parent lines, author and
# committer lines, an optional mergetag, a blank line, then the message.
# The newlines inside the literal are part of the pattern.
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts each parent hash from the text captured by the 'parents' group.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  98
  99 # Note that the author_sec and committer_sec values are (UTC) epoch
 100 # seconds, and for now the mergetag is not included.
 101 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 102                                        'author_name', 'author_mail',
 103                                        'author_sec', 'author_offset',
 104                                        'committer_name', 'committer_mail',
 105                                        'committer_sec', 'committer_offset',
 106                                        'message'])
 107
 108 def parse_commit(content):
 109     commit_match = re.match(_commit_rx, content)
 110     if not commit_match:
 111         raise Exception('cannot parse commit %r' % content)
 112     matches = commit_match.groupdict()
 113     return CommitInfo(tree=matches['tree'],
 114                       parents=re.findall(_parent_hash_rx, matches['parents']),
 115                       author_name=matches['author_name'],
 116                       author_mail=matches['author_mail'],
 117                       author_sec=int(matches['asec']),
 118                       author_offset=parse_tz_offset(matches['atz']),
 119                       committer_name=matches['committer_name'],
 120                       committer_mail=matches['committer_mail'],
 121                       committer_sec=int(matches['csec']),
 122                       committer_offset=parse_tz_offset(matches['ctz']),
 123                       message=matches['message'])
 124
 125
 126 def get_cat_data(cat_iterator, expected_type):
 127     _, kind, _ = next(cat_iterator)
 128     if kind != expected_type:
 129         raise Exception('expected %r, saw %r' % (expected_type, kind))
 130     return ''.join(cat_iterator)
 131
 132 def get_commit_items(id, cp):
 133     return parse_commit(get_cat_data(cp.get(id), 'commit'))
 134
 135 def _local_git_date_str(epoch_sec):
 136     return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 137
 138
 139 def _git_date_str(epoch_sec, tz_offset_sec):
 140     offs =  tz_offset_sec // 60
 141     return '%d %s%02d%02d' \
 142         % (epoch_sec,
 143            '+' if offs >= 0 else '-',
 144            abs(offs) // 60,
 145            abs(offs) % 60)
 146
 147
 148 def repo(sub = '', repo_dir=None):
 149     """Get the path to the git repository or one of its subdirectories."""
 150     repo_dir = repo_dir or repodir
 151     if not repo_dir:
 152         raise GitError('You should call check_repo_or_die()')
 153
 154     # If there's a .git subdirectory, then the actual repo is in there.
 155     gd = os.path.join(repo_dir, '.git')
 156     if os.path.exists(gd):
 157         repo_dir = gd
 158
 159     return os.path.join(repo_dir, sub)
 160
 161
 162 def shorten_hash(s):
 163     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 164                   r'\1\2*\3', s)
 165
 166
 167 def repo_rel(path):
 168     full = os.path.abspath(path)
 169     fullrepo = os.path.abspath(repo(''))
 170     if not fullrepo.endswith('/'):
 171         fullrepo += '/'
 172     if full.startswith(fullrepo):
 173         path = full[len(fullrepo):]
 174     if path.startswith('index-cache/'):
 175         path = path[len('index-cache/'):]
 176     return shorten_hash(path)
 177
 178
 179 def all_packdirs():
 180     paths = [repo('objects/pack')]
 181     paths += glob.glob(repo('index-cache/*/.'))
 182     return paths
 183
 184
 185 def auto_midx(objdir):
 186     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 187     try:
 188         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 189     except OSError as e:
 190         # make sure 'args' gets printed to help with debugging
 191         add_error('%r: exception: %s' % (args, e))
 192         raise
 193     if rv:
 194         add_error('%r: returned %d' % (args, rv))
 195
 196     args = [path.exe(), 'bloom', '--dir', objdir]
 197     try:
 198         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 199     except OSError as e:
 200         # make sure 'args' gets printed to help with debugging
 201         add_error('%r: exception: %s' % (args, e))
 202         raise
 203     if rv:
 204         add_error('%r: returned %d' % (args, rv))
 205
 206
 207 def mangle_name(name, mode, gitmode):
 208     """Mangle a file name to present an abstract name for segmented files.
 209     Mangled file names will have the ".bup" extension added to them. If a
 210     file's name already ends with ".bup", a ".bupl" extension is added to
 211     disambiguate normal files from segmented ones.
 212     """
 213     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 214         assert(stat.S_ISDIR(gitmode))
 215         return name + '.bup'
 216     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 217         return name + '.bupl'
 218     else:
 219         return name
 220
 221
 222 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 223 def demangle_name(name, mode):
 224     """Remove name mangling from a file name, if necessary.
 225
 226     The return value is a tuple (demangled_filename,mode), where mode is one of
 227     the following:
 228
 229     * BUP_NORMAL  : files that should be read as-is from the repository
 230     * BUP_CHUNKED : files that were chunked and need to be reassembled
 231
 232     For more information on the name mangling algorithm, see mangle_name()
 233     """
 234     if name.endswith('.bupl'):
 235         return (name[:-5], BUP_NORMAL)
 236     elif name.endswith('.bup'):
 237         return (name[:-4], BUP_CHUNKED)
 238     elif name.endswith('.bupm'):
 239         return (name[:-5],
 240                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 241     else:
 242         return (name, BUP_NORMAL)
 243
 244
 245 def calc_hash(type, content):
 246     """Calculate some content's hash in the Git fashion."""
 247     header = '%s %d\0' % (type, len(content))
 248     sum = Sha1(header)
 249     sum.update(content)
 250     return sum.digest()
 251
 252
 253 def shalist_item_sort_key(ent):
 254     (mode, name, id) = ent
 255     assert(mode+0 == mode)
 256     if stat.S_ISDIR(mode):
 257         return name + '/'
 258     else:
 259         return name
 260
 261
 262 def tree_encode(shalist):
 263     """Generate a git tree object from (mode,name,hash) tuples."""
 264     shalist = sorted(shalist, key = shalist_item_sort_key)
 265     l = []
 266     for (mode,name,bin) in shalist:
 267         assert(mode)
 268         assert(mode+0 == mode)
 269         assert(name)
 270         assert(len(bin) == 20)
 271         s = '%o %s\0%s' % (mode,name,bin)
 272         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 273         l.append(s)
 274     return ''.join(l)
 275
 276
 277 def tree_decode(buf):
 278     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 279     ofs = 0
 280     while ofs < len(buf):
 281         z = buf.find('\0', ofs)
 282         assert(z > ofs)
 283         spl = buf[ofs:z].split(' ', 1)
 284         assert(len(spl) == 2)
 285         mode,name = spl
 286         sha = buf[z+1:z+1+20]
 287         ofs = z+1+20
 288         yield (int(mode, 8), name, sha)
 289
 290
 291 def _encode_packobj(type, content, compression_level=1):
 292     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 293         raise ValueError('invalid compression level %s' % compression_level)
 294     szout = ''
 295     sz = len(content)
 296     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 297     sz >>= 4
 298     while 1:
 299         if sz: szbits |= 0x80
 300         szout += chr(szbits)
 301         if not sz:
 302             break
 303         szbits = sz & 0x7f
 304         sz >>= 7
 305     z = zlib.compressobj(compression_level)
 306     yield szout
 307     yield z.compress(content)
 308     yield z.flush()
 309
 310
 311 def _encode_looseobj(type, content, compression_level=1):
 312     z = zlib.compressobj(compression_level)
 313     yield z.compress('%s %d\0' % (type, len(content)))
 314     yield z.compress(content)
 315     yield z.flush()
 316
 317
 318 def _decode_looseobj(buf):
 319     assert(buf);
 320     s = zlib.decompress(buf)
 321     i = s.find('\0')
 322     assert(i > 0)
 323     l = s[:i].split(' ')
 324     type = l[0]
 325     sz = int(l[1])
 326     content = s[i+1:]
 327     assert(type in _typemap)
 328     assert(sz == len(content))
 329     return (type, content)
 330
 331
 332 def _decode_packobj(buf):
 333     assert(buf)
 334     c = ord(buf[0])
 335     type = _typermap[(c & 0x70) >> 4]
 336     sz = c & 0x0f
 337     shift = 4
 338     i = 0
 339     while c & 0x80:
 340         i += 1
 341         c = ord(buf[i])
 342         sz |= (c & 0x7f) << shift
 343         shift += 7
 344         if not (c & 0x80):
 345             break
 346     return (type, zlib.decompress(buf[i+1:]))
 347
 348
 349 class PackIdx:
 350     def __init__(self):
 351         assert(0)
 352
 353     def find_offset(self, hash):
 354         """Get the offset of an object inside the index file."""
 355         idx = self._idx_from_hash(hash)
 356         if idx != None:
 357             return self._ofs_from_idx(idx)
 358         return None
 359
 360     def exists(self, hash, want_source=False):
 361         """Return nonempty if the object exists in this index."""
 362         if hash and (self._idx_from_hash(hash) != None):
 363             return want_source and os.path.basename(self.name) or True
 364         return None
 365
 366     def __len__(self):
 367         return int(self.fanout[255])
 368
 369     def _idx_from_hash(self, hash):
 370         global _total_searches, _total_steps
 371         _total_searches += 1
 372         assert(len(hash) == 20)
 373         b1 = ord(hash[0])
 374         start = self.fanout[b1-1] # range -1..254
 375         end = self.fanout[b1] # range 0..255
 376         want = str(hash)
 377         _total_steps += 1  # lookup table is a step
 378         while start < end:
 379             _total_steps += 1
 380             mid = start + (end-start)/2
 381             v = self._idx_to_hash(mid)
 382             if v < want:
 383                 start = mid+1
 384             elif v > want:
 385                 end = mid
 386             else: # got it!
 387                 return mid
 388         return None
 389
 390
 391 class PackIdxV1(PackIdx):
 392     """Object representation of a Git pack index (version 1) file."""
 393     def __init__(self, filename, f):
 394         self.name = filename
 395         self.idxnames = [self.name]
 396         self.map = mmap_read(f)
 397         self.fanout = list(struct.unpack('!256I', buffer(self.map, 0, 256 * 4)))
 398         self.fanout.append(0)  # entry "-1"
 399         nsha = self.fanout[255]
 400         self.sha_ofs = 256*4
 401         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 402
 403     def _ofs_from_idx(self, idx):
 404         ofs = idx * 24
 405         return struct.unpack('!I', self.shatable[ofs : ofs + 4])[0]
 406
 407     def _idx_to_hash(self, idx):
 408         ofs = idx * 24 + 4
 409         return self.shatable[ofs : ofs + 20]
 410
 411     def __iter__(self):
 412         count = self.fanout[255]
 413         start = 256 * 4 + 4
 414         for ofs in range(start, start + (24 * count), 24):
 415             yield self.map[ofs : ofs + 20]
 416
 417
 418 class PackIdxV2(PackIdx):
 419     """Object representation of a Git pack index (version 2) file."""
 420     def __init__(self, filename, f):
 421         self.name = filename
 422         self.idxnames = [self.name]
 423         self.map = mmap_read(f)
 424         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 425         self.fanout = list(struct.unpack('!256I',
 426                                          buffer(self.map[8 : 8 + 256 * 4])))
 427         self.fanout.append(0)  # entry "-1"
 428         nsha = self.fanout[255]
 429         self.sha_ofs = 8 + 256*4
 430         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 431         self.ofstable = buffer(self.map,
 432                                self.sha_ofs + nsha*20 + nsha*4,
 433                                nsha*4)
 434         self.ofs64table = buffer(self.map,
 435                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 436
 437     def _ofs_from_idx(self, idx):
 438         i = idx * 4
 439         ofs = struct.unpack('!I', self.ofstable[i : i + 4])[0]
 440         if ofs & 0x80000000:
 441             idx64 = ofs & 0x7fffffff
 442             idx64_i = idx64 * 8
 443             ofs = struct.unpack('!Q', self.ofs64table[idx64_i : idx64_i + 8])[0]
 444         return ofs
 445
 446     def _idx_to_hash(self, idx):
 447         return self.shatable[idx * 20 : (idx + 1) * 20]
 448
 449     def __iter__(self):
 450         count = self.fanout[255]
 451         start = 8 + 256 * 4
 452         for ofs in range(start, start + (20 * count), 20):
 453             yield self.map[ofs : ofs + 20]
 454
 455
 456 _mpi_count = 0
 457 class PackIdxList:
 458     def __init__(self, dir):
 459         global _mpi_count
 460         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 461         _mpi_count += 1
 462         self.dir = dir
 463         self.also = set()
 464         self.packs = []
 465         self.do_bloom = False
 466         self.bloom = None
 467         self.refresh()
 468
 469     def __del__(self):
 470         global _mpi_count
 471         _mpi_count -= 1
 472         assert(_mpi_count == 0)
 473
 474     def __iter__(self):
 475         return iter(idxmerge(self.packs))
 476
 477     def __len__(self):
 478         return sum(len(pack) for pack in self.packs)
 479
 480     def exists(self, hash, want_source=False):
 481         """Return nonempty if the object exists in the index files."""
 482         global _total_searches
 483         _total_searches += 1
 484         if hash in self.also:
 485             return True
 486         if self.do_bloom and self.bloom:
 487             if self.bloom.exists(hash):
 488                 self.do_bloom = False
 489             else:
 490                 _total_searches -= 1  # was counted by bloom
 491                 return None
 492         for i in xrange(len(self.packs)):
 493             p = self.packs[i]
 494             _total_searches -= 1  # will be incremented by sub-pack
 495             ix = p.exists(hash, want_source=want_source)
 496             if ix:
 497                 # reorder so most recently used packs are searched first
 498                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 499                 return ix
 500         self.do_bloom = True
 501         return None
 502
 503     def refresh(self, skip_midx = False):
 504         """Refresh the index list.
 505         This method verifies if .midx files were superseded (e.g. all of its
 506         contents are in another, bigger .midx file) and removes the superseded
 507         files.
 508
 509         If skip_midx is True, all work on .midx files will be skipped and .midx
 510         files will be removed from the list.
 511
 512         The module-global variable 'ignore_midx' can force this function to
 513         always act as if skip_midx was True.
 514         """
 515         self.bloom = None # Always reopen the bloom as it may have been relaced
 516         self.do_bloom = False
 517         skip_midx = skip_midx or ignore_midx
 518         d = dict((p.name, p) for p in self.packs
 519                  if not skip_midx or not isinstance(p, midx.PackMidx))
 520         if os.path.exists(self.dir):
 521             if not skip_midx:
 522                 midxl = []
 523                 for ix in self.packs:
 524                     if isinstance(ix, midx.PackMidx):
 525                         for name in ix.idxnames:
 526                             d[os.path.join(self.dir, name)] = ix
 527                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 528                     if not d.get(full):
 529                         mx = midx.PackMidx(full)
 530                         (mxd, mxf) = os.path.split(mx.name)
 531                         broken = False
 532                         for n in mx.idxnames:
 533                             if not os.path.exists(os.path.join(mxd, n)):
 534                                 log(('warning: index %s missing\n' +
 535                                     '  used by %s\n') % (n, mxf))
 536                                 broken = True
 537                         if broken:
 538                             mx.close()
 539                             del mx
 540                             unlink(full)
 541                         else:
 542                             midxl.append(mx)
 543                 midxl.sort(key=lambda ix:
 544                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 545                 for ix in midxl:
 546                     any_needed = False
 547                     for sub in ix.idxnames:
 548                         found = d.get(os.path.join(self.dir, sub))
 549                         if not found or isinstance(found, PackIdx):
 550                             # doesn't exist, or exists but not in a midx
 551                             any_needed = True
 552                             break
 553                     if any_needed:
 554                         d[ix.name] = ix
 555                         for name in ix.idxnames:
 556                             d[os.path.join(self.dir, name)] = ix
 557                     elif not ix.force_keep:
 558                         debug1('midx: removing redundant: %s\n'
 559                                % os.path.basename(ix.name))
 560                         ix.close()
 561                         unlink(ix.name)
 562             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 563                 if not d.get(full):
 564                     try:
 565                         ix = open_idx(full)
 566                     except GitError as e:
 567                         add_error(e)
 568                         continue
 569                     d[full] = ix
 570             bfull = os.path.join(self.dir, 'bup.bloom')
 571             if self.bloom is None and os.path.exists(bfull):
 572                 self.bloom = bloom.ShaBloom(bfull)
 573             self.packs = list(set(d.values()))
 574             self.packs.sort(reverse=True, key=lambda x: len(x))
 575             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 576                 self.do_bloom = True
 577             else:
 578                 self.bloom = None
 579         debug1('PackIdxList: using %d index%s.\n'
 580             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 581
 582     def add(self, hash):
 583         """Insert an additional object in the list."""
 584         self.also.add(hash)
 585
 586
 587 def open_idx(filename):
 588     if filename.endswith('.idx'):
 589         f = open(filename, 'rb')
 590         header = f.read(8)
 591         if header[0:4] == '\377tOc':
 592             version = struct.unpack('!I', header[4:8])[0]
 593             if version == 2:
 594                 return PackIdxV2(filename, f)
 595             else:
 596                 raise GitError('%s: expected idx file version 2, got %d'
 597                                % (filename, version))
 598         elif len(header) == 8 and header[0:4] < '\377tOc':
 599             return PackIdxV1(filename, f)
 600         else:
 601             raise GitError('%s: unrecognized idx file header' % filename)
 602     elif filename.endswith('.midx'):
 603         return midx.PackMidx(filename)
 604     else:
 605         raise GitError('idx filenames must end with .idx or .midx')
 606
 607
 608 def idxmerge(idxlist, final_progress=True):
 609     """Generate a list of all the objects reachable in a PackIdxList."""
 610     def pfunc(count, total):
 611         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 612                   % (count*100.0/total, count, total))
 613     def pfinal(count, total):
 614         if final_progress:
 615             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 616                      % (100, total, total))
 617     return merge_iter(idxlist, 10024, pfunc, pfinal)
 618
 619
 620 def _make_objcache():
 621     return PackIdxList(repo('objects/pack'))
 622
 623 # bup-gc assumes that it can disable all PackWriter activities
 624 # (bloom/midx/cache) via the constructor and close() arguments.
 625
 626 class PackWriter:
 627     """Writes Git objects inside a pack file."""
 628     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 629                  run_midx=True, on_pack_finish=None,
 630                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 631         self.repo_dir = repo_dir or repo()
 632         self.file = None
 633         self.parentfd = None
 634         self.count = 0
 635         self.outbytes = 0
 636         self.filename = None
 637         self.idx = None
 638         self.objcache_maker = objcache_maker
 639         self.objcache = None
 640         self.compression_level = compression_level
 641         self.run_midx=run_midx
 642         self.on_pack_finish = on_pack_finish
 643         if not max_pack_size:
 644             max_pack_size = git_config_get('pack.packSizeLimit',
 645                                            repo_dir=self.repo_dir)
 646             if max_pack_size is not None:
 647                 max_pack_size = parse_num(max_pack_size)
 648             if not max_pack_size:
 649                 # larger packs slow down pruning
 650                 max_pack_size = 1000 * 1000 * 1000
 651         self.max_pack_size = max_pack_size
 652         # cache memory usage is about 83 bytes per object
 653         self.max_pack_objects = max_pack_objects if max_pack_objects \
 654                                 else max(1, self.max_pack_size // 5000)
 655
 656     def __del__(self):
 657         self.close()
 658
 659     def __enter__(self):
 660         return self
 661
 662     def __exit__(self, type, value, traceback):
 663         self.close()
 664
 665     def _open(self):
 666         if not self.file:
 667             objdir = dir = os.path.join(self.repo_dir, 'objects')
 668             fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
 669             try:
 670                 self.file = os.fdopen(fd, 'w+b')
 671             except:
 672                 os.close(fd)
 673                 raise
 674             try:
 675                 self.parentfd = os.open(objdir, os.O_RDONLY)
 676             except:
 677                 f = self.file
 678                 self.file = None
 679                 f.close()
 680                 raise
 681             assert(name.endswith('.pack'))
 682             self.filename = name[:-5]
 683             self.file.write('PACK\0\0\0\2\0\0\0\0')
 684             self.idx = list(list() for i in xrange(256))
 685
 686     def _raw_write(self, datalist, sha):
 687         self._open()
 688         f = self.file
 689         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 690         # the file never has a *partial* blob.  So let's make sure it's
 691         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 692         # to our hashsplit algorithm.)  f.write() does its own buffering,
 693         # but that's okay because we'll flush it in _end().
 694         oneblob = ''.join(datalist)
 695         try:
 696             f.write(oneblob)
 697         except IOError as e:
 698             raise GitError, e, sys.exc_info()[2]
 699         nw = len(oneblob)
 700         crc = zlib.crc32(oneblob) & 0xffffffff
 701         self._update_idx(sha, crc, nw)
 702         self.outbytes += nw
 703         self.count += 1
 704         return nw, crc
 705
 706     def _update_idx(self, sha, crc, size):
 707         assert(sha)
 708         if self.idx:
 709             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 710
 711     def _write(self, sha, type, content):
 712         if verbose:
 713             log('>')
 714         if not sha:
 715             sha = calc_hash(type, content)
 716         size, crc = self._raw_write(_encode_packobj(type, content,
 717                                                     self.compression_level),
 718                                     sha=sha)
 719         if self.outbytes >= self.max_pack_size \
 720            or self.count >= self.max_pack_objects:
 721             self.breakpoint()
 722         return sha
 723
 724     def breakpoint(self):
 725         """Clear byte and object counts and return the last processed id."""
 726         id = self._end(self.run_midx)
 727         self.outbytes = self.count = 0
 728         return id
 729
 730     def _require_objcache(self):
 731         if self.objcache is None and self.objcache_maker:
 732             self.objcache = self.objcache_maker()
 733         if self.objcache is None:
 734             raise GitError(
 735                     "PackWriter not opened or can't check exists w/o objcache")
 736
 737     def exists(self, id, want_source=False):
 738         """Return non-empty if an object is found in the object cache."""
 739         self._require_objcache()
 740         return self.objcache.exists(id, want_source=want_source)
 741
 742     def just_write(self, sha, type, content):
 743         """Write an object to the pack file without checking for duplication."""
 744         self._write(sha, type, content)
 745         # If nothing else, gc doesn't have/want an objcache
 746         if self.objcache is not None:
 747             self.objcache.add(sha)
 748
 749     def maybe_write(self, type, content):
 750         """Write an object to the pack file if not present and return its id."""
 751         sha = calc_hash(type, content)
 752         if not self.exists(sha):
 753             self._require_objcache()
 754             self.just_write(sha, type, content)
 755         return sha
 756
 757     def new_blob(self, blob):
 758         """Create a blob object in the pack with the supplied content."""
 759         return self.maybe_write('blob', blob)
 760
 761     def new_tree(self, shalist):
 762         """Create a tree object in the pack."""
 763         content = tree_encode(shalist)
 764         return self.maybe_write('tree', content)
 765
 766     def new_commit(self, tree, parent,
 767                    author, adate_sec, adate_tz,
 768                    committer, cdate_sec, cdate_tz,
 769                    msg):
 770         """Create a commit object in the pack.  The date_sec values must be
 771         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 772         if adate_tz:
 773             adate_str = _git_date_str(adate_sec, adate_tz)
 774         else:
 775             adate_str = _local_git_date_str(adate_sec)
 776         if cdate_tz:
 777             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 778         else:
 779             cdate_str = _local_git_date_str(cdate_sec)
 780         l = []
 781         if tree: l.append('tree %s' % tree.encode('hex'))
 782         if parent: l.append('parent %s' % parent.encode('hex'))
 783         if author: l.append('author %s %s' % (author, adate_str))
 784         if committer: l.append('committer %s %s' % (committer, cdate_str))
 785         l.append('')
 786         l.append(msg)
 787         return self.maybe_write('commit', '\n'.join(l))
 788
 789     def abort(self):
 790         """Remove the pack file from disk."""
 791         f = self.file
 792         if f:
 793             pfd = self.parentfd
 794             self.file = None
 795             self.parentfd = None
 796             self.idx = None
 797             try:
 798                 try:
 799                     os.unlink(self.filename + '.pack')
 800                 finally:
 801                     f.close()
 802             finally:
 803                 if pfd is not None:
 804                     os.close(pfd)
 805
 806     def _end(self, run_midx=True):
 807         f = self.file
 808         if not f: return None
 809         self.file = None
 810         try:
 811             self.objcache = None
 812             idx = self.idx
 813             self.idx = None
 814
 815             # update object count
 816             f.seek(8)
 817             cp = struct.pack('!i', self.count)
 818             assert(len(cp) == 4)
 819             f.write(cp)
 820
 821             # calculate the pack sha1sum
 822             f.seek(0)
 823             sum = Sha1()
 824             for b in chunkyreader(f):
 825                 sum.update(b)
 826             packbin = sum.digest()
 827             f.write(packbin)
 828             fdatasync(f.fileno())
 829         finally:
 830             f.close()
 831
 832         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 833         nameprefix = os.path.join(self.repo_dir,
 834                                   'objects/pack/pack-' +  obj_list_sha)
 835         if os.path.exists(self.filename + '.map'):
 836             os.unlink(self.filename + '.map')
 837         os.rename(self.filename + '.pack', nameprefix + '.pack')
 838         os.rename(self.filename + '.idx', nameprefix + '.idx')
 839         try:
 840             os.fsync(self.parentfd)
 841         finally:
 842             os.close(self.parentfd)
 843
 844         if run_midx:
 845             auto_midx(os.path.join(self.repo_dir, 'objects/pack'))
 846
 847         if self.on_pack_finish:
 848             self.on_pack_finish(nameprefix)
 849
 850         return nameprefix
 851
 852     def close(self, run_midx=True):
 853         """Close the pack file and move it to its definitive path."""
 854         return self._end(run_midx=run_midx)
 855
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write the pack index to filename and return the hex digest
        of the index's 20*self.count bytes that follow the header and
        fan-out table.

        idx is iterated as a sequence of sections, each a sequence of
        entries.  packbin is appended after the table data written by
        _helpers.write_idx(), and a digest of everything before it is
        appended last.
        """
        # Count the entries whose entry[2] does not fit in 31 bits; each
        # one needs an 8-byte slot in the overflow-offsets area below.
        # (presumably entry[2] is the object's pack offset -- confirm)
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            # Size the file up front so that it can be mapped and filled
            # in place by write_idx().
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, idx, self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        # Reopen in append mode: every write below lands at the end of
        # the file, while the reads start from the seek(0) position.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            # Header and fan-out table only feed the whole-index digest.
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            # The next 20*self.count bytes feed both digests; their own
            # digest is the value returned to the caller.
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            # Everything else (including packbin) feeds the whole-index
            # digest, which is then appended to the file.
            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()
 901
 902
 903 def list_refs(patterns=None, repo_dir=None,
 904               limit_to_heads=False, limit_to_tags=False):
 905     """Yield (refname, hash) tuples for all repository refs unless
 906     patterns are specified.  In that case, only include tuples for
 907     refs matching those patterns (cf. git-show-ref(1)).  The limits
 908     restrict the result items to refs/heads or refs/tags.  If both
 909     limits are specified, items from both sources will be included.
 910
 911     """
 912     argv = ['git', 'show-ref']
 913     if limit_to_heads:
 914         argv.append('--heads')
 915     if limit_to_tags:
 916         argv.append('--tags')
 917     argv.append('--')
 918     if patterns:
 919         argv.extend(patterns)
 920     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
 921     out = p.stdout.read().strip()
 922     rv = p.wait()  # not fatal
 923     if rv:
 924         assert(not out)
 925     if out:
 926         for d in out.split('\n'):
 927             (sha, name) = d.split(' ', 1)
 928             yield (name, sha.decode('hex'))
 929
 930
 931 def read_ref(refname, repo_dir = None):
 932     """Get the commit id of the most recent commit made on a given ref."""
 933     refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
 934     l = tuple(islice(refs, 2))
 935     if l:
 936         assert(len(l) == 1)
 937         return l[0][1]
 938     else:
 939         return None
 940
 941
 942 def rev_list_invocation(ref_or_refs, count=None, format=None):
 943     if isinstance(ref_or_refs, compat.str_type):
 944         refs = (ref_or_refs,)
 945     else:
 946         refs = ref_or_refs
 947     argv = ['git', 'rev-list']
 948     if isinstance(count, Integral):
 949         argv.extend(['-n', str(count)])
 950     elif count:
 951         raise ValueError('unexpected count argument %r' % count)
 952
 953     if format:
 954         argv.append('--pretty=format:' + format)
 955     for ref in refs:
 956         assert not ref.startswith('-')
 957         argv.append(ref)
 958     argv.append('--')
 959     return argv
 960
 961
 962 def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
 963     """Yield information about commits as per "git rev-list".  If a format
 964     is not provided, yield one hex hash at a time.  If a format is
 965     provided, pass it to rev-list and call parse(git_stdout) for each
 966     commit with the stream positioned just after the rev-list "commit
 967     HASH" header line.  When a format is provided yield (oidx,
 968     parse(git_stdout)) for each commit.
 969
 970     """
 971     assert bool(parse) == bool(format)
 972     p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
 973                                              format=format),
 974                          env=_gitenv(repo_dir),
 975                          stdout = subprocess.PIPE)
 976     if not format:
 977         for line in p.stdout:
 978             yield line.strip()
 979     else:
 980         line = p.stdout.readline()
 981         while line:
 982             s = line.strip()
 983             if not s.startswith('commit '):
 984                 raise Exception('unexpected line ' + s)
 985             s = s[7:]
 986             assert len(s) == 40
 987             yield s, parse(p.stdout)
 988             line = p.stdout.readline()
 989
 990     rv = p.wait()  # not fatal
 991     if rv:
 992         raise GitError, 'git rev-list returned error %d' % rv
 993
 994
 995 def get_commit_dates(refs, repo_dir=None):
 996     """Get the dates for the specified commit refs.  For now, every unique
 997        string in refs must resolve to a different commit or this
 998        function will fail."""
 999     result = []
1000     for ref in refs:
1001         commit = get_commit_items(ref, cp(repo_dir))
1002         result.append(commit.author_sec)
1003     return result
1004
1005
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    'committish' is first looked up as a ref via read_ref(); failing
    that, a 40 character hex string is accepted if the object exists
    in the repository's pack indexes.

    Returns the binary (not hex) value of the hash if it is found, None
    if 'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
        return head

    # Only a full-length hex hash can still match; anything else is
    # unknown, so there is no need to open the pack indexes.
    if len(committish) != 40:
        return None
    try:
        oid = committish.decode('hex')
    except TypeError:
        # Not valid hex.
        return None

    pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
    if pL.exists(oid):
        return oid

    return None
1031
1032
def update_ref(refname, newval, oldval, repo_dir=None):
    """Run "git update-ref" to set refname (which must live under
    refs/heads/ or refs/tags/) to newval, passing oldval (or an empty
    value when oldval is false) as the expected old value.  Both
    values are hex-encoded for git."""
    expected = oldval if oldval else ''
    assert(refname.startswith(('refs/heads/', 'refs/tags/')))
    argv = ['git', 'update-ref', refname,
            newval.encode('hex'), expected.encode('hex')]
    proc = subprocess.Popen(argv, env=_gitenv(repo_dir))
    _git_wait('git update-ref', proc)
1043
1044
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1)).  A true
    oldvalue is handed to git as an extra argument after the ref name."""
    assert(refname.startswith('refs/'))
    argv = ['git', 'update-ref', '-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv, env=_gitenv())
    _git_wait('git update-ref', proc)
1052
1053
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".

    A true path always wins.  Otherwise an already set repodir is
    kept, and failing that the BUP_DIR environment variable and then
    ~/.bup are used.  Nothing is checked here, so this does not fail
    when no repository exists; code that needs one would normally
    call check_repo_or_die() instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = os.environ.get('BUP_DIR') or os.path.expanduser('~/.bup')
1068
1069
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    steps = (
        ('git init', ['git', '--bare', 'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        ('git config', ['git', 'config', 'pack.indexVersion', '2']),
        # Enable the reflog
        ('git config', ['git', 'config', 'core.logAllRefUpdates', 'true']),
    )
    for label, argv in steps:
        proc = subprocess.Popen(argv, stdout=sys.stderr, env=_gitenv())
        _git_wait(label, proc)
1091
1092
def check_repo_or_die(path=None):
    """Return if objects/pack under the repository is a directory;
    otherwise log an error and exit: with status 15 when neither
    objects/pack nor the repository path itself exists, with status
    14 in every other case."""
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + '/objects/pack')
    if pack_st and stat.S_ISDIR(pack_st.st_mode):
        return
    if not pack_st and not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %r is not a repository\n' % top)
    sys.exit(14)
1108
1109
_ver = None

def _ver_key(parts):
    """Return the leading numeric components of the version tuple
    'parts' as a tuple of ints, e.g. ('1', '10', '0-rc1') -> (1, 10, 0).
    Conversion stops at the first component that has no leading digits."""
    key = []
    for part in parts:
        digits = re.match(r'\d+', part)
        if not digits:
            break
        key.append(int(digits.group(0)))
    return tuple(key)

def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')

    The version is read from "git --version" once and then cached.
    Raises GitError if the output cannot be parsed or if the version
    is older than 1.5.3.1.
    """
    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        # The dot is escaped so that the match cannot run across a
        # space into trailing text such as "(Apple Git-143)".
        m = re.match(r'git version (\S+\.\S+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    # Compare numerically: as strings, '10' would sort before '5'.
    if _ver_key(_ver) < _ver_key(needed):
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver
1135
1136
1137 class _AbortableIter:
1138     def __init__(self, it, onabort = None):
1139         self.it = it
1140         self.onabort = onabort
1141         self.done = None
1142
1143     def __iter__(self):
1144         return self
1145
1146     def next(self):
1147         try:
1148             return next(self.it)
1149         except StopIteration as e:
1150             self.done = True
1151             raise
1152         except:
1153             self.abort()
1154             raise
1155
1156     def abort(self):
1157         """Abort iteration and call the abortion callback, if needed."""
1158         if not self.done:
1159             self.done = True
1160             if self.onabort:
1161                 self.onabort()
1162
1163     def __del__(self):
1164         self.abort()
1165
1166
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self, repo_dir = None):
        self.repo_dir = repo_dir
        if ver() < ('1','5','6'):
            log('error: git version must be at least 1.5.6\n')
            sys.exit(1)
        self.p = None
        self.inprogress = None

    def _abort(self):
        """Close both pipes of the running process, if any, and forget
        the process and any request in progress."""
        proc = self.p
        if proc:
            proc.stdout.close()
            proc.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        """Drop the current process and start a new
        'git cat-file --batch' for this repository."""
        self._abort()
        argv = ['git', 'cat-file', '--batch']
        self.p = subprocess.Popen(argv,
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  bufsize=4096,
                                  close_fds=True,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        """
        # (Re)start the helper when there is none or it has exited.
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        status = self.p.poll()
        assert(status is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # One request per line, and never something that looks like
        # an option.
        assert('\n' not in ref)
        assert('\r' not in ref)
        assert(not ref.startswith('-'))
        self.inprogress = ref
        self.p.stdin.write('%s\n' % ref)
        self.p.stdin.flush()
        header = self.p.stdout.readline()
        if header.endswith(' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        fields = header.split(' ')
        if len(fields) != 3 or len(fields[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % fields)
        oidx, typ, size = fields
        size = int(size)
        chunks = _AbortableIter(chunkyreader(self.p.stdout, size),
                                onabort=self._abort)
        try:
            yield oidx, typ, size
            for chunk in chunks:
                yield chunk
            # The object data is followed by a single newline.
            trailer = self.p.stdout.readline()
            assert(trailer == '\n')
            self.inprogress = None
        except Exception:
            chunks.abort()
            raise

    def _join(self, it):
        """Yield the blob content reachable from the object whose get()
        stream is 'it': the data itself for a blob, the joined content
        of every entry for a tree, and that of the tree named on the
        first line for a commit.  Any other type raises GitError."""
        _oidx, typ, _size = next(it)
        if typ == 'blob':
            for chunk in it:
                yield chunk
            return
        if typ == 'tree':
            for (mode, name, sha) in tree_decode(''.join(it)):
                for chunk in self.join(sha.encode('hex')):
                    yield chunk
            return
        if typ == 'commit':
            first_line = ''.join(it).split('\n')[0]
            assert(first_line.startswith('tree '))
            for chunk in self.join(first_line[5:]):
                yield chunk
            return
        raise GitError('invalid object type %r: expected blob/tree/commit'
                       % typ)

    def join(self, id):
        """Generate the content of all blobs that can be reached from
        an object.  The hash given in 'id' must point to a blob, a tree
        or a commit; for trees and commits the content of every blob
        reachable from them is generated.
        """
        try:
            for chunk in self._join(self.get(id)):
                yield chunk
        except StopIteration:
            log('booger!\n')
1266             log('booger!\n')
1267
1268
_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one.

    Pipes are cached per absolute repository path; without a
    repo_dir the global repodir, or else repo(), supplies the path."""
    key = os.path.abspath(repo_dir or repodir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1282
1283
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    prefix = 'refs/tags/'
    by_hash = {}
    for refname, oid in list_refs(repo_dir=repo_dir, limit_to_tags=True):
        assert(refname.startswith(prefix))
        # Several tags may point at the same hash, so collect them all.
        by_hash.setdefault(oid, []).append(refname[len(prefix):])
    return by_hash
1294
1295
class MissingObject(KeyError):
    """KeyError reporting an object id that could not be found.  The
    id passed to the constructor is kept, unchanged, in the oid
    attribute; the message shows it hex-encoded."""
    def __init__(self, oid):
        # hexlify() yields the same text as the str 'hex' codec on
        # Python 2 and, unlike that codec, also exists on Python 3.
        from binascii import hexlify
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
1300
1301
WalkItem = namedtuple('WalkItem', 'oid type mode path chunk_path data')
# "path" holds the mangled path.  When an item is a fragment of a
# chunked file, "chunk_path" holds the chunked subtree path for that
# chunk, i.e. ['', '2d3115e', ...]; the top-level item of a chunked
# file has a chunk_path of [''].  For example, some chunk subtree of
# the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1314
1315
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (hex oid, path, chunk_path, mode); the starting
    # object has empty paths and no mode.
    pending = [(oidx, [], [], None)]
    while len(pending):
        # pop() takes the most recently pushed entry first.
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = oidx.decode('hex')
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type='blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        # The first item from get_ref is the (oidx, type, size) header;
        # a false oidx there means that the object does not exist.
        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(oidx.decode('hex'))
        if typ not in ('blob', 'commit', 'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == 'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full, because their
            # content is parsed below to find the objects they refer to.
            data = ''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == 'commit':
            commit_items = parse_commit(data)
            # Parents inherit this commit's paths and mode; the tree is
            # pushed last, so it is the next entry to be popped.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == 'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the path stays as
                    # it is and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    # The (still mangled) entry name extends the path.
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file.
                        sub_chunk_path = ['']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,
                                mode))