lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import
   7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   8 from collections import namedtuple
   9 from itertools import islice
  10 from numbers import Integral
  11
  12 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  13 from bup.compat import range
  14 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  15                          fdatasync,
  16                          hostname, localtime, log, merge_iter,
  17                          mmap_read, mmap_readwrite,
  18                          parse_num,
  19                          progress, qprogress, shstr, stat_if_exists,
  20                          unlink, username, userfullname,
  21                          utc_offset_str)
  22
# Module-wide verbosity flag; when nonzero, PackWriter logs a mark per write.
verbose = 0
# When nonzero, PackIdxList.refresh() always acts as if skip_midx were True.
ignore_midx = 0
repodir = None  # The default repository, once initialized

# Numeric object type codes used in pack object headers, and the reverse map.
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Running counters updated by the index lookup code.
_total_searches = 0
_total_steps = 0
  32
  33
  34 class GitError(Exception):
  35     pass
  36
  37
  38 def _git_wait(cmd, p):
  39     rv = p.wait()
  40     if rv != 0:
  41         raise GitError('%s returned %d' % (shstr(cmd), rv))
  42
  43 def _git_capture(argv):
  44     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
  45     r = p.stdout.read()
  46     _git_wait(repr(argv), p)
  47     return r
  48
  49 def git_config_get(option, repo_dir=None):
  50     cmd = ('git', 'config', '--get', option)
  51     p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
  52                          preexec_fn=_gitenv(repo_dir=repo_dir))
  53     r = p.stdout.read()
  54     rc = p.wait()
  55     if rc == 0:
  56         return r
  57     if rc != 1:
  58         raise GitError('%s returned %d' % (cmd, rc))
  59     return None
  60
  61
  62 def parse_tz_offset(s):
  63     """UTC offset in seconds."""
  64     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  65     if s[0] == '-':
  66         return - tz_off
  67     return tz_off
  68
  69
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Characters allowed at either end of a name/mail string.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
# Characters allowed in the middle of a name/mail string.
_content_char = r'[^\0\n<>]'
# Either one or two "end" characters, or a longer string whose first and
# last characters are "end" characters.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Sign, two hour digits, and two minute digits (minutes limited to 00-59).
_tz_rx = r'[-+]\d\d[0-5]\d'
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Whole commit object: tree line, zero or more parent lines, author and
# committer lines, a blank line, then the message.
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Extracts the hash from each "parent" line of the captured parents group.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  87
  88
  89 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
  90 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
  91                                        'author_name', 'author_mail',
  92                                        'author_sec', 'author_offset',
  93                                        'committer_name', 'committer_mail',
  94                                        'committer_sec', 'committer_offset',
  95                                        'message'])
  96
  97 def parse_commit(content):
  98     commit_match = re.match(_commit_rx, content)
  99     if not commit_match:
 100         raise Exception('cannot parse commit %r' % content)
 101     matches = commit_match.groupdict()
 102     return CommitInfo(tree=matches['tree'],
 103                       parents=re.findall(_parent_hash_rx, matches['parents']),
 104                       author_name=matches['author_name'],
 105                       author_mail=matches['author_mail'],
 106                       author_sec=int(matches['asec']),
 107                       author_offset=parse_tz_offset(matches['atz']),
 108                       committer_name=matches['committer_name'],
 109                       committer_mail=matches['committer_mail'],
 110                       committer_sec=int(matches['csec']),
 111                       committer_offset=parse_tz_offset(matches['ctz']),
 112                       message=matches['message'])
 113
 114
 115 def get_cat_data(cat_iterator, expected_type):
 116     _, kind, _ = next(cat_iterator)
 117     if kind != expected_type:
 118         raise Exception('expected %r, saw %r' % (expected_type, kind))
 119     return ''.join(cat_iterator)
 120
 121 def get_commit_items(id, cp):
 122     return parse_commit(get_cat_data(cp.get(id), 'commit'))
 123
 124 def _local_git_date_str(epoch_sec):
 125     return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 126
 127
 128 def _git_date_str(epoch_sec, tz_offset_sec):
 129     offs =  tz_offset_sec // 60
 130     return '%d %s%02d%02d' \
 131         % (epoch_sec,
 132            '+' if offs >= 0 else '-',
 133            abs(offs) // 60,
 134            abs(offs) % 60)
 135
 136
 137 def repo(sub = '', repo_dir=None):
 138     """Get the path to the git repository or one of its subdirectories."""
 139     repo_dir = repo_dir or repodir
 140     if not repo_dir:
 141         raise GitError('You should call check_repo_or_die()')
 142
 143     # If there's a .git subdirectory, then the actual repo is in there.
 144     gd = os.path.join(repo_dir, '.git')
 145     if os.path.exists(gd):
 146         repo_dir = gd
 147
 148     return os.path.join(repo_dir, sub)
 149
 150
 151 def shorten_hash(s):
 152     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 153                   r'\1\2*\3', s)
 154
 155
 156 def repo_rel(path):
 157     full = os.path.abspath(path)
 158     fullrepo = os.path.abspath(repo(''))
 159     if not fullrepo.endswith('/'):
 160         fullrepo += '/'
 161     if full.startswith(fullrepo):
 162         path = full[len(fullrepo):]
 163     if path.startswith('index-cache/'):
 164         path = path[len('index-cache/'):]
 165     return shorten_hash(path)
 166
 167
 168 def all_packdirs():
 169     paths = [repo('objects/pack')]
 170     paths += glob.glob(repo('index-cache/*/.'))
 171     return paths
 172
 173
 174 def auto_midx(objdir):
 175     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 176     try:
 177         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 178     except OSError as e:
 179         # make sure 'args' gets printed to help with debugging
 180         add_error('%r: exception: %s' % (args, e))
 181         raise
 182     if rv:
 183         add_error('%r: returned %d' % (args, rv))
 184
 185     args = [path.exe(), 'bloom', '--dir', objdir]
 186     try:
 187         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 188     except OSError as e:
 189         # make sure 'args' gets printed to help with debugging
 190         add_error('%r: exception: %s' % (args, e))
 191         raise
 192     if rv:
 193         add_error('%r: returned %d' % (args, rv))
 194
 195
 196 def mangle_name(name, mode, gitmode):
 197     """Mangle a file name to present an abstract name for segmented files.
 198     Mangled file names will have the ".bup" extension added to them. If a
 199     file's name already ends with ".bup", a ".bupl" extension is added to
 200     disambiguate normal files from segmented ones.
 201     """
 202     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 203         assert(stat.S_ISDIR(gitmode))
 204         return name + '.bup'
 205     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 206         return name + '.bupl'
 207     else:
 208         return name
 209
 210
 211 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 212 def demangle_name(name, mode):
 213     """Remove name mangling from a file name, if necessary.
 214
 215     The return value is a tuple (demangled_filename,mode), where mode is one of
 216     the following:
 217
 218     * BUP_NORMAL  : files that should be read as-is from the repository
 219     * BUP_CHUNKED : files that were chunked and need to be reassembled
 220
 221     For more information on the name mangling algorithm, see mangle_name()
 222     """
 223     if name.endswith('.bupl'):
 224         return (name[:-5], BUP_NORMAL)
 225     elif name.endswith('.bup'):
 226         return (name[:-4], BUP_CHUNKED)
 227     elif name.endswith('.bupm'):
 228         return (name[:-5],
 229                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 230     else:
 231         return (name, BUP_NORMAL)
 232
 233
 234 def calc_hash(type, content):
 235     """Calculate some content's hash in the Git fashion."""
 236     header = '%s %d\0' % (type, len(content))
 237     sum = Sha1(header)
 238     sum.update(content)
 239     return sum.digest()
 240
 241
 242 def shalist_item_sort_key(ent):
 243     (mode, name, id) = ent
 244     assert(mode+0 == mode)
 245     if stat.S_ISDIR(mode):
 246         return name + '/'
 247     else:
 248         return name
 249
 250
 251 def tree_encode(shalist):
 252     """Generate a git tree object from (mode,name,hash) tuples."""
 253     shalist = sorted(shalist, key = shalist_item_sort_key)
 254     l = []
 255     for (mode,name,bin) in shalist:
 256         assert(mode)
 257         assert(mode+0 == mode)
 258         assert(name)
 259         assert(len(bin) == 20)
 260         s = '%o %s\0%s' % (mode,name,bin)
 261         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 262         l.append(s)
 263     return ''.join(l)
 264
 265
 266 def tree_decode(buf):
 267     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 268     ofs = 0
 269     while ofs < len(buf):
 270         z = buf.find('\0', ofs)
 271         assert(z > ofs)
 272         spl = buf[ofs:z].split(' ', 1)
 273         assert(len(spl) == 2)
 274         mode,name = spl
 275         sha = buf[z+1:z+1+20]
 276         ofs = z+1+20
 277         yield (int(mode, 8), name, sha)
 278
 279
 280 def _encode_packobj(type, content, compression_level=1):
 281     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 282         raise ValueError('invalid compression level %s' % compression_level)
 283     szout = ''
 284     sz = len(content)
 285     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 286     sz >>= 4
 287     while 1:
 288         if sz: szbits |= 0x80
 289         szout += chr(szbits)
 290         if not sz:
 291             break
 292         szbits = sz & 0x7f
 293         sz >>= 7
 294     z = zlib.compressobj(compression_level)
 295     yield szout
 296     yield z.compress(content)
 297     yield z.flush()
 298
 299
 300 def _encode_looseobj(type, content, compression_level=1):
 301     z = zlib.compressobj(compression_level)
 302     yield z.compress('%s %d\0' % (type, len(content)))
 303     yield z.compress(content)
 304     yield z.flush()
 305
 306
 307 def _decode_looseobj(buf):
 308     assert(buf);
 309     s = zlib.decompress(buf)
 310     i = s.find('\0')
 311     assert(i > 0)
 312     l = s[:i].split(' ')
 313     type = l[0]
 314     sz = int(l[1])
 315     content = s[i+1:]
 316     assert(type in _typemap)
 317     assert(sz == len(content))
 318     return (type, content)
 319
 320
 321 def _decode_packobj(buf):
 322     assert(buf)
 323     c = ord(buf[0])
 324     type = _typermap[(c & 0x70) >> 4]
 325     sz = c & 0x0f
 326     shift = 4
 327     i = 0
 328     while c & 0x80:
 329         i += 1
 330         c = ord(buf[i])
 331         sz |= (c & 0x7f) << shift
 332         shift += 7
 333         if not (c & 0x80):
 334             break
 335     return (type, zlib.decompress(buf[i+1:]))
 336
 337
 338 class PackIdx:
 339     def __init__(self):
 340         assert(0)
 341
 342     def find_offset(self, hash):
 343         """Get the offset of an object inside the index file."""
 344         idx = self._idx_from_hash(hash)
 345         if idx != None:
 346             return self._ofs_from_idx(idx)
 347         return None
 348
 349     def exists(self, hash, want_source=False):
 350         """Return nonempty if the object exists in this index."""
 351         if hash and (self._idx_from_hash(hash) != None):
 352             return want_source and os.path.basename(self.name) or True
 353         return None
 354
 355     def __len__(self):
 356         return int(self.fanout[255])
 357
 358     def _idx_from_hash(self, hash):
 359         global _total_searches, _total_steps
 360         _total_searches += 1
 361         assert(len(hash) == 20)
 362         b1 = ord(hash[0])
 363         start = self.fanout[b1-1] # range -1..254
 364         end = self.fanout[b1] # range 0..255
 365         want = str(hash)
 366         _total_steps += 1  # lookup table is a step
 367         while start < end:
 368             _total_steps += 1
 369             mid = start + (end-start)/2
 370             v = self._idx_to_hash(mid)
 371             if v < want:
 372                 start = mid+1
 373             elif v > want:
 374                 end = mid
 375             else: # got it!
 376                 return mid
 377         return None
 378
 379
 380 class PackIdxV1(PackIdx):
 381     """Object representation of a Git pack index (version 1) file."""
 382     def __init__(self, filename, f):
 383         self.name = filename
 384         self.idxnames = [self.name]
 385         self.map = mmap_read(f)
 386         self.fanout = list(struct.unpack('!256I',
 387                                          str(buffer(self.map, 0, 256*4))))
 388         self.fanout.append(0)  # entry "-1"
 389         nsha = self.fanout[255]
 390         self.sha_ofs = 256*4
 391         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 392
 393     def _ofs_from_idx(self, idx):
 394         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 395
 396     def _idx_to_hash(self, idx):
 397         return str(self.shatable[idx*24+4 : idx*24+24])
 398
 399     def __iter__(self):
 400         for i in range(self.fanout[255]):
 401             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 402
 403
 404 class PackIdxV2(PackIdx):
 405     """Object representation of a Git pack index (version 2) file."""
 406     def __init__(self, filename, f):
 407         self.name = filename
 408         self.idxnames = [self.name]
 409         self.map = mmap_read(f)
 410         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 411         self.fanout = list(struct.unpack('!256I',
 412                                          str(buffer(self.map, 8, 256*4))))
 413         self.fanout.append(0)  # entry "-1"
 414         nsha = self.fanout[255]
 415         self.sha_ofs = 8 + 256*4
 416         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 417         self.ofstable = buffer(self.map,
 418                                self.sha_ofs + nsha*20 + nsha*4,
 419                                nsha*4)
 420         self.ofs64table = buffer(self.map,
 421                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 422
 423     def _ofs_from_idx(self, idx):
 424         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 425         if ofs & 0x80000000:
 426             idx64 = ofs & 0x7fffffff
 427             ofs = struct.unpack('!Q',
 428                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 429         return ofs
 430
 431     def _idx_to_hash(self, idx):
 432         return str(self.shatable[idx*20:(idx+1)*20])
 433
 434     def __iter__(self):
 435         for i in range(self.fanout[255]):
 436             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 437
 438
 439 _mpi_count = 0
 440 class PackIdxList:
 441     def __init__(self, dir):
 442         global _mpi_count
 443         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 444         _mpi_count += 1
 445         self.dir = dir
 446         self.also = set()
 447         self.packs = []
 448         self.do_bloom = False
 449         self.bloom = None
 450         self.refresh()
 451
 452     def __del__(self):
 453         global _mpi_count
 454         _mpi_count -= 1
 455         assert(_mpi_count == 0)
 456
 457     def __iter__(self):
 458         return iter(idxmerge(self.packs))
 459
 460     def __len__(self):
 461         return sum(len(pack) for pack in self.packs)
 462
 463     def exists(self, hash, want_source=False):
 464         """Return nonempty if the object exists in the index files."""
 465         global _total_searches
 466         _total_searches += 1
 467         if hash in self.also:
 468             return True
 469         if self.do_bloom and self.bloom:
 470             if self.bloom.exists(hash):
 471                 self.do_bloom = False
 472             else:
 473                 _total_searches -= 1  # was counted by bloom
 474                 return None
 475         for i in xrange(len(self.packs)):
 476             p = self.packs[i]
 477             _total_searches -= 1  # will be incremented by sub-pack
 478             ix = p.exists(hash, want_source=want_source)
 479             if ix:
 480                 # reorder so most recently used packs are searched first
 481                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 482                 return ix
 483         self.do_bloom = True
 484         return None
 485
 486     def refresh(self, skip_midx = False):
 487         """Refresh the index list.
 488         This method verifies if .midx files were superseded (e.g. all of its
 489         contents are in another, bigger .midx file) and removes the superseded
 490         files.
 491
 492         If skip_midx is True, all work on .midx files will be skipped and .midx
 493         files will be removed from the list.
 494
 495         The module-global variable 'ignore_midx' can force this function to
 496         always act as if skip_midx was True.
 497         """
 498         self.bloom = None # Always reopen the bloom as it may have been relaced
 499         self.do_bloom = False
 500         skip_midx = skip_midx or ignore_midx
 501         d = dict((p.name, p) for p in self.packs
 502                  if not skip_midx or not isinstance(p, midx.PackMidx))
 503         if os.path.exists(self.dir):
 504             if not skip_midx:
 505                 midxl = []
 506                 for ix in self.packs:
 507                     if isinstance(ix, midx.PackMidx):
 508                         for name in ix.idxnames:
 509                             d[os.path.join(self.dir, name)] = ix
 510                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 511                     if not d.get(full):
 512                         mx = midx.PackMidx(full)
 513                         (mxd, mxf) = os.path.split(mx.name)
 514                         broken = False
 515                         for n in mx.idxnames:
 516                             if not os.path.exists(os.path.join(mxd, n)):
 517                                 log(('warning: index %s missing\n' +
 518                                     '  used by %s\n') % (n, mxf))
 519                                 broken = True
 520                         if broken:
 521                             mx.close()
 522                             del mx
 523                             unlink(full)
 524                         else:
 525                             midxl.append(mx)
 526                 midxl.sort(key=lambda ix:
 527                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 528                 for ix in midxl:
 529                     any_needed = False
 530                     for sub in ix.idxnames:
 531                         found = d.get(os.path.join(self.dir, sub))
 532                         if not found or isinstance(found, PackIdx):
 533                             # doesn't exist, or exists but not in a midx
 534                             any_needed = True
 535                             break
 536                     if any_needed:
 537                         d[ix.name] = ix
 538                         for name in ix.idxnames:
 539                             d[os.path.join(self.dir, name)] = ix
 540                     elif not ix.force_keep:
 541                         debug1('midx: removing redundant: %s\n'
 542                                % os.path.basename(ix.name))
 543                         ix.close()
 544                         unlink(ix.name)
 545             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 546                 if not d.get(full):
 547                     try:
 548                         ix = open_idx(full)
 549                     except GitError as e:
 550                         add_error(e)
 551                         continue
 552                     d[full] = ix
 553             bfull = os.path.join(self.dir, 'bup.bloom')
 554             if self.bloom is None and os.path.exists(bfull):
 555                 self.bloom = bloom.ShaBloom(bfull)
 556             self.packs = list(set(d.values()))
 557             self.packs.sort(reverse=True, key=lambda x: len(x))
 558             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 559                 self.do_bloom = True
 560             else:
 561                 self.bloom = None
 562         debug1('PackIdxList: using %d index%s.\n'
 563             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 564
 565     def add(self, hash):
 566         """Insert an additional object in the list."""
 567         self.also.add(hash)
 568
 569
 570 def open_idx(filename):
 571     if filename.endswith('.idx'):
 572         f = open(filename, 'rb')
 573         header = f.read(8)
 574         if header[0:4] == '\377tOc':
 575             version = struct.unpack('!I', header[4:8])[0]
 576             if version == 2:
 577                 return PackIdxV2(filename, f)
 578             else:
 579                 raise GitError('%s: expected idx file version 2, got %d'
 580                                % (filename, version))
 581         elif len(header) == 8 and header[0:4] < '\377tOc':
 582             return PackIdxV1(filename, f)
 583         else:
 584             raise GitError('%s: unrecognized idx file header' % filename)
 585     elif filename.endswith('.midx'):
 586         return midx.PackMidx(filename)
 587     else:
 588         raise GitError('idx filenames must end with .idx or .midx')
 589
 590
 591 def idxmerge(idxlist, final_progress=True):
 592     """Generate a list of all the objects reachable in a PackIdxList."""
 593     def pfunc(count, total):
 594         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 595                   % (count*100.0/total, count, total))
 596     def pfinal(count, total):
 597         if final_progress:
 598             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 599                      % (100, total, total))
 600     return merge_iter(idxlist, 10024, pfunc, pfinal)
 601
 602
 603 def _make_objcache():
 604     return PackIdxList(repo('objects/pack'))
 605
 606 # bup-gc assumes that it can disable all PackWriter activities
 607 # (bloom/midx/cache) via the constructor and close() arguments.
 608
 609 class PackWriter:
 610     """Writes Git objects inside a pack file."""
 611     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 612                  run_midx=True, on_pack_finish=None,
 613                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 614         self.repo_dir = repo_dir or repo()
 615         self.file = None
 616         self.parentfd = None
 617         self.count = 0
 618         self.outbytes = 0
 619         self.filename = None
 620         self.idx = None
 621         self.objcache_maker = objcache_maker
 622         self.objcache = None
 623         self.compression_level = compression_level
 624         self.run_midx=run_midx
 625         self.on_pack_finish = on_pack_finish
 626         if not max_pack_size:
 627             max_pack_size = git_config_get('pack.packSizeLimit',
 628                                            repo_dir=self.repo_dir)
 629             if max_pack_size is not None:
 630                 max_pack_size = parse_num(max_pack_size)
 631             if not max_pack_size:
 632                 # larger packs slow down pruning
 633                 max_pack_size = 1000 * 1000 * 1000
 634         self.max_pack_size = max_pack_size
 635         # cache memory usage is about 83 bytes per object
 636         self.max_pack_objects = max_pack_objects if max_pack_objects \
 637                                 else max(1, self.max_pack_size // 5000)
 638
    def __del__(self):
        """Close the writer when the object is destroyed."""
        self.close()
 641
    def __enter__(self):
        """Return the writer itself for use in a 'with' statement."""
        return self
 644
    def __exit__(self, type, value, traceback):
        """Close the writer on leaving the 'with' block.

        close() is called whether or not an exception is in flight, and
        the exception (if any) is not suppressed.
        """
        self.close()
 647
 648     def _open(self):
 649         if not self.file:
 650             objdir = dir = os.path.join(self.repo_dir, 'objects')
 651             fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
 652             try:
 653                 self.file = os.fdopen(fd, 'w+b')
 654             except:
 655                 os.close(fd)
 656                 raise
 657             try:
 658                 self.parentfd = os.open(objdir, os.O_RDONLY)
 659             except:
 660                 f = self.file
 661                 self.file = None
 662                 f.close()
 663                 raise
 664             assert(name.endswith('.pack'))
 665             self.filename = name[:-5]
 666             self.file.write('PACK\0\0\0\2\0\0\0\0')
 667             self.idx = list(list() for i in xrange(256))
 668
 669     def _raw_write(self, datalist, sha):
 670         self._open()
 671         f = self.file
 672         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 673         # the file never has a *partial* blob.  So let's make sure it's
 674         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 675         # to our hashsplit algorithm.)  f.write() does its own buffering,
 676         # but that's okay because we'll flush it in _end().
 677         oneblob = ''.join(datalist)
 678         try:
 679             f.write(oneblob)
 680         except IOError as e:
 681             raise GitError, e, sys.exc_info()[2]
 682         nw = len(oneblob)
 683         crc = zlib.crc32(oneblob) & 0xffffffff
 684         self._update_idx(sha, crc, nw)
 685         self.outbytes += nw
 686         self.count += 1
 687         return nw, crc
 688
 689     def _update_idx(self, sha, crc, size):
 690         assert(sha)
 691         if self.idx:
 692             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 693
 694     def _write(self, sha, type, content):
 695         if verbose:
 696             log('>')
 697         if not sha:
 698             sha = calc_hash(type, content)
 699         size, crc = self._raw_write(_encode_packobj(type, content,
 700                                                     self.compression_level),
 701                                     sha=sha)
 702         if self.outbytes >= self.max_pack_size \
 703            or self.count >= self.max_pack_objects:
 704             self.breakpoint()
 705         return sha
 706
 707     def breakpoint(self):
 708         """Clear byte and object counts and return the last processed id."""
 709         id = self._end(self.run_midx)
 710         self.outbytes = self.count = 0
 711         return id
 712
 713     def _require_objcache(self):
 714         if self.objcache is None and self.objcache_maker:
 715             self.objcache = self.objcache_maker()
 716         if self.objcache is None:
 717             raise GitError(
 718                     "PackWriter not opened or can't check exists w/o objcache")
 719
 720     def exists(self, id, want_source=False):
 721         """Return non-empty if an object is found in the object cache."""
 722         self._require_objcache()
 723         return self.objcache.exists(id, want_source=want_source)
 724
 725     def just_write(self, sha, type, content):
 726         """Write an object to the pack file without checking for duplication."""
 727         self._write(sha, type, content)
 728         # If nothing else, gc doesn't have/want an objcache
 729         if self.objcache is not None:
 730             self.objcache.add(sha)
 731
 732     def maybe_write(self, type, content):
 733         """Write an object to the pack file if not present and return its id."""
 734         sha = calc_hash(type, content)
 735         if not self.exists(sha):
 736             self._require_objcache()
 737             self.just_write(sha, type, content)
 738         return sha
 739
 740     def new_blob(self, blob):
 741         """Create a blob object in the pack with the supplied content."""
 742         return self.maybe_write('blob', blob)
 743
 744     def new_tree(self, shalist):
 745         """Create a tree object in the pack."""
 746         content = tree_encode(shalist)
 747         return self.maybe_write('tree', content)
 748
 749     def new_commit(self, tree, parent,
 750                    author, adate_sec, adate_tz,
 751                    committer, cdate_sec, cdate_tz,
 752                    msg):
 753         """Create a commit object in the pack.  The date_sec values must be
 754         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 755         if adate_tz:
 756             adate_str = _git_date_str(adate_sec, adate_tz)
 757         else:
 758             adate_str = _local_git_date_str(adate_sec)
 759         if cdate_tz:
 760             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 761         else:
 762             cdate_str = _local_git_date_str(cdate_sec)
 763         l = []
 764         if tree: l.append('tree %s' % tree.encode('hex'))
 765         if parent: l.append('parent %s' % parent.encode('hex'))
 766         if author: l.append('author %s %s' % (author, adate_str))
 767         if committer: l.append('committer %s %s' % (committer, cdate_str))
 768         l.append('')
 769         l.append(msg)
 770         return self.maybe_write('commit', '\n'.join(l))
 771
 772     def abort(self):
 773         """Remove the pack file from disk."""
 774         f = self.file
 775         if f:
 776             pfd = self.parentfd
 777             self.file = None
 778             self.parentfd = None
 779             self.idx = None
 780             try:
 781                 try:
 782                     os.unlink(self.filename + '.pack')
 783                 finally:
 784                     f.close()
 785             finally:
 786                 if pfd is not None:
 787                     os.close(pfd)
 788
    def _end(self, run_midx=True):
        """Finish the current pack and move it into objects/pack.

        Returns the final path prefix (without extension) of the new pack,
        or None if no pack file was open.  The object cache and in-memory
        index are dropped, the object count and trailing checksum are
        written, the .idx file is generated, and both files are renamed to
        'pack-<name>' using the value returned by _write_pack_idx_v2().
        If run_midx is true, auto_midx() is run afterwards; on_pack_finish,
        if set, is then called with the path prefix.
        """
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            # the file position is at the end after reading everything, so
            # this appends the checksum
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  'objects/pack/pack-' +  obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        # sync the directory descriptor opened in _open(), then release it
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, 'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix
 834
 835     def close(self, run_midx=True):
 836         """Close the pack file and move it to its definitive path."""
 837         return self._end(run_midx=run_midx)
 838
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a version 2 pack index to filename and return its name hash.

        idx is iterated as sections of entries; packbin is the pack
        checksum that gets appended to the index.  The return value is
        the hex SHA-1 of the 20*self.count bytes that follow the
        header and fan-out table (the object id list), which the
        caller uses to name the pack.
        """
        # Count the entries whose third field (presumably the pack
        # offset) does not fit in 31 bits; each one needs an extra
        # 8-byte slot in the index.
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            # Size the file up front so it can be mapped and filled in
            # by _helpers.write_idx().
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, idx, self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        # Reopen in append mode to add the two trailing checksums.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            # idx_sum covers everything read back from the file;
            # obj_list_sum covers only the object id list.
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            # Remainder of the file, including the packbin just appended.
            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()
 884
 885
 886 def _gitenv(repo_dir = None):
 887     if not repo_dir:
 888         repo_dir = repo()
 889     def env():
 890         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 891     return env
 892
 893
 894 def list_refs(patterns=None, repo_dir=None,
 895               limit_to_heads=False, limit_to_tags=False):
 896     """Yield (refname, hash) tuples for all repository refs unless
 897     patterns are specified.  In that case, only include tuples for
 898     refs matching those patterns (cf. git-show-ref(1)).  The limits
 899     restrict the result items to refs/heads or refs/tags.  If both
 900     limits are specified, items from both sources will be included.
 901
 902     """
 903     argv = ['git', 'show-ref']
 904     if limit_to_heads:
 905         argv.append('--heads')
 906     if limit_to_tags:
 907         argv.append('--tags')
 908     argv.append('--')
 909     if patterns:
 910         argv.extend(patterns)
 911     p = subprocess.Popen(argv,
 912                          preexec_fn = _gitenv(repo_dir),
 913                          stdout = subprocess.PIPE)
 914     out = p.stdout.read().strip()
 915     rv = p.wait()  # not fatal
 916     if rv:
 917         assert(not out)
 918     if out:
 919         for d in out.split('\n'):
 920             (sha, name) = d.split(' ', 1)
 921             yield (name, sha.decode('hex'))
 922
 923
 924 def read_ref(refname, repo_dir = None):
 925     """Get the commit id of the most recent commit made on a given ref."""
 926     refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
 927     l = tuple(islice(refs, 2))
 928     if l:
 929         assert(len(l) == 1)
 930         return l[0][1]
 931     else:
 932         return None
 933
 934
 935 def rev_list_invocation(ref_or_refs, count=None, format=None):
 936     if isinstance(ref_or_refs, compat.str_type):
 937         refs = (ref_or_refs,)
 938     else:
 939         refs = ref_or_refs
 940     argv = ['git', 'rev-list']
 941     if isinstance(count, Integral):
 942         argv.extend(['-n', str(count)])
 943     elif count:
 944         raise ValueError('unexpected count argument %r' % count)
 945
 946     if format:
 947         argv.append('--pretty=format:' + format)
 948     for ref in refs:
 949         assert not ref.startswith('-')
 950         argv.append(ref)
 951     argv.append('--')
 952     return argv
 953
 954
 955 def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
 956     """Yield information about commits as per "git rev-list".  If a format
 957     is not provided, yield one hex hash at a time.  If a format is
 958     provided, pass it to rev-list and call parse(git_stdout) for each
 959     commit with the stream positioned just after the rev-list "commit
 960     HASH" header line.  When a format is provided yield (oidx,
 961     parse(git_stdout)) for each commit.
 962
 963     """
 964     assert bool(parse) == bool(format)
 965     p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
 966                                              format=format),
 967                          preexec_fn = _gitenv(repo_dir),
 968                          stdout = subprocess.PIPE)
 969     if not format:
 970         for line in p.stdout:
 971             yield line.strip()
 972     else:
 973         line = p.stdout.readline()
 974         while line:
 975             s = line.strip()
 976             if not s.startswith('commit '):
 977                 raise Exception('unexpected line ' + s)
 978             yield s[7:], parse(p.stdout)
 979             line = p.stdout.readline()
 980
 981     rv = p.wait()  # not fatal
 982     if rv:
 983         raise GitError, 'git rev-list returned error %d' % rv
 984
 985
 986 def get_commit_dates(refs, repo_dir=None):
 987     """Get the dates for the specified commit refs.  For now, every unique
 988        string in refs must resolve to a different commit or this
 989        function will fail."""
 990     result = []
 991     for ref in refs:
 992         commit = get_commit_items(ref, cp(repo_dir))
 993         result.append(commit.author_sec)
 994     return result
 995
 996
 997 def rev_parse(committish, repo_dir=None):
 998     """Resolve the full hash for 'committish', if it exists.
 999
1000     Should be roughly equivalent to 'git rev-parse'.
1001
1002     Returns the hex value of the hash if it is found, None if 'committish' does
1003     not correspond to anything.
1004     """
1005     head = read_ref(committish, repo_dir=repo_dir)
1006     if head:
1007         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1008         return head
1009
1010     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
1011
1012     if len(committish) == 40:
1013         try:
1014             hash = committish.decode('hex')
1015         except TypeError:
1016             return None
1017
1018         if pL.exists(hash):
1019             return hash
1020
1021     return None
1022
1023
def update_ref(refname, newval, oldval, repo_dir=None):
    """Set a repository reference with "git update-ref".

    refname must start with refs/heads/ or refs/tags/ (asserted).
    newval and oldval are hex-encoded before being passed to git; a
    false oldval is passed as the empty string.  Raises GitError if
    git exits with a non-zero status.
    """
    previous = oldval if oldval else ''
    assert(refname.startswith(('refs/heads/', 'refs/tags/')))
    argv = ['git', 'update-ref', refname,
            newval.encode('hex'), previous.encode('hex')]
    proc = subprocess.Popen(argv, preexec_fn=_gitenv(repo_dir))
    _git_wait('git update-ref', proc)
1034
1035
def delete_ref(refname, oldvalue=None):
    """Remove a repository reference (see git update-ref(1)).

    refname must start with refs/ (asserted).  A true oldvalue is
    appended to the "git update-ref -d" command line as given.  Raises
    GitError if git exits with a non-zero status.
    """
    assert(refname.startswith('refs/'))
    argv = ['git', 'update-ref', '-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv, preexec_fn=_gitenv())
    _git_wait('git update-ref', proc)
1043
1044
def guess_repo(path=None):
    """Choose the repository path and store it in the global "repodir".

    A true path always wins.  Otherwise an already-set repodir is left
    alone; failing that, the BUP_DIR environment variable is used, and
    finally ~/.bup.  No check is made that a repository exists there;
    callers that need one would normally use check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = os.environ.get('BUP_DIR') or os.path.expanduser('~/.bup')
1059
1060
def init_repo(path=None):
    """Create the bare Git repository bup uses at the given path.

    The path is resolved with guess_repo().  Raises GitError if the
    parent directory is missing, if the target exists but is not a
    directory, or if any of the git commands exits non-zero.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)

    steps = (
        ('git init', ['git', '--bare', 'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        ('git config', ['git', 'config', 'pack.indexVersion', '2']),
        # Enable the reflog
        ('git config', ['git', 'config', 'core.logAllRefUpdates', 'true']),
    )
    for label, argv in steps:
        # git's stdout is sent to our stderr.
        proc = subprocess.Popen(argv, stdout=sys.stderr,
                                preexec_fn=_gitenv())
        _git_wait(label, proc)
1082
1083
def check_repo_or_die(path=None):
    """Exit the process unless a bup repository probably exists.

    Returns normally when <repo>/objects/pack is a directory.  Exits
    with status 15 when neither that path nor the repository directory
    itself can be stat'ed, and with status 14 in every other case,
    logging an error first.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + '/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %r is not a repository\n' % top)
    sys.exit(14)
1099
1100
_ver = None


def _ver_numbers(parts):
    """Return the leading all-digit components of parts as a tuple of ints."""
    numbers = []
    for part in parts:
        if not part.isdigit():
            break
        numbers.append(int(part))
    return tuple(numbers)


def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')

    The result of "git --version" is cached in the module-level _ver
    after the first call.  Raises GitError if git's output cannot be
    parsed or if the version is older than 1.5.3.1.
    """
    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        # The dot is escaped so that it cannot match a space and drag
        # trailing text such as "(Apple Git-130)" into the version.
        m = re.match(r'git version (\S+\.\S+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    # Compare numerically: as strings, '10' sorts before '5', which
    # would reject e.g. 1.10.0 as older than 1.5.3.1.
    if _ver_numbers(_ver) < _ver_numbers(needed):
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver
1126
1127
class _AbortableIter:
    """Iterator wrapper that runs a callback if iteration is cut short.

    Items come from the wrapped iterator 'it'.  If it raises anything
    other than StopIteration, or abort() is called (explicitly or when
    this object is deleted) before the iterator was exhausted, the
    'onabort' callback, if any, is called once.
    """
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.it)
        except StopIteration:
            # Normal exhaustion: mark as done so abort() does nothing.
            self.done = True
            raise
        except:
            # Deliberately catches everything: run the abort callback,
            # then let the original exception propagate.
            self.abort()
            raise

    # Python 2 spelling of the iterator protocol; existing callers use
    # it directly.
    next = __next__

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()
1156
1157
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self, repo_dir = None):
        """Remember repo_dir and check the git version; no process is
        started until the first get() or restart().

        Logs an error and exits with status 1 if ver() compares below
        ('1', '5', '6').
        """
        # NOTE(review): _ver_warned is declared global here but not
        # assigned or read in this method.
        global _ver_warned
        self.repo_dir = repo_dir
        # NOTE(review): ver() returns a tuple of strings, so this is a
        # string-wise comparison ('10' sorts before '5') -- confirm
        # that is acceptable.
        wanted = ('1','5','6')
        if ver() < wanted:
            log('error: git version must be at least 1.5.6\n')
            sys.exit(1)
        # p is the subprocess; inprogress is the ref whose data is
        # currently being read, or None.
        self.p = self.inprogress = None

    def _abort(self):
        """Close the pipes to any current subprocess and forget it."""
        # Only the pipes are closed here; the process is not waited on.
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        """Drop any current subprocess and start 'git cat-file --batch'
        with GIT_DIR set for self.repo_dir."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Only one get() may be in progress at a time (asserted), and ref
        may not contain newlines or carriage returns or start with '-'.
        Raises GitError if the reply header is not of the form
        "id type size" with a 40-character id.

        """
        # (Re)start the subprocess if there is none or it has exited.
        if not self.p or self.p.poll() != None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result == None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        assert(ref.find('\n') < 0)
        assert(ref.find('\r') < 0)
        assert(not ref.startswith('-'))
        self.inprogress = ref
        self.p.stdin.write('%s\n' % ref)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        # If reading the data fails or is abandoned, _abort() drops the
        # subprocess, whose output stream would otherwise be left
        # part-way through an object.
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self._abort)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception as e:
            it.abort()
            raise

    def _join(self, it):
        """Yield blob data for the object whose get() iterator is 'it'.

        A blob yields its own data; a tree yields the joined data of
        each entry in turn; a commit yields the joined data of its
        tree.  Raises GitError for any other object type.
        """
        _, typ, _ = next(it)
        if typ == 'blob':
            for blob in it:
                yield blob
        elif typ == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif typ == 'commit':
            # The first line of the commit is expected to be "tree <hex>".
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            # A StopIteration escaping the loop is logged, not re-raised.
            log('booger!\n')
1258
1259
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating it on first use.

    When repo_dir is false, the global repodir (or else repo()) is
    used.  Instances are cached in _cp, keyed by absolute path.
    """
    if not repo_dir:
        repo_dir = repodir or repo()
    key = os.path.abspath(repo_dir)
    pipe = _cp.get(key)
    if not pipe:
        pipe = CatPipe(key)
        _cp[key] = pipe
    return pipe
1273
1274
def tags(repo_dir = None):
    """Return a dict mapping each tagged hash to a list of its tag names.

    Built from list_refs() limited to tags, with the 'refs/tags/'
    prefix stripped from every name.
    """
    by_hash = {}
    for refname, oid in list_refs(repo_dir=repo_dir, limit_to_tags=True):
        assert(refname.startswith('refs/tags/'))
        # more than one tag can point at the same hash
        by_hash.setdefault(oid, []).append(refname[10:])
    return by_hash
1285
1286
class MissingObject(KeyError):
    """KeyError for an object id that could not be found.

    The id is kept in the 'oid' attribute; the exception message shows
    it hex-encoded.
    """
    def __init__(self, oid):
        self.oid = oid
        message = 'object %r is missing' % oid.encode('hex')
        super(MissingObject, self).__init__(message)
1291
1292
WalkItem = namedtuple(
    'WalkItem',
    ('oid', 'type', 'mode', 'path', 'chunk_path', 'data'))
# 'path' holds the mangled path.  When an item is a fragment of a
# chunked file, 'chunk_path' holds the chunked subtree path of that
# fragment, e.g. ['', '2d3115e', ...]; the top-level item of a chunked
# file has a chunk_path of [''].  A chunk subtree of the file
# '/foo/bar/baz' might therefore look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1305
1306
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    Raises Exception if an object is not a blob, commit or tree.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (hex oid, parent path, chunk path, mode).
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = oidx.decode('hex')
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type='blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        # The first item is the (oidx, type, size) header; a false oidx
        # means the object was not found.
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(oidx.decode('hex'))
        if typ not in ('blob', 'commit', 'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == 'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full, since their
            # content is needed below to find what they reference.
            data = ''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == 'commit':
            commit_items = parse_commit(data)
            # Queue the parents with this commit's own paths and mode,
            # then the commit's tree.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == 'tree':
            for mode, name, ent_id in tree_decode(data):
                # Only bup_type is used below; demangled is not.
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the file path stays
                    # fixed and the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file.
                        sub_chunk_path = ['']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,
                                mode))