lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import
   7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   8 from collections import namedtuple
   9 from itertools import islice
  10 from numbers import Integral
  11
  12 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  13 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  14                          fdatasync,
  15                          hostname, localtime, log, merge_iter,
  16                          mmap_read, mmap_readwrite,
  17                          parse_num,
  18                          progress, qprogress, shstr, stat_if_exists,
  19                          unlink, username, userfullname,
  20                          utc_offset_str)
  21
# When nonzero, PackWriter._write() logs a '>' for every object it writes.
verbose = 0
# When true, PackIdxList.refresh() always acts as if skip_midx were True.
ignore_midx = 0
repodir = None  # The default repository, once initialized

# Object type name -> numeric type code placed in a pack object header
# (see _encode_packobj()).
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
# Inverse of _typemap: numeric type code -> object type name
# (see _decode_packobj()).
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Lookup statistics, updated by PackIdx._idx_from_hash() and
# PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  31
  32
  33 class GitError(Exception):
  34     pass
  35
  36
  37 def _git_wait(cmd, p):
  38     rv = p.wait()
  39     if rv != 0:
  40         raise GitError('%s returned %d' % (shstr(cmd), rv))
  41
  42 def _git_capture(argv):
  43     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
  44     r = p.stdout.read()
  45     _git_wait(repr(argv), p)
  46     return r
  47
  48 def git_config_get(option, repo_dir=None):
  49     cmd = ('git', 'config', '--get', option)
  50     p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
  51                          preexec_fn=_gitenv(repo_dir=repo_dir))
  52     r = p.stdout.read()
  53     rc = p.wait()
  54     if rc == 0:
  55         return r
  56     if rc != 1:
  57         raise GitError('%s returned %d' % (cmd, rc))
  58     return None
  59
  60
  61 def parse_tz_offset(s):
  62     """UTC offset in seconds."""
  63     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  64     if s[0] == '-':
  65         return - tz_off
  66     return tz_off
  67
  68
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
# Characters accepted as the first and last character of a name/mail field.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
# Characters accepted anywhere inside a name/mail field.
_content_char = r'[^\0\n<>]'
# A name/mail field: either one or two start/end characters, or a start/end
# character, any run of content characters, and a final start/end character.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# UTC offset: a sign, two hour digits, and two minute digits (00-59).
_tz_rx = r'[-+]\d\d[0-5]\d'
# One "parent <40 hex digits>" header line, including its newline.
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# A whole commit object: the tree line, zero or more parent lines, the author
# and committer lines, a blank line, and then the message (everything left).
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Pulls the hex id out of each parent line; parse_commit() applies it to the
# 'parents' group matched by _commit_rx.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  86
  87
  88 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
  89 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
  90                                        'author_name', 'author_mail',
  91                                        'author_sec', 'author_offset',
  92                                        'committer_name', 'committer_mail',
  93                                        'committer_sec', 'committer_offset',
  94                                        'message'])
  95
  96 def parse_commit(content):
  97     commit_match = re.match(_commit_rx, content)
  98     if not commit_match:
  99         raise Exception('cannot parse commit %r' % content)
 100     matches = commit_match.groupdict()
 101     return CommitInfo(tree=matches['tree'],
 102                       parents=re.findall(_parent_hash_rx, matches['parents']),
 103                       author_name=matches['author_name'],
 104                       author_mail=matches['author_mail'],
 105                       author_sec=int(matches['asec']),
 106                       author_offset=parse_tz_offset(matches['atz']),
 107                       committer_name=matches['committer_name'],
 108                       committer_mail=matches['committer_mail'],
 109                       committer_sec=int(matches['csec']),
 110                       committer_offset=parse_tz_offset(matches['ctz']),
 111                       message=matches['message'])
 112
 113
 114 def get_commit_items(id, cp):
 115     commit_it = cp.get(id)
 116     _, typ, _ = next(commit_it)
 117     assert(typ == 'commit')
 118     commit_content = ''.join(commit_it)
 119     return parse_commit(commit_content)
 120
 121
 122 def _local_git_date_str(epoch_sec):
 123     return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 124
 125
 126 def _git_date_str(epoch_sec, tz_offset_sec):
 127     offs =  tz_offset_sec // 60
 128     return '%d %s%02d%02d' \
 129         % (epoch_sec,
 130            '+' if offs >= 0 else '-',
 131            abs(offs) // 60,
 132            abs(offs) % 60)
 133
 134
 135 def repo(sub = '', repo_dir=None):
 136     """Get the path to the git repository or one of its subdirectories."""
 137     repo_dir = repo_dir or repodir
 138     if not repo_dir:
 139         raise GitError('You should call check_repo_or_die()')
 140
 141     # If there's a .git subdirectory, then the actual repo is in there.
 142     gd = os.path.join(repo_dir, '.git')
 143     if os.path.exists(gd):
 144         repo_dir = gd
 145
 146     return os.path.join(repo_dir, sub)
 147
 148
 149 def shorten_hash(s):
 150     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 151                   r'\1\2*\3', s)
 152
 153
 154 def repo_rel(path):
 155     full = os.path.abspath(path)
 156     fullrepo = os.path.abspath(repo(''))
 157     if not fullrepo.endswith('/'):
 158         fullrepo += '/'
 159     if full.startswith(fullrepo):
 160         path = full[len(fullrepo):]
 161     if path.startswith('index-cache/'):
 162         path = path[len('index-cache/'):]
 163     return shorten_hash(path)
 164
 165
 166 def all_packdirs():
 167     paths = [repo('objects/pack')]
 168     paths += glob.glob(repo('index-cache/*/.'))
 169     return paths
 170
 171
 172 def auto_midx(objdir):
 173     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 174     try:
 175         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 176     except OSError as e:
 177         # make sure 'args' gets printed to help with debugging
 178         add_error('%r: exception: %s' % (args, e))
 179         raise
 180     if rv:
 181         add_error('%r: returned %d' % (args, rv))
 182
 183     args = [path.exe(), 'bloom', '--dir', objdir]
 184     try:
 185         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 186     except OSError as e:
 187         # make sure 'args' gets printed to help with debugging
 188         add_error('%r: exception: %s' % (args, e))
 189         raise
 190     if rv:
 191         add_error('%r: returned %d' % (args, rv))
 192
 193
 194 def mangle_name(name, mode, gitmode):
 195     """Mangle a file name to present an abstract name for segmented files.
 196     Mangled file names will have the ".bup" extension added to them. If a
 197     file's name already ends with ".bup", a ".bupl" extension is added to
 198     disambiguate normal files from segmented ones.
 199     """
 200     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 201         assert(stat.S_ISDIR(gitmode))
 202         return name + '.bup'
 203     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 204         return name + '.bupl'
 205     else:
 206         return name
 207
 208
 209 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 210 def demangle_name(name, mode):
 211     """Remove name mangling from a file name, if necessary.
 212
 213     The return value is a tuple (demangled_filename,mode), where mode is one of
 214     the following:
 215
 216     * BUP_NORMAL  : files that should be read as-is from the repository
 217     * BUP_CHUNKED : files that were chunked and need to be reassembled
 218
 219     For more information on the name mangling algorithm, see mangle_name()
 220     """
 221     if name.endswith('.bupl'):
 222         return (name[:-5], BUP_NORMAL)
 223     elif name.endswith('.bup'):
 224         return (name[:-4], BUP_CHUNKED)
 225     elif name.endswith('.bupm'):
 226         return (name[:-5],
 227                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 228     else:
 229         return (name, BUP_NORMAL)
 230
 231
 232 def calc_hash(type, content):
 233     """Calculate some content's hash in the Git fashion."""
 234     header = '%s %d\0' % (type, len(content))
 235     sum = Sha1(header)
 236     sum.update(content)
 237     return sum.digest()
 238
 239
 240 def shalist_item_sort_key(ent):
 241     (mode, name, id) = ent
 242     assert(mode+0 == mode)
 243     if stat.S_ISDIR(mode):
 244         return name + '/'
 245     else:
 246         return name
 247
 248
 249 def tree_encode(shalist):
 250     """Generate a git tree object from (mode,name,hash) tuples."""
 251     shalist = sorted(shalist, key = shalist_item_sort_key)
 252     l = []
 253     for (mode,name,bin) in shalist:
 254         assert(mode)
 255         assert(mode+0 == mode)
 256         assert(name)
 257         assert(len(bin) == 20)
 258         s = '%o %s\0%s' % (mode,name,bin)
 259         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 260         l.append(s)
 261     return ''.join(l)
 262
 263
 264 def tree_decode(buf):
 265     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 266     ofs = 0
 267     while ofs < len(buf):
 268         z = buf.find('\0', ofs)
 269         assert(z > ofs)
 270         spl = buf[ofs:z].split(' ', 1)
 271         assert(len(spl) == 2)
 272         mode,name = spl
 273         sha = buf[z+1:z+1+20]
 274         ofs = z+1+20
 275         yield (int(mode, 8), name, sha)
 276
 277
 278 def _encode_packobj(type, content, compression_level=1):
 279     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 280         raise ValueError('invalid compression level %s' % compression_level)
 281     szout = ''
 282     sz = len(content)
 283     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 284     sz >>= 4
 285     while 1:
 286         if sz: szbits |= 0x80
 287         szout += chr(szbits)
 288         if not sz:
 289             break
 290         szbits = sz & 0x7f
 291         sz >>= 7
 292     z = zlib.compressobj(compression_level)
 293     yield szout
 294     yield z.compress(content)
 295     yield z.flush()
 296
 297
 298 def _encode_looseobj(type, content, compression_level=1):
 299     z = zlib.compressobj(compression_level)
 300     yield z.compress('%s %d\0' % (type, len(content)))
 301     yield z.compress(content)
 302     yield z.flush()
 303
 304
 305 def _decode_looseobj(buf):
 306     assert(buf);
 307     s = zlib.decompress(buf)
 308     i = s.find('\0')
 309     assert(i > 0)
 310     l = s[:i].split(' ')
 311     type = l[0]
 312     sz = int(l[1])
 313     content = s[i+1:]
 314     assert(type in _typemap)
 315     assert(sz == len(content))
 316     return (type, content)
 317
 318
 319 def _decode_packobj(buf):
 320     assert(buf)
 321     c = ord(buf[0])
 322     type = _typermap[(c & 0x70) >> 4]
 323     sz = c & 0x0f
 324     shift = 4
 325     i = 0
 326     while c & 0x80:
 327         i += 1
 328         c = ord(buf[i])
 329         sz |= (c & 0x7f) << shift
 330         shift += 7
 331         if not (c & 0x80):
 332             break
 333     return (type, zlib.decompress(buf[i+1:]))
 334
 335
 336 class PackIdx:
 337     def __init__(self):
 338         assert(0)
 339
 340     def find_offset(self, hash):
 341         """Get the offset of an object inside the index file."""
 342         idx = self._idx_from_hash(hash)
 343         if idx != None:
 344             return self._ofs_from_idx(idx)
 345         return None
 346
 347     def exists(self, hash, want_source=False):
 348         """Return nonempty if the object exists in this index."""
 349         if hash and (self._idx_from_hash(hash) != None):
 350             return want_source and os.path.basename(self.name) or True
 351         return None
 352
 353     def __len__(self):
 354         return int(self.fanout[255])
 355
 356     def _idx_from_hash(self, hash):
 357         global _total_searches, _total_steps
 358         _total_searches += 1
 359         assert(len(hash) == 20)
 360         b1 = ord(hash[0])
 361         start = self.fanout[b1-1] # range -1..254
 362         end = self.fanout[b1] # range 0..255
 363         want = str(hash)
 364         _total_steps += 1  # lookup table is a step
 365         while start < end:
 366             _total_steps += 1
 367             mid = start + (end-start)/2
 368             v = self._idx_to_hash(mid)
 369             if v < want:
 370                 start = mid+1
 371             elif v > want:
 372                 end = mid
 373             else: # got it!
 374                 return mid
 375         return None
 376
 377
 378 class PackIdxV1(PackIdx):
 379     """Object representation of a Git pack index (version 1) file."""
 380     def __init__(self, filename, f):
 381         self.name = filename
 382         self.idxnames = [self.name]
 383         self.map = mmap_read(f)
 384         self.fanout = list(struct.unpack('!256I',
 385                                          str(buffer(self.map, 0, 256*4))))
 386         self.fanout.append(0)  # entry "-1"
 387         nsha = self.fanout[255]
 388         self.sha_ofs = 256*4
 389         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 390
 391     def _ofs_from_idx(self, idx):
 392         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 393
 394     def _idx_to_hash(self, idx):
 395         return str(self.shatable[idx*24+4 : idx*24+24])
 396
 397     def __iter__(self):
 398         for i in xrange(self.fanout[255]):
 399             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 400
 401
 402 class PackIdxV2(PackIdx):
 403     """Object representation of a Git pack index (version 2) file."""
 404     def __init__(self, filename, f):
 405         self.name = filename
 406         self.idxnames = [self.name]
 407         self.map = mmap_read(f)
 408         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 409         self.fanout = list(struct.unpack('!256I',
 410                                          str(buffer(self.map, 8, 256*4))))
 411         self.fanout.append(0)  # entry "-1"
 412         nsha = self.fanout[255]
 413         self.sha_ofs = 8 + 256*4
 414         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 415         self.ofstable = buffer(self.map,
 416                                self.sha_ofs + nsha*20 + nsha*4,
 417                                nsha*4)
 418         self.ofs64table = buffer(self.map,
 419                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 420
 421     def _ofs_from_idx(self, idx):
 422         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 423         if ofs & 0x80000000:
 424             idx64 = ofs & 0x7fffffff
 425             ofs = struct.unpack('!Q',
 426                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 427         return ofs
 428
 429     def _idx_to_hash(self, idx):
 430         return str(self.shatable[idx*20:(idx+1)*20])
 431
 432     def __iter__(self):
 433         for i in xrange(self.fanout[255]):
 434             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 435
 436
# Number of live PackIdxList instances; the constructor asserts it is 0.
_mpi_count = 0
class PackIdxList:
    """The set of pack indexes (.idx and .midx) found in one directory.

    Lookups consult, in order: hashes added with add(), the bloom filter
    (when enabled), and then each index in self.packs.
    """
    def __init__(self, dir):
        """Load the indexes found in dir (see refresh())."""
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the result of idxmerge() on the loaded indexes."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of the loaded indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # The bloom says "maybe": stop consulting it until a full
                # search of the packs misses again (see the end of this
                # method).
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file path to the index object that covers it.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # Every .idx named by an already-loaded midx is covered by it.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            # A midx that refers to a missing .idx is deleted.
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the largest (then most recently modified) first.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        ix.close()
                        unlink(ix.name)
            # Open every .idx that is not already covered; one that cannot
            # be opened is reported with add_error() and skipped.
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            # Only use the bloom when it is valid and holds at least as many
            # entries as the loaded indexes do.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 566
 567
 568 def open_idx(filename):
 569     if filename.endswith('.idx'):
 570         f = open(filename, 'rb')
 571         header = f.read(8)
 572         if header[0:4] == '\377tOc':
 573             version = struct.unpack('!I', header[4:8])[0]
 574             if version == 2:
 575                 return PackIdxV2(filename, f)
 576             else:
 577                 raise GitError('%s: expected idx file version 2, got %d'
 578                                % (filename, version))
 579         elif len(header) == 8 and header[0:4] < '\377tOc':
 580             return PackIdxV1(filename, f)
 581         else:
 582             raise GitError('%s: unrecognized idx file header' % filename)
 583     elif filename.endswith('.midx'):
 584         return midx.PackMidx(filename)
 585     else:
 586         raise GitError('idx filenames must end with .idx or .midx')
 587
 588
 589 def idxmerge(idxlist, final_progress=True):
 590     """Generate a list of all the objects reachable in a PackIdxList."""
 591     def pfunc(count, total):
 592         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 593                   % (count*100.0/total, count, total))
 594     def pfinal(count, total):
 595         if final_progress:
 596             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 597                      % (100, total, total))
 598     return merge_iter(idxlist, 10024, pfunc, pfinal)
 599
 600
 601 def _make_objcache():
 602     return PackIdxList(repo('objects/pack'))
 603
 604 # bup-gc assumes that it can disable all PackWriter activities
 605 # (bloom/midx/cache) via the constructor and close() arguments.
 606
 607 class PackWriter:
 608     """Writes Git objects inside a pack file."""
 609     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 610                  run_midx=True, on_pack_finish=None,
 611                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 612         self.repo_dir = repo_dir or repo()
 613         self.file = None
 614         self.parentfd = None
 615         self.count = 0
 616         self.outbytes = 0
 617         self.filename = None
 618         self.idx = None
 619         self.objcache_maker = objcache_maker
 620         self.objcache = None
 621         self.compression_level = compression_level
 622         self.run_midx=run_midx
 623         self.on_pack_finish = on_pack_finish
 624         if not max_pack_size:
 625             max_pack_size = git_config_get('pack.packSizeLimit',
 626                                            repo_dir=self.repo_dir)
 627             if max_pack_size is not None:
 628                 max_pack_size = parse_num(max_pack_size)
 629             if not max_pack_size:
 630                 # larger packs slow down pruning
 631                 max_pack_size = 1000 * 1000 * 1000
 632         self.max_pack_size = max_pack_size
 633         # cache memory usage is about 83 bytes per object
 634         self.max_pack_objects = max_pack_objects if max_pack_objects \
 635                                 else max(1, self.max_pack_size // 5000)
 636
    def __del__(self):
        """Finish any open pack via close() when the writer is collected."""
        self.close()
 639
    def __enter__(self):
        """Return the writer itself for use in a 'with' statement."""
        return self
 642
    def __exit__(self, type, value, traceback):
        """Call close() on leaving a 'with' block, whether or not an
        exception occurred; the exception is not suppressed."""
        self.close()
 645
 646     def _open(self):
 647         if not self.file:
 648             objdir = dir = os.path.join(self.repo_dir, 'objects')
 649             fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
 650             try:
 651                 self.file = os.fdopen(fd, 'w+b')
 652             except:
 653                 os.close(fd)
 654                 raise
 655             try:
 656                 self.parentfd = os.open(objdir, os.O_RDONLY)
 657             except:
 658                 f = self.file
 659                 self.file = None
 660                 f.close()
 661                 raise
 662             assert(name.endswith('.pack'))
 663             self.filename = name[:-5]
 664             self.file.write('PACK\0\0\0\2\0\0\0\0')
 665             self.idx = list(list() for i in xrange(256))
 666
    def _raw_write(self, datalist, sha):
        """Append one already-encoded object to the pack file.

        datalist is an iterable of strings that are joined and written
        in a single call.  The object is recorded in the pending index
        under sha, and the byte and object counters are updated.
        Returns (bytes written, crc32 of those bytes).  An IOError from
        the write is re-raised as GitError with the original traceback.
        """
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        # mask so the CRC is always an unsigned 32-bit value
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc
 686
 687     def _update_idx(self, sha, crc, size):
 688         assert(sha)
 689         if self.idx:
 690             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 691
 692     def _write(self, sha, type, content):
 693         if verbose:
 694             log('>')
 695         if not sha:
 696             sha = calc_hash(type, content)
 697         size, crc = self._raw_write(_encode_packobj(type, content,
 698                                                     self.compression_level),
 699                                     sha=sha)
 700         if self.outbytes >= self.max_pack_size \
 701            or self.count >= self.max_pack_objects:
 702             self.breakpoint()
 703         return sha
 704
 705     def breakpoint(self):
 706         """Clear byte and object counts and return the last processed id."""
 707         id = self._end(self.run_midx)
 708         self.outbytes = self.count = 0
 709         return id
 710
 711     def _require_objcache(self):
 712         if self.objcache is None and self.objcache_maker:
 713             self.objcache = self.objcache_maker()
 714         if self.objcache is None:
 715             raise GitError(
 716                     "PackWriter not opened or can't check exists w/o objcache")
 717
 718     def exists(self, id, want_source=False):
 719         """Return non-empty if an object is found in the object cache."""
 720         self._require_objcache()
 721         return self.objcache.exists(id, want_source=want_source)
 722
    def just_write(self, sha, type, content):
        """Write an object to the pack file, bypassing the objcache.

        No existence check is made here and the objcache is not updated,
        so the caller is responsible for not writing an object that
        already exists().  Returns None.
        """
        self._write(sha, type, content)
 727
 728     def maybe_write(self, type, content):
 729         """Write an object to the pack file if not present and return its id."""
 730         sha = calc_hash(type, content)
 731         if not self.exists(sha):
 732             self.just_write(sha, type, content)
 733             self._require_objcache()
 734             self.objcache.add(sha)
 735         return sha
 736
    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content.

        Returns the object's id, as returned by maybe_write().
        """
        return self.maybe_write('blob', blob)
 740
 741     def new_tree(self, shalist):
 742         """Create a tree object in the pack."""
 743         content = tree_encode(shalist)
 744         return self.maybe_write('tree', content)
 745
 746     def new_commit(self, tree, parent,
 747                    author, adate_sec, adate_tz,
 748                    committer, cdate_sec, cdate_tz,
 749                    msg):
 750         """Create a commit object in the pack.  The date_sec values must be
 751         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 752         if adate_tz:
 753             adate_str = _git_date_str(adate_sec, adate_tz)
 754         else:
 755             adate_str = _local_git_date_str(adate_sec)
 756         if cdate_tz:
 757             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 758         else:
 759             cdate_str = _local_git_date_str(cdate_sec)
 760         l = []
 761         if tree: l.append('tree %s' % tree.encode('hex'))
 762         if parent: l.append('parent %s' % parent.encode('hex'))
 763         if author: l.append('author %s %s' % (author, adate_str))
 764         if committer: l.append('committer %s %s' % (committer, cdate_str))
 765         l.append('')
 766         l.append(msg)
 767         return self.maybe_write('commit', '\n'.join(l))
 768
    def abort(self):
        """Remove the pack file from disk.

        Does nothing when no pack file is open.  Otherwise the writer's
        file, directory descriptor and pending index are cleared first,
        and the file and descriptor are closed even if the unlink fails.
        """
        f = self.file
        if f:
            pfd = self.parentfd
            self.file = None
            self.parentfd = None
            self.idx = None
            try:
                try:
                    os.unlink(self.filename + '.pack')
                finally:
                    f.close()
            finally:
                if pfd is not None:
                    os.close(pfd)
 785
    def _end(self, run_midx=True):
        """Finish the open pack and move it into objects/pack.

        Writes the final object count and the pack checksum, writes the
        matching .idx, renames both files to 'pack-<name>' under
        objects/pack, and syncs the objects directory descriptor.
        Optionally runs auto_midx(), then calls on_pack_finish if set.

        Returns the new pack's path without extension, or None if no
        pack file was open.  The objcache is dropped as a side effect.
        """
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  'objects/pack/pack-' +  obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        # sync the directory descriptor opened in _open(), then release it
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, 'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix
 831
 832     def close(self, run_midx=True):
 833         """Close the pack file and move it to its definitive path."""
 834         return self._end(run_midx=run_midx)
 835
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write the pack index to filename and return a hex digest.

        The main part of the index is produced by _helpers.write_idx()
        through a read/write mmap of the file.  packbin is appended
        after it, followed by a SHA-1 of everything in the file before
        that final digest.

        The return value is the hex SHA-1 of the 20 * self.count bytes
        that follow the 8-byte header and the fan-out table (presumably
        the sorted object ids -- confirm against _helpers.write_idx).
        """
        # Count the entries whose third field does not fit in 31 bits;
        # each one gets an 8-byte slot in the length computed below.
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            # Give the file its final size up front so that it can be
            # mapped and filled in place.
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, idx, self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        # Reopen in append mode: the writes below go to the end of the
        # file, while seek() only repositions the reads.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            # Header + fan-out (see the length computation above).
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            # The next 20 * count bytes feed both digests; the second
            # digest provides the return value.
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            # Everything that remains (including packbin) only goes
            # into the whole-file digest, which is then appended.
            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()
 881
 882
 883 def _gitenv(repo_dir = None):
 884     if not repo_dir:
 885         repo_dir = repo()
 886     def env():
 887         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 888     return env
 889
 890
 891 def list_refs(patterns=None, repo_dir=None,
 892               limit_to_heads=False, limit_to_tags=False):
 893     """Yield (refname, hash) tuples for all repository refs unless
 894     patterns are specified.  In that case, only include tuples for
 895     refs matching those patterns (cf. git-show-ref(1)).  The limits
 896     restrict the result items to refs/heads or refs/tags.  If both
 897     limits are specified, items from both sources will be included.
 898
 899     """
 900     argv = ['git', 'show-ref']
 901     if limit_to_heads:
 902         argv.append('--heads')
 903     if limit_to_tags:
 904         argv.append('--tags')
 905     argv.append('--')
 906     if patterns:
 907         argv.extend(patterns)
 908     p = subprocess.Popen(argv,
 909                          preexec_fn = _gitenv(repo_dir),
 910                          stdout = subprocess.PIPE)
 911     out = p.stdout.read().strip()
 912     rv = p.wait()  # not fatal
 913     if rv:
 914         assert(not out)
 915     if out:
 916         for d in out.split('\n'):
 917             (sha, name) = d.split(' ', 1)
 918             yield (name, sha.decode('hex'))
 919
 920
 921 def read_ref(refname, repo_dir = None):
 922     """Get the commit id of the most recent commit made on a given ref."""
 923     refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
 924     l = tuple(islice(refs, 2))
 925     if l:
 926         assert(len(l) == 1)
 927         return l[0][1]
 928     else:
 929         return None
 930
 931
 932 def rev_list_invocation(ref_or_refs, count=None, format=None):
 933     if isinstance(ref_or_refs, compat.str_type):
 934         refs = (ref_or_refs,)
 935     else:
 936         refs = ref_or_refs
 937     argv = ['git', 'rev-list']
 938     if isinstance(count, Integral):
 939         argv.extend(['-n', str(count)])
 940     elif count:
 941         raise ValueError('unexpected count argument %r' % count)
 942
 943     if format:
 944         argv.append('--pretty=format:' + format)
 945     for ref in refs:
 946         assert not ref.startswith('-')
 947         argv.append(ref)
 948     argv.append('--')
 949     return argv
 950
 951
 952 def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
 953     """Yield information about commits as per "git rev-list".  If a format
 954     is not provided, yield one hex hash at a time.  If a format is
 955     provided, pass it to rev-list and call parse(git_stdout) for each
 956     commit with the stream positioned just after the rev-list "commit
 957     HASH" header line.  When a format is provided yield (oidx,
 958     parse(git_stdout)) for each commit.
 959
 960     """
 961     assert bool(parse) == bool(format)
 962     p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
 963                                              format=format),
 964                          preexec_fn = _gitenv(repo_dir),
 965                          stdout = subprocess.PIPE)
 966     if not format:
 967         for line in p.stdout:
 968             yield line.strip()
 969     else:
 970         line = p.stdout.readline()
 971         while line:
 972             s = line.strip()
 973             if not s.startswith('commit '):
 974                 raise Exception('unexpected line ' + s)
 975             yield s[7:], parse(p.stdout)
 976             line = p.stdout.readline()
 977
 978     rv = p.wait()  # not fatal
 979     if rv:
 980         raise GitError, 'git rev-list returned error %d' % rv
 981
 982
 983 def get_commit_dates(refs, repo_dir=None):
 984     """Get the dates for the specified commit refs.  For now, every unique
 985        string in refs must resolve to a different commit or this
 986        function will fail."""
 987     result = []
 988     for ref in refs:
 989         commit = get_commit_items(ref, cp(repo_dir))
 990         result.append(commit.author_sec)
 991     return result
 992
 993
 994 def rev_parse(committish, repo_dir=None):
 995     """Resolve the full hash for 'committish', if it exists.
 996
 997     Should be roughly equivalent to 'git rev-parse'.
 998
 999     Returns the hex value of the hash if it is found, None if 'committish' does
1000     not correspond to anything.
1001     """
1002     head = read_ref(committish, repo_dir=repo_dir)
1003     if head:
1004         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1005         return head
1006
1007     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
1008
1009     if len(committish) == 40:
1010         try:
1011             hash = committish.decode('hex')
1012         except TypeError:
1013             return None
1014
1015         if pL.exists(hash):
1016             return hash
1017
1018     return None
1019
1020
def update_ref(refname, newval, oldval, repo_dir=None):
    """Set refname to newval by running 'git update-ref'.

    refname must live under refs/heads/ or refs/tags/ (asserted).
    newval and oldval are hex-encoded before being handed to git; a
    false oldval is passed as the empty string.  Raises GitError (via
    _git_wait) when git exits with a nonzero status.
    """
    assert(refname.startswith(('refs/heads/', 'refs/tags/')))
    cmd = ['git', 'update-ref', refname,
           newval.encode('hex'), (oldval or '').encode('hex')]
    proc = subprocess.Popen(cmd, preexec_fn = _gitenv(repo_dir))
    _git_wait('git update-ref', proc)
1031
1032
def delete_ref(refname, oldvalue=None):
    """Remove refname from the default repository by running
    'git update-ref -d' (see git update-ref(1)).

    refname must start with 'refs/' (asserted).  A true oldvalue is
    appended to the command line unchanged.
    """
    assert(refname.startswith('refs/'))
    cmd = ['git', 'update-ref', '-d', refname]
    if oldvalue:
        cmd.append(oldvalue)
    proc = subprocess.Popen(cmd, preexec_fn = _gitenv())
    _git_wait('git update-ref', proc)
1040
1041
def guess_repo(path=None):
    """Choose the value of the global variable "repodir".

    A true path always wins.  Otherwise an already-set repodir is left
    alone; failing that, the BUP_DIR environment variable is used, and
    finally ~/.bup.  Nothing here checks that the repository exists --
    code that needs one normally calls check_repo_or_die() instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = os.environ.get('BUP_DIR') or os.path.expanduser('~/.bup')
1056
1057
def init_repo(path=None):
    """Create the bare Git repository that bup uses at path (or at the
    guessed default location).

    Raises GitError when the parent directory is missing, when the
    target exists but is not a directory, or when one of the git
    commands fails.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)

    steps = (
        ('git init', ['git', '--bare', 'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        ('git config', ['git', 'config', 'pack.indexVersion', '2']),
        # Enable the reflog
        ('git config', ['git', 'config', 'core.logAllRefUpdates', 'true']),
    )
    for label, argv in steps:
        proc = subprocess.Popen(argv, stdout=sys.stderr,
                                preexec_fn = _gitenv())
        _git_wait(label, proc)
1079
1080
def check_repo_or_die(path=None):
    """Return quietly if a bup repository probably exists, else exit.

    "Probably exists" means that <repo>/objects/pack is a directory.
    Exits with status 15 when neither that path nor the repository
    directory itself can be stat'ed, and with status 14 in every other
    failure case; an error is logged either way.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + '/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %r is not a repository\n' % top)
    sys.exit(14)
1096
1097
_ver = None


def _ver_key(parts):
    """Return the leading numeric components of a version tuple as ints.

    Conversion stops at the first component that does not start with a
    digit, so ('2', '30', '0', 'windows', '1') becomes (2, 30, 0).
    """
    key = []
    for part in parts:
        m = re.match(r'\d+', part)
        if not m:
            break
        key.append(int(m.group(0)))
    return tuple(key)


def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')

    The result of running 'git --version' is cached in the module-level
    _ver, so git is only invoked on the first call.  Raises GitError if
    git fails, if its output cannot be parsed, or if the version is
    older than 1.5.3.1.
    """
    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        # The dot is escaped so that the capture stays a single
        # whitespace-free token: unescaped, "2.39.3 (Apple Git-145)"
        # was captured as "2.39.3 (Apple".
        m = re.match(r'git version (\S+\.\S+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    # Compare numerically; as strings '10' would sort before '3'.
    if _ver_key(_ver) < _ver_key(needed):
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver
1123
1124
class _AbortableIter:
    """Iterator wrapper that runs a callback when iteration ends early.

    Items come from 'it'.  'onabort', when given, is called at most once
    if the wrapped iterator raises anything other than StopIteration,
    if abort() is called before the iterator is exhausted, or if the
    wrapper is destroyed before then.  Normal exhaustion marks the
    wrapper as done and never triggers the callback.
    """
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def next(self):
        try:
            return next(self.it)
        except StopIteration:
            self.done = True
            raise
        except:
            # Deliberately broad: whatever interrupted the iteration,
            # run the abort callback and let the exception propagate.
            self.abort()
            raise

    # Python 3 spells the iterator protocol method __next__; without
    # this alias the object is not iterable there.
    __next__ = next

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()
1153
1154
1155 _ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    A single 'git cat-file --batch' child process is started on demand
    and reused across requests; only one object may be in the middle of
    being read at any time.
    """
    def __init__(self, repo_dir = None):
        """Remember repo_dir and check that git is at least 1.5.6; the
        child process itself is started lazily by get()."""
        self.repo_dir = repo_dir
        wanted = ('1','5','6')
        if ver() < wanted:
            log('error: git version must be at least 1.5.6\n')
            sys.exit(1)
        self.p = self.inprogress = None

    def _abort(self):
        """Close the pipes to the current child, if any, and forget it."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        """Drop any existing child and start a new 'git cat-file --batch'."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Raises GitError if git's header line is not of the form
        "<40-character id> <type> <size>".  If reading the data fails,
        the child process is abandoned so that the next get() starts a
        fresh one.

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # The request is a single line on the child's stdin, so the ref
        # must not contain line breaks or look like an option.
        assert(ref.find('\n') < 0)
        assert(ref.find('\r') < 0)
        assert(not ref.startswith('-'))
        self.inprogress = ref
        self.p.stdin.write('%s\n' % ref)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(' ')
        if len(info) != 3 or len(info[0]) != 40:
            # Report the header that was actually parsed; this used to
            # reference an undefined name and fail with NameError.
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self._abort)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _join(self, it):
        """Yield blob data for the object whose get() iterator is 'it',
        recursing through tree entries and through a commit's tree."""
        _, typ, _ = next(it)
        if typ == 'blob':
            for blob in it:
                yield blob
        elif typ == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif typ == 'commit':
            # The first line of the commit is expected to name its tree.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1254             log('booger!\n')
1255
1256
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating it on first use.

    Pipes are cached in the module-level _cp dictionary under the
    absolute repository path.  Without a repo_dir, the global repodir
    is used, or repo() if that is unset.
    """
    global _cp, repodir
    key = os.path.abspath(repo_dir or repodir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1270
1271
def tags(repo_dir = None):
    """Return a dictionary mapping each tagged hash to the list of tag
    names (without the 'refs/tags/' prefix) that point at it."""
    by_hash = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(refname.startswith('refs/tags/'))
        # 10 == len('refs/tags/'); several tags may share one hash.
        by_hash.setdefault(oid, []).append(refname[10:])
    return by_hash
1282
1283
class MissingObject(KeyError):
    """KeyError raised for an object id that is not in the repository.

    The id is kept unchanged in the 'oid' attribute; the message shows
    it hex-encoded.
    """
    def __init__(self, oid):
        self.oid = oid
        message = 'object %r is missing' % oid.encode('hex')
        KeyError.__init__(self, message)
1288
1289
# Record type yielded by walk_object() for every object it visits.
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1302
1303
def walk_object(cat_pipe, oidx,
                stop_at=None,
                include_data=None):
    """Yield everything reachable from oidx via cat_pipe as a WalkItem,
    stopping whenever stop_at(oidx) returns true.  Throw MissingObject
    if a hash encountered is missing from the repository, and don't
    read or return blob content in the data field unless include_data
    is set.

    oidx is a hex object id; each yielded WalkItem carries the decoded
    form in its oid field.  Objects of any type other than blob, commit
    or tree raise Exception.
    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (oidx, parent_path, chunk_path, mode); entries are
    # popped from the end, so the most recently pushed one is next.
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = oidx.decode('hex')
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type='blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        # The first item from get() is the (oidx, type, size) header,
        # or (None, None, None) when the object does not exist.
        item_it = cat_pipe.get(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(oidx.decode('hex'))
        if typ not in ('blob', 'commit', 'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == 'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full, because their
            # content is parsed below to find more objects.
            data = ''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == 'commit':
            commit_items = parse_commit(data)
            # Parents are queued with the commit's own path and mode;
            # the commit's tree is queued with the tree mode.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == 'tree':
            for mode, name, ent_id in tree_decode(data):
                # Only bup_type is used below; demangled is ignored.
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the path stays
                    # put and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file.
                        sub_chunk_path = ['']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,
                                mode))