lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   7 from collections import namedtuple
   8 from itertools import islice
   9 from numbers import Integral
  10
  11 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  12 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  13                          fdatasync,
  14                          hostname, localtime, log, merge_iter,
  15                          mmap_read, mmap_readwrite,
  16                          parse_num,
  17                          progress, qprogress, stat_if_exists,
  18                          unlink, username, userfullname,
  19                          utc_offset_str)
  20
# Nonzero makes PackWriter._write() log a '>' marker for each object written.
verbose = 0
# When true, PackIdxList.refresh() behaves as if skip_midx had been passed.
ignore_midx = 0
repodir = None  # The default repository, once initialized

# Object type name -> numeric type code placed in a pack object header by
# _encode_packobj(), and the reverse mapping used by _decode_packobj().
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Lookup statistics, updated by PackIdx._idx_from_hash() and
# PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  30
  31
  32 class GitError(Exception):
  33     pass
  34
  35
  36 def _git_wait(cmd, p):
  37     rv = p.wait()
  38     if rv != 0:
  39         raise GitError('%s returned %d' % (cmd, rv))
  40
  41 def _git_capture(argv):
  42     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
  43     r = p.stdout.read()
  44     _git_wait(repr(argv), p)
  45     return r
  46
  47 def git_config_get(option, repo_dir=None):
  48     cmd = ('git', 'config', '--get', option)
  49     p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
  50                          preexec_fn=_gitenv(repo_dir=repo_dir))
  51     r = p.stdout.read()
  52     rc = p.wait()
  53     if rc == 0:
  54         return r
  55     if rc != 1:
  56         raise GitError('%s returned %d' % (cmd, rc))
  57     return None
  58
  59
  60 def parse_tz_offset(s):
  61     """UTC offset in seconds."""
  62     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  63     if s[0] == '-':
  64         return - tz_off
  65     return tz_off
  66
  67
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
# A character accepted at either end of a name or mail field.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
# A character accepted in the interior of a name or mail field.
_content_char = r'[^\0\n<>]'
# Either one or two edge characters, or an edge character followed by any
# number of interior characters and a closing edge character.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# A sign, two hour digits, and two minute digits (minutes limited to 00-59).
_tz_rx = r'[-+]\d\d[0-5]\d'
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Matches a whole commit object: the tree line, zero or more parent lines,
# the author and committer lines, one blank line, and then the message
# (everything that remains, newlines included).
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Used with findall() by parse_commit() to pull each 40-digit parent id out
# of the text captured by the 'parents' group above.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')


# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
# The *_offset fields hold UTC offsets in seconds (see parse_tz_offset), and
# parents is the list of parent ids found by _parent_hash_rx.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
  94
  95 def parse_commit(content):
  96     commit_match = re.match(_commit_rx, content)
  97     if not commit_match:
  98         raise Exception('cannot parse commit %r' % content)
  99     matches = commit_match.groupdict()
 100     return CommitInfo(tree=matches['tree'],
 101                       parents=re.findall(_parent_hash_rx, matches['parents']),
 102                       author_name=matches['author_name'],
 103                       author_mail=matches['author_mail'],
 104                       author_sec=int(matches['asec']),
 105                       author_offset=parse_tz_offset(matches['atz']),
 106                       committer_name=matches['committer_name'],
 107                       committer_mail=matches['committer_mail'],
 108                       committer_sec=int(matches['csec']),
 109                       committer_offset=parse_tz_offset(matches['ctz']),
 110                       message=matches['message'])
 111
 112
 113 def get_commit_items(id, cp):
 114     commit_it = cp.get(id)
 115     _, typ, _ = next(commit_it)
 116     assert(typ == 'commit')
 117     commit_content = ''.join(commit_it)
 118     return parse_commit(commit_content)
 119
 120
 121 def _local_git_date_str(epoch_sec):
 122     return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 123
 124
 125 def _git_date_str(epoch_sec, tz_offset_sec):
 126     offs =  tz_offset_sec // 60
 127     return '%d %s%02d%02d' \
 128         % (epoch_sec,
 129            '+' if offs >= 0 else '-',
 130            abs(offs) // 60,
 131            abs(offs) % 60)
 132
 133
 134 def repo(sub = '', repo_dir=None):
 135     """Get the path to the git repository or one of its subdirectories."""
 136     global repodir
 137     repo_dir = repo_dir or repodir
 138     if not repo_dir:
 139         raise GitError('You should call check_repo_or_die()')
 140
 141     # If there's a .git subdirectory, then the actual repo is in there.
 142     gd = os.path.join(repo_dir, '.git')
 143     if os.path.exists(gd):
 144         repodir = gd
 145
 146     return os.path.join(repo_dir, sub)
 147
 148
 149 def shorten_hash(s):
 150     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 151                   r'\1\2*\3', s)
 152
 153
 154 def repo_rel(path):
 155     full = os.path.abspath(path)
 156     fullrepo = os.path.abspath(repo(''))
 157     if not fullrepo.endswith('/'):
 158         fullrepo += '/'
 159     if full.startswith(fullrepo):
 160         path = full[len(fullrepo):]
 161     if path.startswith('index-cache/'):
 162         path = path[len('index-cache/'):]
 163     return shorten_hash(path)
 164
 165
 166 def all_packdirs():
 167     paths = [repo('objects/pack')]
 168     paths += glob.glob(repo('index-cache/*/.'))
 169     return paths
 170
 171
 172 def auto_midx(objdir):
 173     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 174     try:
 175         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 176     except OSError as e:
 177         # make sure 'args' gets printed to help with debugging
 178         add_error('%r: exception: %s' % (args, e))
 179         raise
 180     if rv:
 181         add_error('%r: returned %d' % (args, rv))
 182
 183     args = [path.exe(), 'bloom', '--dir', objdir]
 184     try:
 185         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 186     except OSError as e:
 187         # make sure 'args' gets printed to help with debugging
 188         add_error('%r: exception: %s' % (args, e))
 189         raise
 190     if rv:
 191         add_error('%r: returned %d' % (args, rv))
 192
 193
 194 def mangle_name(name, mode, gitmode):
 195     """Mangle a file name to present an abstract name for segmented files.
 196     Mangled file names will have the ".bup" extension added to them. If a
 197     file's name already ends with ".bup", a ".bupl" extension is added to
 198     disambiguate normal files from segmented ones.
 199     """
 200     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 201         assert(stat.S_ISDIR(gitmode))
 202         return name + '.bup'
 203     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 204         return name + '.bupl'
 205     else:
 206         return name
 207
 208
 209 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 210 def demangle_name(name, mode):
 211     """Remove name mangling from a file name, if necessary.
 212
 213     The return value is a tuple (demangled_filename,mode), where mode is one of
 214     the following:
 215
 216     * BUP_NORMAL  : files that should be read as-is from the repository
 217     * BUP_CHUNKED : files that were chunked and need to be reassembled
 218
 219     For more information on the name mangling algorithm, see mangle_name()
 220     """
 221     if name.endswith('.bupl'):
 222         return (name[:-5], BUP_NORMAL)
 223     elif name.endswith('.bup'):
 224         return (name[:-4], BUP_CHUNKED)
 225     elif name.endswith('.bupm'):
 226         return (name[:-5],
 227                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 228     else:
 229         return (name, BUP_NORMAL)
 230
 231
 232 def calc_hash(type, content):
 233     """Calculate some content's hash in the Git fashion."""
 234     header = '%s %d\0' % (type, len(content))
 235     sum = Sha1(header)
 236     sum.update(content)
 237     return sum.digest()
 238
 239
 240 def shalist_item_sort_key(ent):
 241     (mode, name, id) = ent
 242     assert(mode+0 == mode)
 243     if stat.S_ISDIR(mode):
 244         return name + '/'
 245     else:
 246         return name
 247
 248
 249 def tree_encode(shalist):
 250     """Generate a git tree object from (mode,name,hash) tuples."""
 251     shalist = sorted(shalist, key = shalist_item_sort_key)
 252     l = []
 253     for (mode,name,bin) in shalist:
 254         assert(mode)
 255         assert(mode+0 == mode)
 256         assert(name)
 257         assert(len(bin) == 20)
 258         s = '%o %s\0%s' % (mode,name,bin)
 259         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 260         l.append(s)
 261     return ''.join(l)
 262
 263
 264 def tree_decode(buf):
 265     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 266     ofs = 0
 267     while ofs < len(buf):
 268         z = buf.find('\0', ofs)
 269         assert(z > ofs)
 270         spl = buf[ofs:z].split(' ', 1)
 271         assert(len(spl) == 2)
 272         mode,name = spl
 273         sha = buf[z+1:z+1+20]
 274         ofs = z+1+20
 275         yield (int(mode, 8), name, sha)
 276
 277
 278 def _encode_packobj(type, content, compression_level=1):
 279     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 280         raise ValueError('invalid compression level %s' % compression_level)
 281     szout = ''
 282     sz = len(content)
 283     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 284     sz >>= 4
 285     while 1:
 286         if sz: szbits |= 0x80
 287         szout += chr(szbits)
 288         if not sz:
 289             break
 290         szbits = sz & 0x7f
 291         sz >>= 7
 292     z = zlib.compressobj(compression_level)
 293     yield szout
 294     yield z.compress(content)
 295     yield z.flush()
 296
 297
 298 def _encode_looseobj(type, content, compression_level=1):
 299     z = zlib.compressobj(compression_level)
 300     yield z.compress('%s %d\0' % (type, len(content)))
 301     yield z.compress(content)
 302     yield z.flush()
 303
 304
 305 def _decode_looseobj(buf):
 306     assert(buf);
 307     s = zlib.decompress(buf)
 308     i = s.find('\0')
 309     assert(i > 0)
 310     l = s[:i].split(' ')
 311     type = l[0]
 312     sz = int(l[1])
 313     content = s[i+1:]
 314     assert(type in _typemap)
 315     assert(sz == len(content))
 316     return (type, content)
 317
 318
 319 def _decode_packobj(buf):
 320     assert(buf)
 321     c = ord(buf[0])
 322     type = _typermap[(c & 0x70) >> 4]
 323     sz = c & 0x0f
 324     shift = 4
 325     i = 0
 326     while c & 0x80:
 327         i += 1
 328         c = ord(buf[i])
 329         sz |= (c & 0x7f) << shift
 330         shift += 7
 331         if not (c & 0x80):
 332             break
 333     return (type, zlib.decompress(buf[i+1:]))
 334
 335
 336 class PackIdx:
 337     def __init__(self):
 338         assert(0)
 339
 340     def find_offset(self, hash):
 341         """Get the offset of an object inside the index file."""
 342         idx = self._idx_from_hash(hash)
 343         if idx != None:
 344             return self._ofs_from_idx(idx)
 345         return None
 346
 347     def exists(self, hash, want_source=False):
 348         """Return nonempty if the object exists in this index."""
 349         if hash and (self._idx_from_hash(hash) != None):
 350             return want_source and os.path.basename(self.name) or True
 351         return None
 352
 353     def __len__(self):
 354         return int(self.fanout[255])
 355
 356     def _idx_from_hash(self, hash):
 357         global _total_searches, _total_steps
 358         _total_searches += 1
 359         assert(len(hash) == 20)
 360         b1 = ord(hash[0])
 361         start = self.fanout[b1-1] # range -1..254
 362         end = self.fanout[b1] # range 0..255
 363         want = str(hash)
 364         _total_steps += 1  # lookup table is a step
 365         while start < end:
 366             _total_steps += 1
 367             mid = start + (end-start)/2
 368             v = self._idx_to_hash(mid)
 369             if v < want:
 370                 start = mid+1
 371             elif v > want:
 372                 end = mid
 373             else: # got it!
 374                 return mid
 375         return None
 376
 377
 378 class PackIdxV1(PackIdx):
 379     """Object representation of a Git pack index (version 1) file."""
 380     def __init__(self, filename, f):
 381         self.name = filename
 382         self.idxnames = [self.name]
 383         self.map = mmap_read(f)
 384         self.fanout = list(struct.unpack('!256I',
 385                                          str(buffer(self.map, 0, 256*4))))
 386         self.fanout.append(0)  # entry "-1"
 387         nsha = self.fanout[255]
 388         self.sha_ofs = 256*4
 389         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 390
 391     def _ofs_from_idx(self, idx):
 392         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 393
 394     def _idx_to_hash(self, idx):
 395         return str(self.shatable[idx*24+4 : idx*24+24])
 396
 397     def __iter__(self):
 398         for i in xrange(self.fanout[255]):
 399             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 400
 401
 402 class PackIdxV2(PackIdx):
 403     """Object representation of a Git pack index (version 2) file."""
 404     def __init__(self, filename, f):
 405         self.name = filename
 406         self.idxnames = [self.name]
 407         self.map = mmap_read(f)
 408         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 409         self.fanout = list(struct.unpack('!256I',
 410                                          str(buffer(self.map, 8, 256*4))))
 411         self.fanout.append(0)  # entry "-1"
 412         nsha = self.fanout[255]
 413         self.sha_ofs = 8 + 256*4
 414         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 415         self.ofstable = buffer(self.map,
 416                                self.sha_ofs + nsha*20 + nsha*4,
 417                                nsha*4)
 418         self.ofs64table = buffer(self.map,
 419                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 420
 421     def _ofs_from_idx(self, idx):
 422         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 423         if ofs & 0x80000000:
 424             idx64 = ofs & 0x7fffffff
 425             ofs = struct.unpack('!Q',
 426                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 427         return ofs
 428
 429     def _idx_to_hash(self, idx):
 430         return str(self.shatable[idx*20:(idx+1)*20])
 431
 432     def __iter__(self):
 433         for i in xrange(self.fanout[255]):
 434             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 435
 436
 437 _mpi_count = 0
 438 class PackIdxList:
 439     def __init__(self, dir):
 440         global _mpi_count
 441         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 442         _mpi_count += 1
 443         self.dir = dir
 444         self.also = set()
 445         self.packs = []
 446         self.do_bloom = False
 447         self.bloom = None
 448         self.refresh()
 449
 450     def __del__(self):
 451         global _mpi_count
 452         _mpi_count -= 1
 453         assert(_mpi_count == 0)
 454
 455     def __iter__(self):
 456         return iter(idxmerge(self.packs))
 457
 458     def __len__(self):
 459         return sum(len(pack) for pack in self.packs)
 460
 461     def exists(self, hash, want_source=False):
 462         """Return nonempty if the object exists in the index files."""
 463         global _total_searches
 464         _total_searches += 1
 465         if hash in self.also:
 466             return True
 467         if self.do_bloom and self.bloom:
 468             if self.bloom.exists(hash):
 469                 self.do_bloom = False
 470             else:
 471                 _total_searches -= 1  # was counted by bloom
 472                 return None
 473         for i in xrange(len(self.packs)):
 474             p = self.packs[i]
 475             _total_searches -= 1  # will be incremented by sub-pack
 476             ix = p.exists(hash, want_source=want_source)
 477             if ix:
 478                 # reorder so most recently used packs are searched first
 479                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 480                 return ix
 481         self.do_bloom = True
 482         return None
 483
 484     def refresh(self, skip_midx = False):
 485         """Refresh the index list.
 486         This method verifies if .midx files were superseded (e.g. all of its
 487         contents are in another, bigger .midx file) and removes the superseded
 488         files.
 489
 490         If skip_midx is True, all work on .midx files will be skipped and .midx
 491         files will be removed from the list.
 492
 493         The module-global variable 'ignore_midx' can force this function to
 494         always act as if skip_midx was True.
 495         """
 496         self.bloom = None # Always reopen the bloom as it may have been relaced
 497         self.do_bloom = False
 498         skip_midx = skip_midx or ignore_midx
 499         d = dict((p.name, p) for p in self.packs
 500                  if not skip_midx or not isinstance(p, midx.PackMidx))
 501         if os.path.exists(self.dir):
 502             if not skip_midx:
 503                 midxl = []
 504                 for ix in self.packs:
 505                     if isinstance(ix, midx.PackMidx):
 506                         for name in ix.idxnames:
 507                             d[os.path.join(self.dir, name)] = ix
 508                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 509                     if not d.get(full):
 510                         mx = midx.PackMidx(full)
 511                         (mxd, mxf) = os.path.split(mx.name)
 512                         broken = False
 513                         for n in mx.idxnames:
 514                             if not os.path.exists(os.path.join(mxd, n)):
 515                                 log(('warning: index %s missing\n' +
 516                                     '  used by %s\n') % (n, mxf))
 517                                 broken = True
 518                         if broken:
 519                             mx.close()
 520                             del mx
 521                             unlink(full)
 522                         else:
 523                             midxl.append(mx)
 524                 midxl.sort(key=lambda ix:
 525                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 526                 for ix in midxl:
 527                     any_needed = False
 528                     for sub in ix.idxnames:
 529                         found = d.get(os.path.join(self.dir, sub))
 530                         if not found or isinstance(found, PackIdx):
 531                             # doesn't exist, or exists but not in a midx
 532                             any_needed = True
 533                             break
 534                     if any_needed:
 535                         d[ix.name] = ix
 536                         for name in ix.idxnames:
 537                             d[os.path.join(self.dir, name)] = ix
 538                     elif not ix.force_keep:
 539                         debug1('midx: removing redundant: %s\n'
 540                                % os.path.basename(ix.name))
 541                         ix.close()
 542                         unlink(ix.name)
 543             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 544                 if not d.get(full):
 545                     try:
 546                         ix = open_idx(full)
 547                     except GitError as e:
 548                         add_error(e)
 549                         continue
 550                     d[full] = ix
 551             bfull = os.path.join(self.dir, 'bup.bloom')
 552             if self.bloom is None and os.path.exists(bfull):
 553                 self.bloom = bloom.ShaBloom(bfull)
 554             self.packs = list(set(d.values()))
 555             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 556             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 557                 self.do_bloom = True
 558             else:
 559                 self.bloom = None
 560         debug1('PackIdxList: using %d index%s.\n'
 561             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 562
 563     def add(self, hash):
 564         """Insert an additional object in the list."""
 565         self.also.add(hash)
 566
 567
 568 def open_idx(filename):
 569     if filename.endswith('.idx'):
 570         f = open(filename, 'rb')
 571         header = f.read(8)
 572         if header[0:4] == '\377tOc':
 573             version = struct.unpack('!I', header[4:8])[0]
 574             if version == 2:
 575                 return PackIdxV2(filename, f)
 576             else:
 577                 raise GitError('%s: expected idx file version 2, got %d'
 578                                % (filename, version))
 579         elif len(header) == 8 and header[0:4] < '\377tOc':
 580             return PackIdxV1(filename, f)
 581         else:
 582             raise GitError('%s: unrecognized idx file header' % filename)
 583     elif filename.endswith('.midx'):
 584         return midx.PackMidx(filename)
 585     else:
 586         raise GitError('idx filenames must end with .idx or .midx')
 587
 588
 589 def idxmerge(idxlist, final_progress=True):
 590     """Generate a list of all the objects reachable in a PackIdxList."""
 591     def pfunc(count, total):
 592         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 593                   % (count*100.0/total, count, total))
 594     def pfinal(count, total):
 595         if final_progress:
 596             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 597                      % (100, total, total))
 598     return merge_iter(idxlist, 10024, pfunc, pfinal)
 599
 600
 601 def _make_objcache():
 602     return PackIdxList(repo('objects/pack'))
 603
 604 # bup-gc assumes that it can disable all PackWriter activities
 605 # (bloom/midx/cache) via the constructor and close() arguments.
 606
 607 class PackWriter:
 608     """Writes Git objects inside a pack file."""
 609     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 610                  run_midx=True, on_pack_finish=None,
 611                  max_pack_size=None, max_pack_objects=None):
 612         self.repo_dir = repo()
 613         self.file = None
 614         self.parentfd = None
 615         self.count = 0
 616         self.outbytes = 0
 617         self.filename = None
 618         self.idx = None
 619         self.objcache_maker = objcache_maker
 620         self.objcache = None
 621         self.compression_level = compression_level
 622         self.run_midx=run_midx
 623         self.on_pack_finish = on_pack_finish
 624         if not max_pack_size:
 625             max_pack_size = git_config_get('pack.packSizeLimit',
 626                                            repo_dir=self.repo_dir)
 627             if max_pack_size is not None:
 628                 max_pack_size = parse_num(max_pack_size)
 629             if not max_pack_size:
 630                 # larger packs slow down pruning
 631                 max_pack_size = 1000 * 1000 * 1000
 632         self.max_pack_size = max_pack_size
 633         # cache memory usage is about 83 bytes per object
 634         self.max_pack_objects = max_pack_objects if max_pack_objects \
 635                                 else max(1, self.max_pack_size // 5000)
 636
 637     def __del__(self):
 638         self.close()
 639
 640     def _open(self):
 641         if not self.file:
 642             objdir = dir = os.path.join(self.repo_dir, 'objects')
 643             fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
 644             try:
 645                 self.file = os.fdopen(fd, 'w+b')
 646             except:
 647                 os.close(fd)
 648                 raise
 649             try:
 650                 self.parentfd = os.open(objdir, os.O_RDONLY)
 651             except:
 652                 f = self.file
 653                 self.file = None
 654                 f.close()
 655                 raise
 656             assert(name.endswith('.pack'))
 657             self.filename = name[:-5]
 658             self.file.write('PACK\0\0\0\2\0\0\0\0')
 659             self.idx = list(list() for i in xrange(256))
 660
 661     def _raw_write(self, datalist, sha):
 662         self._open()
 663         f = self.file
 664         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 665         # the file never has a *partial* blob.  So let's make sure it's
 666         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 667         # to our hashsplit algorithm.)  f.write() does its own buffering,
 668         # but that's okay because we'll flush it in _end().
 669         oneblob = ''.join(datalist)
 670         try:
 671             f.write(oneblob)
 672         except IOError as e:
 673             raise GitError, e, sys.exc_info()[2]
 674         nw = len(oneblob)
 675         crc = zlib.crc32(oneblob) & 0xffffffff
 676         self._update_idx(sha, crc, nw)
 677         self.outbytes += nw
 678         self.count += 1
 679         return nw, crc
 680
 681     def _update_idx(self, sha, crc, size):
 682         assert(sha)
 683         if self.idx:
 684             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 685
 686     def _write(self, sha, type, content):
 687         if verbose:
 688             log('>')
 689         if not sha:
 690             sha = calc_hash(type, content)
 691         size, crc = self._raw_write(_encode_packobj(type, content,
 692                                                     self.compression_level),
 693                                     sha=sha)
 694         if self.outbytes >= self.max_pack_size \
 695            or self.count >= self.max_pack_objects:
 696             self.breakpoint()
 697         return sha
 698
 699     def breakpoint(self):
 700         """Clear byte and object counts and return the last processed id."""
 701         id = self._end(self.run_midx)
 702         self.outbytes = self.count = 0
 703         return id
 704
 705     def _require_objcache(self):
 706         if self.objcache is None and self.objcache_maker:
 707             self.objcache = self.objcache_maker()
 708         if self.objcache is None:
 709             raise GitError(
 710                     "PackWriter not opened or can't check exists w/o objcache")
 711
 712     def exists(self, id, want_source=False):
 713         """Return non-empty if an object is found in the object cache."""
 714         self._require_objcache()
 715         return self.objcache.exists(id, want_source=want_source)
 716
 717     def just_write(self, sha, type, content):
 718         """Write an object to the pack file, bypassing the objcache.  Fails if
 719         sha exists()."""
 720         self._write(sha, type, content)
 721
 722     def maybe_write(self, type, content):
 723         """Write an object to the pack file if not present and return its id."""
 724         sha = calc_hash(type, content)
 725         if not self.exists(sha):
 726             self.just_write(sha, type, content)
 727             self._require_objcache()
 728             self.objcache.add(sha)
 729         return sha
 730
 731     def new_blob(self, blob):
 732         """Create a blob object in the pack with the supplied content."""
 733         return self.maybe_write('blob', blob)
 734
 735     def new_tree(self, shalist):
 736         """Create a tree object in the pack."""
 737         content = tree_encode(shalist)
 738         return self.maybe_write('tree', content)
 739
 740     def new_commit(self, tree, parent,
 741                    author, adate_sec, adate_tz,
 742                    committer, cdate_sec, cdate_tz,
 743                    msg):
 744         """Create a commit object in the pack.  The date_sec values must be
 745         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 746         if adate_tz:
 747             adate_str = _git_date_str(adate_sec, adate_tz)
 748         else:
 749             adate_str = _local_git_date_str(adate_sec)
 750         if cdate_tz:
 751             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 752         else:
 753             cdate_str = _local_git_date_str(cdate_sec)
 754         l = []
 755         if tree: l.append('tree %s' % tree.encode('hex'))
 756         if parent: l.append('parent %s' % parent.encode('hex'))
 757         if author: l.append('author %s %s' % (author, adate_str))
 758         if committer: l.append('committer %s %s' % (committer, cdate_str))
 759         l.append('')
 760         l.append(msg)
 761         return self.maybe_write('commit', '\n'.join(l))
 762
 763     def abort(self):
 764         """Remove the pack file from disk."""
 765         f = self.file
 766         if f:
 767             pfd = self.parentfd
 768             self.file = None
 769             self.parentfd = None
 770             self.idx = None
 771             try:
 772                 try:
 773                     os.unlink(self.filename + '.pack')
 774                 finally:
 775                     f.close()
 776             finally:
 777                 if pfd is not None:
 778                     os.close(pfd)
 779
    def _end(self, run_midx=True):
        """Finish the pack being written and move it into objects/pack.

        Returns the final path prefix (without '.pack'/'.idx'), or None
        when no pack is open.  The object cache and pending index are
        dropped from the writer.  When run_midx is true, auto_midx() is
        run on the pack directory afterwards; on_pack_finish, if set, is
        called with the returned prefix.
        """
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            # (bytes 8-11 of the header written by _open(), which were
            # left as zeros there)
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            # (reading the whole file leaves the position at the end, so
            # the digest is appended as a trailer)
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        # Write the index next to the temporary pack; its return value
        # names the final files.
        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  'objects/pack/pack-' +  obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        # fsync the directory descriptor opened by _open() after the
        # renames, then release it whether or not the fsync succeeded.
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, 'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix
 825
 826     def close(self, run_midx=True):
 827         """Close the pack file and move it to its definitive path."""
 828         return self._end(run_midx=run_midx)
 829
 830     def _write_pack_idx_v2(self, filename, idx, packbin):
 831         ofs64_count = 0
 832         for section in idx:
 833             for entry in section:
 834                 if entry[2] >= 2**31:
 835                     ofs64_count += 1
 836
 837         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 838         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 839         idx_map = None
 840         idx_f = open(filename, 'w+b')
 841         try:
 842             idx_f.truncate(index_len)
 843             fdatasync(idx_f.fileno())
 844             idx_map = mmap_readwrite(idx_f, close=False)
 845             try:
 846                 count = _helpers.write_idx(filename, idx_map, idx, self.count)
 847                 assert(count == self.count)
 848                 idx_map.flush()
 849             finally:
 850                 idx_map.close()
 851         finally:
 852             idx_f.close()
 853
 854         idx_f = open(filename, 'a+b')
 855         try:
 856             idx_f.write(packbin)
 857             idx_f.seek(0)
 858             idx_sum = Sha1()
 859             b = idx_f.read(8 + 4*256)
 860             idx_sum.update(b)
 861
 862             obj_list_sum = Sha1()
 863             for b in chunkyreader(idx_f, 20*self.count):
 864                 idx_sum.update(b)
 865                 obj_list_sum.update(b)
 866             namebase = obj_list_sum.hexdigest()
 867
 868             for b in chunkyreader(idx_f):
 869                 idx_sum.update(b)
 870             idx_f.write(idx_sum.digest())
 871             fdatasync(idx_f.fileno())
 872             return namebase
 873         finally:
 874             idx_f.close()
 875
 876
 877 def _gitenv(repo_dir = None):
 878     if not repo_dir:
 879         repo_dir = repo()
 880     def env():
 881         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 882     return env
 883
 884
 885 def list_refs(patterns=None, repo_dir=None,
 886               limit_to_heads=False, limit_to_tags=False):
 887     """Yield (refname, hash) tuples for all repository refs unless
 888     patterns are specified.  In that case, only include tuples for
 889     refs matching those patterns (cf. git-show-ref(1)).  The limits
 890     restrict the result items to refs/heads or refs/tags.  If both
 891     limits are specified, items from both sources will be included.
 892
 893     """
 894     argv = ['git', 'show-ref']
 895     if limit_to_heads:
 896         argv.append('--heads')
 897     if limit_to_tags:
 898         argv.append('--tags')
 899     argv.append('--')
 900     if patterns:
 901         argv.extend(patterns)
 902     p = subprocess.Popen(argv,
 903                          preexec_fn = _gitenv(repo_dir),
 904                          stdout = subprocess.PIPE)
 905     out = p.stdout.read().strip()
 906     rv = p.wait()  # not fatal
 907     if rv:
 908         assert(not out)
 909     if out:
 910         for d in out.split('\n'):
 911             (sha, name) = d.split(' ', 1)
 912             yield (name, sha.decode('hex'))
 913
 914
 915 def read_ref(refname, repo_dir = None):
 916     """Get the commit id of the most recent commit made on a given ref."""
 917     refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
 918     l = tuple(islice(refs, 2))
 919     if l:
 920         assert(len(l) == 1)
 921         return l[0][1]
 922     else:
 923         return None
 924
 925
 926 def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
 927     """Yield information about commits as per "git rev-list".  If a format
 928     is not provided, yield one hex hash at a time.  If a format is
 929     provided, pass it to rev-list and call parse(git_stdout) for each
 930     commit with the stream positioned just after the rev-list "commit
 931     HASH" header line.  When a format is provided yield (oidx,
 932     parse(git_stdout)) for each commit.
 933
 934     """
 935     assert bool(parse) == bool(format)
 936     if isinstance(ref_or_refs, compat.str_type):
 937         refs = (ref_or_refs,)
 938     else:
 939         refs = ref_or_refs
 940     argv = ['git', 'rev-list']
 941     if isinstance(count, Integral):
 942         argv.extend(['-n', str(count)])
 943     else:
 944         assert not count
 945     if format:
 946         argv.append('--pretty=format:' + format)
 947     for ref in refs:
 948         assert not ref.startswith('-')
 949         argv.append(ref)
 950     argv.append('--')
 951     p = subprocess.Popen(argv,
 952                          preexec_fn = _gitenv(repo_dir),
 953                          stdout = subprocess.PIPE)
 954     if not format:
 955         for line in p.stdout:
 956             yield line.strip()
 957     else:
 958         line = p.stdout.readline()
 959         while line:
 960             s = line.strip()
 961             if not s.startswith('commit '):
 962                 raise Exception('unexpected line ' + s)
 963             yield s[7:], parse(p.stdout)
 964             line = p.stdout.readline()
 965
 966     rv = p.wait()  # not fatal
 967     if rv:
 968         raise GitError, 'git rev-list returned error %d' % rv
 969
 970
 971 def get_commit_dates(refs, repo_dir=None):
 972     """Get the dates for the specified commit refs.  For now, every unique
 973        string in refs must resolve to a different commit or this
 974        function will fail."""
 975     result = []
 976     for ref in refs:
 977         commit = get_commit_items(ref, cp(repo_dir))
 978         result.append(commit.author_sec)
 979     return result
 980
 981
 982 def rev_parse(committish, repo_dir=None):
 983     """Resolve the full hash for 'committish', if it exists.
 984
 985     Should be roughly equivalent to 'git rev-parse'.
 986
 987     Returns the hex value of the hash if it is found, None if 'committish' does
 988     not correspond to anything.
 989     """
 990     head = read_ref(committish, repo_dir=repo_dir)
 991     if head:
 992         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 993         return head
 994
 995     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
 996
 997     if len(committish) == 40:
 998         try:
 999             hash = committish.decode('hex')
1000         except TypeError:
1001             return None
1002
1003         if pL.exists(hash):
1004             return hash
1005
1006     return None
1007
1008
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname at newval using 'git update-ref'.

    refname must live under refs/heads/ or refs/tags/.  newval and
    oldval are hex-encoded before being handed to git; a false oldval
    is sent as the empty string.  A failing git command is reported by
    _git_wait().
    """
    previous = oldval if oldval else ''
    assert refname.startswith(('refs/heads/', 'refs/tags/'))
    argv = ['git', 'update-ref', refname,
            newval.encode('hex'), previous.encode('hex')]
    proc = subprocess.Popen(argv, preexec_fn = _gitenv(repo_dir))
    _git_wait('git update-ref', proc)
1019
1020
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1)).

    refname must start with 'refs/'.  oldvalue, when true, is appended
    to the command line as given.  The command runs against the
    default repository, and a failure is reported by _git_wait().
    """
    assert(refname.startswith('refs/'))
    argv = ['git', 'update-ref', '-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv, preexec_fn = _gitenv())
    _git_wait('git update-ref', proc)
1028
1029
def guess_repo(path=None):
    """Choose the repository path stored in the global "repodir".

    A true path argument always wins.  Otherwise an already-set
    repodir is kept; failing that the BUP_DIR environment variable is
    used, and finally '~/.bup' (with the user's home expanded).

    This only records a location and does not check that a repository
    exists there.  Code that needs an existing repository would
    normally call check_repo_or_die() instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = os.environ.get('BUP_DIR') or os.path.expanduser('~/.bup')
1044
1045
def init_repo(path=None):
    """Create the bare Git repository that bup uses at the given path.

    The location is chosen by guess_repo(path).  GitError is raised
    when the parent directory does not exist or when the target exists
    but is not a directory.  After 'git --bare init', two settings are
    written with 'git config'; the output of every command is sent to
    stderr.
    """
    guess_repo(path)
    repo_path = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(repo_path))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(repo_path) \
       and not os.path.isdir(os.path.join(repo_path, '.')):
        raise GitError('"%s" exists but is not a directory\n' % repo_path)
    steps = (
        ('git init', ['git', '--bare', 'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        ('git config', ['git', 'config', 'pack.indexVersion', '2']),
        # Enable the reflog
        ('git config', ['git', 'config', 'core.logAllRefUpdates', 'true']),
    )
    for label, argv in steps:
        proc = subprocess.Popen(argv, stdout=sys.stderr,
                                preexec_fn = _gitenv())
        _git_wait(label, proc)
1067
1068
def check_repo_or_die(path=None):
    """Return quietly if a bup repository probably exists, else exit.

    The repository is accepted when '<repo>/objects/pack' is a
    directory.  If neither that path nor the repository path itself
    exists, an error is logged and the process exits with status 15;
    in every other case it exits with status 14.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + '/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %r is not a repository\n' % top)
    sys.exit(14)
1084
1085
_ver = None
def ver():
    """Return Git's version, checking that it is recent enough.

    The version is parsed from 'git --version' once and cached in the
    module-level _ver.  It is returned as a tuple of strings, one per
    dot-separated component, so version 1.6.6.9 is:

        ('1', '6', '6', '9')

    GitError is raised when the output cannot be parsed or when the
    version compares lower than 1.5.3.1.  Note that the comparison is
    between tuples of strings, not numbers.
    """
    global _ver
    if not _ver:
        proc = subprocess.Popen(['git', '--version'],
                                stdout=subprocess.PIPE)
        raw = proc.stdout.read()
        _git_wait('git --version', proc)
        found = re.match(r'git version (\S+.\S+)', raw)
        if not found:
            raise GitError('git --version weird output: %r' % raw)
        _ver = tuple(found.group(1).split('.'))
    minimum = ('1','5', '3', '1')
    if _ver < minimum:
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(minimum), '.'.join(_ver)))
    return _ver
1111
1112
1113 class _AbortableIter:
1114     def __init__(self, it, onabort = None):
1115         self.it = it
1116         self.onabort = onabort
1117         self.done = None
1118
1119     def __iter__(self):
1120         return self
1121
1122     def next(self):
1123         try:
1124             return next(self.it)
1125         except StopIteration as e:
1126             self.done = True
1127             raise
1128         except:
1129             self.abort()
1130             raise
1131
1132     def abort(self):
1133         """Abort iteration and call the abortion callback, if needed."""
1134         if not self.done:
1135             self.done = True
1136             if self.onabort:
1137                 self.onabort()
1138
1139     def __del__(self):
1140         self.abort()
1141
1142
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    A 'git cat-file --batch' child process is started on first use
    and restarted when it is found to have exited.  Object names are
    written to it one per line by get(); only one get() may be in
    progress at a time, tracked in self.inprogress.
    """
    def __init__(self, repo_dir = None):
        """Remember repo_dir; exit(1) if git is older than 1.5.6."""
        self.repo_dir = repo_dir
        wanted = ('1','5','6')
        if ver() < wanted:
            log('error: git version must be at least 1.5.6\n')
            sys.exit(1)
        self.p = self.inprogress = None

    def _abort(self):
        """Close the pipes of the current child, if any, and reset state."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        """Drop any current child and start a fresh cat-file process."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        ref must not contain a newline or carriage return and must not
        start with '-'.  Raises GitError when the header line returned
        by git is not of the form "<40-char id> <type> <size>"; the
        child process is dropped in that case so that the next call
        starts a new one.
        """
        if not self.p or self.p.poll() != None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result == None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        assert(ref.find('\n') < 0)
        assert(ref.find('\r') < 0)
        assert(not ref.startswith('-'))
        self.inprogress = ref
        self.p.stdin.write('%s\n' % ref)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(' ')
        if len(info) != 3 or len(info[0]) != 40:
            # The stream can no longer be trusted, so drop the child
            # (this also clears inprogress) before reporting.
            self._abort()
            raise GitError('expected object (id, type, size), got %r'
                           % (info,))
        oidx, typ, size = info
        size = int(size)
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self._abort)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object data is followed by one newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _join(self, it):
        """Yield the blob data reachable from the object streamed by 'it'.

        'it' is an iterator as returned by get().  Blobs are yielded
        chunk by chunk; trees and commits are read fully and then
        followed through join().  Any other type raises GitError.
        """
        _, typ, _ = next(it)
        if typ == 'blob':
            for blob in it:
                yield blob
        elif typ == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif typ == 'commit':
            # The first line of the commit names its tree.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1243
1244
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating it on first use.

    With no repo_dir, the global repodir (or repo() when that is
    unset) is used.  Pipes are cached in the module-level _cp
    dictionary, keyed by the absolute repository path.
    """
    key = os.path.abspath(repo_dir or repodir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = CatPipe(key)
        _cp[key] = pipe
    return pipe
1258
1259
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Names are taken from list_refs() limited to tags, with the leading
    'refs/tags/' removed.
    """
    by_commit = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(refname.startswith('refs/tags/'))
        # more than one tag can point at the same hash
        by_commit.setdefault(oid, []).append(refname[10:])
    return by_commit
1270
1271
class MissingObject(KeyError):
    """KeyError raised for an object id that could not be found.

    The id is kept as given in the 'oid' attribute; the message shows
    its hex encoding.
    """
    def __init__(self, oid):
        self.oid = oid
        message = 'object %r is missing' % oid.encode('hex')
        super(MissingObject, self).__init__(message)
1276
1277
WalkItem = namedtuple('WalkItem',
                      'oid type mode path chunk_path data')
# WalkItem.path is the mangled path.  When an item represents a
# fragment of a chunked file, chunk_path is the chunked subtree path
# for that chunk, e.g. ['', '2d3115e', ...]; the top-level path for a
# chunked file has a chunk_path of [''].  So some chunk subtree of the
# file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1290
1291
def walk_object(cat_pipe, oidx,
                stop_at=None,
                include_data=None):
    """Yield everything reachable from oidx via cat_pipe as a WalkItem,
    stopping whenever stop_at(oidx) returns true.  Throw MissingObject
    if a hash encountered is missing from the repository.

    oidx is a hex object id; each WalkItem carries the decoded
    (binary) id in its oid field.  The data field is only filled in
    when include_data is set.  Without it, entries whose mode marks
    them as regular files are reported as blobs without being
    fetched, and any other blob is fetched but its content is
    discarded.  Only blob, commit and tree objects are accepted; any
    other type raises Exception.
    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (hex id, path, chunk_path, mode); the starting
    # object has empty paths and no mode.
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = oidx.decode('hex')
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type='blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        # The first value from cat_pipe.get() is the (id, type, size)
        # header; a false id means the object was not found.
        item_it = cat_pipe.get(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(oidx.decode('hex'))
        if typ not in ('blob', 'commit', 'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == 'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full because their
            # content is needed below to find further objects.
            data = ''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == 'commit':
            # Queue the parents with the commit's own path and mode,
            # then the commit's tree.
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == 'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the file path
                    # stays fixed and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file.
                        sub_chunk_path = ['']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,
                                mode))