lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import
   7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   8 from collections import namedtuple
   9 from itertools import islice
  10 from numbers import Integral
  11
  12 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  13 from bup.compat import range
  14 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  15                          fdatasync,
  16                          hostname, localtime, log, merge_iter,
  17                          mmap_read, mmap_readwrite,
  18                          parse_num,
  19                          progress, qprogress, shstr, stat_if_exists,
  20                          unlink, username, userfullname,
  21                          utc_offset_str)
  22
  23 verbose = 0
  24 ignore_midx = 0
  25 repodir = None  # The default repository, once initialized
  26
  27 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  28 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  29
  30 _total_searches = 0
  31 _total_steps = 0
  32
  33
  34 class GitError(Exception):
  35     pass
  36
  37
  38 def _git_wait(cmd, p):
  39     rv = p.wait()
  40     if rv != 0:
  41         raise GitError('%s returned %d' % (shstr(cmd), rv))
  42
  43 def _git_capture(argv):
  44     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
  45     r = p.stdout.read()
  46     _git_wait(repr(argv), p)
  47     return r
  48
  49 def git_config_get(option, repo_dir=None):
  50     cmd = ('git', 'config', '--get', option)
  51     p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
  52                          preexec_fn=_gitenv(repo_dir=repo_dir))
  53     r = p.stdout.read()
  54     rc = p.wait()
  55     if rc == 0:
  56         return r
  57     if rc != 1:
  58         raise GitError('%s returned %d' % (cmd, rc))
  59     return None
  60
  61
  62 def parse_tz_offset(s):
  63     """UTC offset in seconds."""
  64     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  65     if s[0] == '-':
  66         return - tz_off
  67     return tz_off
  68
  69
  70 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
  71 # Make sure that's authoritative.
  72 _start_end_char = r'[^ .,:;<>"\'\0\n]'
  73 _content_char = r'[^\0\n<>]'
  74 _safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
  75     % (_start_end_char,
  76        _start_end_char, _content_char, _start_end_char)
  77 _tz_rx = r'[-+]\d\d[0-5]\d'
  78 _parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
  79 # Assumes every following line starting with a space is part of the
  80 # mergetag.  Is there a formal commit blob spec?
  81 _mergetag_rx = r'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
  82 _commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
  83 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
  84 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
  85
  86 (?P<message>(?:.|\n)*)''' % (_parent_rx,
  87                              _safe_str_rx, _safe_str_rx, _tz_rx,
  88                              _safe_str_rx, _safe_str_rx, _tz_rx,
  89                              _mergetag_rx))
  90 _parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  91
  92 # Note that the author_sec and committer_sec values are (UTC) epoch
  93 # seconds, and for now the mergetag is not included.
  94 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
  95                                        'author_name', 'author_mail',
  96                                        'author_sec', 'author_offset',
  97                                        'committer_name', 'committer_mail',
  98                                        'committer_sec', 'committer_offset',
  99                                        'message'])
 100
 101 def parse_commit(content):
 102     commit_match = re.match(_commit_rx, content)
 103     if not commit_match:
 104         raise Exception('cannot parse commit %r' % content)
 105     matches = commit_match.groupdict()
 106     return CommitInfo(tree=matches['tree'],
 107                       parents=re.findall(_parent_hash_rx, matches['parents']),
 108                       author_name=matches['author_name'],
 109                       author_mail=matches['author_mail'],
 110                       author_sec=int(matches['asec']),
 111                       author_offset=parse_tz_offset(matches['atz']),
 112                       committer_name=matches['committer_name'],
 113                       committer_mail=matches['committer_mail'],
 114                       committer_sec=int(matches['csec']),
 115                       committer_offset=parse_tz_offset(matches['ctz']),
 116                       message=matches['message'])
 117
 118
 119 def get_cat_data(cat_iterator, expected_type):
 120     _, kind, _ = next(cat_iterator)
 121     if kind != expected_type:
 122         raise Exception('expected %r, saw %r' % (expected_type, kind))
 123     return ''.join(cat_iterator)
 124
 125 def get_commit_items(id, cp):
 126     return parse_commit(get_cat_data(cp.get(id), 'commit'))
 127
 128 def _local_git_date_str(epoch_sec):
 129     return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 130
 131
 132 def _git_date_str(epoch_sec, tz_offset_sec):
 133     offs =  tz_offset_sec // 60
 134     return '%d %s%02d%02d' \
 135         % (epoch_sec,
 136            '+' if offs >= 0 else '-',
 137            abs(offs) // 60,
 138            abs(offs) % 60)
 139
 140
 141 def repo(sub = '', repo_dir=None):
 142     """Get the path to the git repository or one of its subdirectories."""
 143     repo_dir = repo_dir or repodir
 144     if not repo_dir:
 145         raise GitError('You should call check_repo_or_die()')
 146
 147     # If there's a .git subdirectory, then the actual repo is in there.
 148     gd = os.path.join(repo_dir, '.git')
 149     if os.path.exists(gd):
 150         repo_dir = gd
 151
 152     return os.path.join(repo_dir, sub)
 153
 154
 155 def shorten_hash(s):
 156     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 157                   r'\1\2*\3', s)
 158
 159
 160 def repo_rel(path):
 161     full = os.path.abspath(path)
 162     fullrepo = os.path.abspath(repo(''))
 163     if not fullrepo.endswith('/'):
 164         fullrepo += '/'
 165     if full.startswith(fullrepo):
 166         path = full[len(fullrepo):]
 167     if path.startswith('index-cache/'):
 168         path = path[len('index-cache/'):]
 169     return shorten_hash(path)
 170
 171
 172 def all_packdirs():
 173     paths = [repo('objects/pack')]
 174     paths += glob.glob(repo('index-cache/*/.'))
 175     return paths
 176
 177
 178 def auto_midx(objdir):
 179     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 180     try:
 181         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 182     except OSError as e:
 183         # make sure 'args' gets printed to help with debugging
 184         add_error('%r: exception: %s' % (args, e))
 185         raise
 186     if rv:
 187         add_error('%r: returned %d' % (args, rv))
 188
 189     args = [path.exe(), 'bloom', '--dir', objdir]
 190     try:
 191         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 192     except OSError as e:
 193         # make sure 'args' gets printed to help with debugging
 194         add_error('%r: exception: %s' % (args, e))
 195         raise
 196     if rv:
 197         add_error('%r: returned %d' % (args, rv))
 198
 199
 200 def mangle_name(name, mode, gitmode):
 201     """Mangle a file name to present an abstract name for segmented files.
 202     Mangled file names will have the ".bup" extension added to them. If a
 203     file's name already ends with ".bup", a ".bupl" extension is added to
 204     disambiguate normal files from segmented ones.
 205     """
 206     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 207         assert(stat.S_ISDIR(gitmode))
 208         return name + '.bup'
 209     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 210         return name + '.bupl'
 211     else:
 212         return name
 213
 214
 215 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 216 def demangle_name(name, mode):
 217     """Remove name mangling from a file name, if necessary.
 218
 219     The return value is a tuple (demangled_filename,mode), where mode is one of
 220     the following:
 221
 222     * BUP_NORMAL  : files that should be read as-is from the repository
 223     * BUP_CHUNKED : files that were chunked and need to be reassembled
 224
 225     For more information on the name mangling algorithm, see mangle_name()
 226     """
 227     if name.endswith('.bupl'):
 228         return (name[:-5], BUP_NORMAL)
 229     elif name.endswith('.bup'):
 230         return (name[:-4], BUP_CHUNKED)
 231     elif name.endswith('.bupm'):
 232         return (name[:-5],
 233                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 234     else:
 235         return (name, BUP_NORMAL)
 236
 237
 238 def calc_hash(type, content):
 239     """Calculate some content's hash in the Git fashion."""
 240     header = '%s %d\0' % (type, len(content))
 241     sum = Sha1(header)
 242     sum.update(content)
 243     return sum.digest()
 244
 245
 246 def shalist_item_sort_key(ent):
 247     (mode, name, id) = ent
 248     assert(mode+0 == mode)
 249     if stat.S_ISDIR(mode):
 250         return name + '/'
 251     else:
 252         return name
 253
 254
 255 def tree_encode(shalist):
 256     """Generate a git tree object from (mode,name,hash) tuples."""
 257     shalist = sorted(shalist, key = shalist_item_sort_key)
 258     l = []
 259     for (mode,name,bin) in shalist:
 260         assert(mode)
 261         assert(mode+0 == mode)
 262         assert(name)
 263         assert(len(bin) == 20)
 264         s = '%o %s\0%s' % (mode,name,bin)
 265         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 266         l.append(s)
 267     return ''.join(l)
 268
 269
 270 def tree_decode(buf):
 271     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 272     ofs = 0
 273     while ofs < len(buf):
 274         z = buf.find('\0', ofs)
 275         assert(z > ofs)
 276         spl = buf[ofs:z].split(' ', 1)
 277         assert(len(spl) == 2)
 278         mode,name = spl
 279         sha = buf[z+1:z+1+20]
 280         ofs = z+1+20
 281         yield (int(mode, 8), name, sha)
 282
 283
 284 def _encode_packobj(type, content, compression_level=1):
 285     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 286         raise ValueError('invalid compression level %s' % compression_level)
 287     szout = ''
 288     sz = len(content)
 289     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 290     sz >>= 4
 291     while 1:
 292         if sz: szbits |= 0x80
 293         szout += chr(szbits)
 294         if not sz:
 295             break
 296         szbits = sz & 0x7f
 297         sz >>= 7
 298     z = zlib.compressobj(compression_level)
 299     yield szout
 300     yield z.compress(content)
 301     yield z.flush()
 302
 303
 304 def _encode_looseobj(type, content, compression_level=1):
 305     z = zlib.compressobj(compression_level)
 306     yield z.compress('%s %d\0' % (type, len(content)))
 307     yield z.compress(content)
 308     yield z.flush()
 309
 310
 311 def _decode_looseobj(buf):
 312     assert(buf);
 313     s = zlib.decompress(buf)
 314     i = s.find('\0')
 315     assert(i > 0)
 316     l = s[:i].split(' ')
 317     type = l[0]
 318     sz = int(l[1])
 319     content = s[i+1:]
 320     assert(type in _typemap)
 321     assert(sz == len(content))
 322     return (type, content)
 323
 324
 325 def _decode_packobj(buf):
 326     assert(buf)
 327     c = ord(buf[0])
 328     type = _typermap[(c & 0x70) >> 4]
 329     sz = c & 0x0f
 330     shift = 4
 331     i = 0
 332     while c & 0x80:
 333         i += 1
 334         c = ord(buf[i])
 335         sz |= (c & 0x7f) << shift
 336         shift += 7
 337         if not (c & 0x80):
 338             break
 339     return (type, zlib.decompress(buf[i+1:]))
 340
 341
 342 class PackIdx:
 343     def __init__(self):
 344         assert(0)
 345
 346     def find_offset(self, hash):
 347         """Get the offset of an object inside the index file."""
 348         idx = self._idx_from_hash(hash)
 349         if idx != None:
 350             return self._ofs_from_idx(idx)
 351         return None
 352
 353     def exists(self, hash, want_source=False):
 354         """Return nonempty if the object exists in this index."""
 355         if hash and (self._idx_from_hash(hash) != None):
 356             return want_source and os.path.basename(self.name) or True
 357         return None
 358
 359     def __len__(self):
 360         return int(self.fanout[255])
 361
 362     def _idx_from_hash(self, hash):
 363         global _total_searches, _total_steps
 364         _total_searches += 1
 365         assert(len(hash) == 20)
 366         b1 = ord(hash[0])
 367         start = self.fanout[b1-1] # range -1..254
 368         end = self.fanout[b1] # range 0..255
 369         want = str(hash)
 370         _total_steps += 1  # lookup table is a step
 371         while start < end:
 372             _total_steps += 1
 373             mid = start + (end-start)/2
 374             v = self._idx_to_hash(mid)
 375             if v < want:
 376                 start = mid+1
 377             elif v > want:
 378                 end = mid
 379             else: # got it!
 380                 return mid
 381         return None
 382
 383
 384 class PackIdxV1(PackIdx):
 385     """Object representation of a Git pack index (version 1) file."""
 386     def __init__(self, filename, f):
 387         self.name = filename
 388         self.idxnames = [self.name]
 389         self.map = mmap_read(f)
 390         self.fanout = list(struct.unpack('!256I',
 391                                          str(buffer(self.map, 0, 256*4))))
 392         self.fanout.append(0)  # entry "-1"
 393         nsha = self.fanout[255]
 394         self.sha_ofs = 256*4
 395         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 396
 397     def _ofs_from_idx(self, idx):
 398         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 399
 400     def _idx_to_hash(self, idx):
 401         return str(self.shatable[idx*24+4 : idx*24+24])
 402
 403     def __iter__(self):
 404         for i in range(self.fanout[255]):
 405             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 406
 407
 408 class PackIdxV2(PackIdx):
 409     """Object representation of a Git pack index (version 2) file."""
 410     def __init__(self, filename, f):
 411         self.name = filename
 412         self.idxnames = [self.name]
 413         self.map = mmap_read(f)
 414         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 415         self.fanout = list(struct.unpack('!256I',
 416                                          str(buffer(self.map, 8, 256*4))))
 417         self.fanout.append(0)  # entry "-1"
 418         nsha = self.fanout[255]
 419         self.sha_ofs = 8 + 256*4
 420         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 421         self.ofstable = buffer(self.map,
 422                                self.sha_ofs + nsha*20 + nsha*4,
 423                                nsha*4)
 424         self.ofs64table = buffer(self.map,
 425                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 426
 427     def _ofs_from_idx(self, idx):
 428         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 429         if ofs & 0x80000000:
 430             idx64 = ofs & 0x7fffffff
 431             ofs = struct.unpack('!Q',
 432                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 433         return ofs
 434
 435     def _idx_to_hash(self, idx):
 436         return str(self.shatable[idx*20:(idx+1)*20])
 437
 438     def __iter__(self):
 439         for i in range(self.fanout[255]):
 440             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 441
 442
 443 _mpi_count = 0
 444 class PackIdxList:
 445     def __init__(self, dir):
 446         global _mpi_count
 447         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 448         _mpi_count += 1
 449         self.dir = dir
 450         self.also = set()
 451         self.packs = []
 452         self.do_bloom = False
 453         self.bloom = None
 454         self.refresh()
 455
 456     def __del__(self):
 457         global _mpi_count
 458         _mpi_count -= 1
 459         assert(_mpi_count == 0)
 460
 461     def __iter__(self):
 462         return iter(idxmerge(self.packs))
 463
 464     def __len__(self):
 465         return sum(len(pack) for pack in self.packs)
 466
 467     def exists(self, hash, want_source=False):
 468         """Return nonempty if the object exists in the index files."""
 469         global _total_searches
 470         _total_searches += 1
 471         if hash in self.also:
 472             return True
 473         if self.do_bloom and self.bloom:
 474             if self.bloom.exists(hash):
 475                 self.do_bloom = False
 476             else:
 477                 _total_searches -= 1  # was counted by bloom
 478                 return None
 479         for i in xrange(len(self.packs)):
 480             p = self.packs[i]
 481             _total_searches -= 1  # will be incremented by sub-pack
 482             ix = p.exists(hash, want_source=want_source)
 483             if ix:
 484                 # reorder so most recently used packs are searched first
 485                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 486                 return ix
 487         self.do_bloom = True
 488         return None
 489
 490     def refresh(self, skip_midx = False):
 491         """Refresh the index list.
 492         This method verifies if .midx files were superseded (e.g. all of its
 493         contents are in another, bigger .midx file) and removes the superseded
 494         files.
 495
 496         If skip_midx is True, all work on .midx files will be skipped and .midx
 497         files will be removed from the list.
 498
 499         The module-global variable 'ignore_midx' can force this function to
 500         always act as if skip_midx was True.
 501         """
 502         self.bloom = None # Always reopen the bloom as it may have been relaced
 503         self.do_bloom = False
 504         skip_midx = skip_midx or ignore_midx
 505         d = dict((p.name, p) for p in self.packs
 506                  if not skip_midx or not isinstance(p, midx.PackMidx))
 507         if os.path.exists(self.dir):
 508             if not skip_midx:
 509                 midxl = []
 510                 for ix in self.packs:
 511                     if isinstance(ix, midx.PackMidx):
 512                         for name in ix.idxnames:
 513                             d[os.path.join(self.dir, name)] = ix
 514                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 515                     if not d.get(full):
 516                         mx = midx.PackMidx(full)
 517                         (mxd, mxf) = os.path.split(mx.name)
 518                         broken = False
 519                         for n in mx.idxnames:
 520                             if not os.path.exists(os.path.join(mxd, n)):
 521                                 log(('warning: index %s missing\n' +
 522                                     '  used by %s\n') % (n, mxf))
 523                                 broken = True
 524                         if broken:
 525                             mx.close()
 526                             del mx
 527                             unlink(full)
 528                         else:
 529                             midxl.append(mx)
 530                 midxl.sort(key=lambda ix:
 531                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 532                 for ix in midxl:
 533                     any_needed = False
 534                     for sub in ix.idxnames:
 535                         found = d.get(os.path.join(self.dir, sub))
 536                         if not found or isinstance(found, PackIdx):
 537                             # doesn't exist, or exists but not in a midx
 538                             any_needed = True
 539                             break
 540                     if any_needed:
 541                         d[ix.name] = ix
 542                         for name in ix.idxnames:
 543                             d[os.path.join(self.dir, name)] = ix
 544                     elif not ix.force_keep:
 545                         debug1('midx: removing redundant: %s\n'
 546                                % os.path.basename(ix.name))
 547                         ix.close()
 548                         unlink(ix.name)
 549             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 550                 if not d.get(full):
 551                     try:
 552                         ix = open_idx(full)
 553                     except GitError as e:
 554                         add_error(e)
 555                         continue
 556                     d[full] = ix
 557             bfull = os.path.join(self.dir, 'bup.bloom')
 558             if self.bloom is None and os.path.exists(bfull):
 559                 self.bloom = bloom.ShaBloom(bfull)
 560             self.packs = list(set(d.values()))
 561             self.packs.sort(reverse=True, key=lambda x: len(x))
 562             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 563                 self.do_bloom = True
 564             else:
 565                 self.bloom = None
 566         debug1('PackIdxList: using %d index%s.\n'
 567             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 568
 569     def add(self, hash):
 570         """Insert an additional object in the list."""
 571         self.also.add(hash)
 572
 573
 574 def open_idx(filename):
 575     if filename.endswith('.idx'):
 576         f = open(filename, 'rb')
 577         header = f.read(8)
 578         if header[0:4] == '\377tOc':
 579             version = struct.unpack('!I', header[4:8])[0]
 580             if version == 2:
 581                 return PackIdxV2(filename, f)
 582             else:
 583                 raise GitError('%s: expected idx file version 2, got %d'
 584                                % (filename, version))
 585         elif len(header) == 8 and header[0:4] < '\377tOc':
 586             return PackIdxV1(filename, f)
 587         else:
 588             raise GitError('%s: unrecognized idx file header' % filename)
 589     elif filename.endswith('.midx'):
 590         return midx.PackMidx(filename)
 591     else:
 592         raise GitError('idx filenames must end with .idx or .midx')
 593
 594
 595 def idxmerge(idxlist, final_progress=True):
 596     """Generate a list of all the objects reachable in a PackIdxList."""
 597     def pfunc(count, total):
 598         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 599                   % (count*100.0/total, count, total))
 600     def pfinal(count, total):
 601         if final_progress:
 602             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 603                      % (100, total, total))
 604     return merge_iter(idxlist, 10024, pfunc, pfinal)
 605
 606
 607 def _make_objcache():
 608     return PackIdxList(repo('objects/pack'))
 609
 610 # bup-gc assumes that it can disable all PackWriter activities
 611 # (bloom/midx/cache) via the constructor and close() arguments.
 612
 613 class PackWriter:
 614     """Writes Git objects inside a pack file."""
 615     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 616                  run_midx=True, on_pack_finish=None,
 617                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 618         self.repo_dir = repo_dir or repo()
 619         self.file = None
 620         self.parentfd = None
 621         self.count = 0
 622         self.outbytes = 0
 623         self.filename = None
 624         self.idx = None
 625         self.objcache_maker = objcache_maker
 626         self.objcache = None
 627         self.compression_level = compression_level
 628         self.run_midx=run_midx
 629         self.on_pack_finish = on_pack_finish
 630         if not max_pack_size:
 631             max_pack_size = git_config_get('pack.packSizeLimit',
 632                                            repo_dir=self.repo_dir)
 633             if max_pack_size is not None:
 634                 max_pack_size = parse_num(max_pack_size)
 635             if not max_pack_size:
 636                 # larger packs slow down pruning
 637                 max_pack_size = 1000 * 1000 * 1000
 638         self.max_pack_size = max_pack_size
 639         # cache memory usage is about 83 bytes per object
 640         self.max_pack_objects = max_pack_objects if max_pack_objects \
 641                                 else max(1, self.max_pack_size // 5000)
 642
    def __del__(self):
        """Finish any pack that is still open when the writer is collected."""
        self.close()
 645
    def __enter__(self):
        """Enter a 'with' block; the writer itself is the context value."""
        return self
 648
    def __exit__(self, type, value, traceback):
        """Close the writer on leaving a 'with' block.

        Returns None, so an exception raised inside the block propagates.
        """
        self.close()
 651
 652     def _open(self):
 653         if not self.file:
 654             objdir = dir = os.path.join(self.repo_dir, 'objects')
 655             fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
 656             try:
 657                 self.file = os.fdopen(fd, 'w+b')
 658             except:
 659                 os.close(fd)
 660                 raise
 661             try:
 662                 self.parentfd = os.open(objdir, os.O_RDONLY)
 663             except:
 664                 f = self.file
 665                 self.file = None
 666                 f.close()
 667                 raise
 668             assert(name.endswith('.pack'))
 669             self.filename = name[:-5]
 670             self.file.write('PACK\0\0\0\2\0\0\0\0')
 671             self.idx = list(list() for i in xrange(256))
 672
 673     def _raw_write(self, datalist, sha):
 674         self._open()
 675         f = self.file
 676         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 677         # the file never has a *partial* blob.  So let's make sure it's
 678         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 679         # to our hashsplit algorithm.)  f.write() does its own buffering,
 680         # but that's okay because we'll flush it in _end().
 681         oneblob = ''.join(datalist)
 682         try:
 683             f.write(oneblob)
 684         except IOError as e:
 685             raise GitError, e, sys.exc_info()[2]
 686         nw = len(oneblob)
 687         crc = zlib.crc32(oneblob) & 0xffffffff
 688         self._update_idx(sha, crc, nw)
 689         self.outbytes += nw
 690         self.count += 1
 691         return nw, crc
 692
 693     def _update_idx(self, sha, crc, size):
 694         assert(sha)
 695         if self.idx:
 696             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 697
 698     def _write(self, sha, type, content):
 699         if verbose:
 700             log('>')
 701         if not sha:
 702             sha = calc_hash(type, content)
 703         size, crc = self._raw_write(_encode_packobj(type, content,
 704                                                     self.compression_level),
 705                                     sha=sha)
 706         if self.outbytes >= self.max_pack_size \
 707            or self.count >= self.max_pack_objects:
 708             self.breakpoint()
 709         return sha
 710
 711     def breakpoint(self):
 712         """Clear byte and object counts and return the last processed id."""
 713         id = self._end(self.run_midx)
 714         self.outbytes = self.count = 0
 715         return id
 716
 717     def _require_objcache(self):
 718         if self.objcache is None and self.objcache_maker:
 719             self.objcache = self.objcache_maker()
 720         if self.objcache is None:
 721             raise GitError(
 722                     "PackWriter not opened or can't check exists w/o objcache")
 723
 724     def exists(self, id, want_source=False):
 725         """Return non-empty if an object is found in the object cache."""
 726         self._require_objcache()
 727         return self.objcache.exists(id, want_source=want_source)
 728
 729     def just_write(self, sha, type, content):
 730         """Write an object to the pack file without checking for duplication."""
 731         self._write(sha, type, content)
 732         # If nothing else, gc doesn't have/want an objcache
 733         if self.objcache is not None:
 734             self.objcache.add(sha)
 735
 736     def maybe_write(self, type, content):
 737         """Write an object to the pack file if not present and return its id."""
 738         sha = calc_hash(type, content)
 739         if not self.exists(sha):
 740             self._require_objcache()
 741             self.just_write(sha, type, content)
 742         return sha
 743
    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content.

        Returns the id from maybe_write(); content that already exists
        is not written again.
        """
        return self.maybe_write('blob', blob)
 747
 748     def new_tree(self, shalist):
 749         """Create a tree object in the pack."""
 750         content = tree_encode(shalist)
 751         return self.maybe_write('tree', content)
 752
 753     def new_commit(self, tree, parent,
 754                    author, adate_sec, adate_tz,
 755                    committer, cdate_sec, cdate_tz,
 756                    msg):
 757         """Create a commit object in the pack.  The date_sec values must be
 758         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 759         if adate_tz:
 760             adate_str = _git_date_str(adate_sec, adate_tz)
 761         else:
 762             adate_str = _local_git_date_str(adate_sec)
 763         if cdate_tz:
 764             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 765         else:
 766             cdate_str = _local_git_date_str(cdate_sec)
 767         l = []
 768         if tree: l.append('tree %s' % tree.encode('hex'))
 769         if parent: l.append('parent %s' % parent.encode('hex'))
 770         if author: l.append('author %s %s' % (author, adate_str))
 771         if committer: l.append('committer %s %s' % (committer, cdate_str))
 772         l.append('')
 773         l.append(msg)
 774         return self.maybe_write('commit', '\n'.join(l))
 775
 776     def abort(self):
 777         """Remove the pack file from disk."""
 778         f = self.file
 779         if f:
 780             pfd = self.parentfd
 781             self.file = None
 782             self.parentfd = None
 783             self.idx = None
 784             try:
 785                 try:
 786                     os.unlink(self.filename + '.pack')
 787                 finally:
 788                     f.close()
 789             finally:
 790                 if pfd is not None:
 791                     os.close(pfd)
 792
    def _end(self, run_midx=True):
        """Finish the open pack and move it into objects/pack.

        Patches the object count into the pack header, appends the
        SHA-1 of the whole file, writes the version 2 index, renames
        both files to 'pack-<value returned by _write_pack_idx_v2>' and
        syncs the directory descriptor opened in _open().  Then runs
        auto_midx() when run_midx is true and calls on_pack_finish when
        one is set.

        Returns the final name prefix (path without extension), or
        None when no pack is open.
        """
        f = self.file
        if not f: return None
        self.file = None
        try:
            # Drop the cache and detach the index before finishing.
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            # The read loop above is expected to leave the position at
            # end of file, so the checksum lands after the last object.
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  'objects/pack/pack-' +  obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        # Sync the directory descriptor from _open(), then release it.
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, 'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix
 838
 839     def close(self, run_midx=True):
 840         """Close the pack file and move it to its definitive path."""
 841         return self._end(run_midx=run_midx)
 842
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write the index for the finished pack to filename and return the
        hex SHA-1 that names the pack.

        idx is iterated as sections of entries; packbin is the pack's
        SHA-1 digest, appended to the index ahead of the index's own
        checksum.  The returned name is the hex SHA-1 of the
        20*self.count bytes that follow the header and fan-out table.
        """
        # Count entries whose third field (presumably the pack offset --
        # confirm against the code that builds idx) does not fit in 31
        # bits; each of these needs an extra 8-byte slot.
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            # Size the file up front so it can be mapped and filled in
            # place by the C helper.
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, idx, self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        # Reopen in append mode: writes land at the end of the file while
        # reads can still start from the beginning.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            # header + fan-out table
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            # The next 20 bytes per object feed both the whole-index
            # checksum and the sum that becomes the pack's name.
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            # Everything that remains (including packbin written above)
            # goes into the index checksum, which is then appended.
            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()
 888
 889
 890 def _gitenv(repo_dir = None):
 891     if not repo_dir:
 892         repo_dir = repo()
 893     def env():
 894         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 895     return env
 896
 897
 898 def list_refs(patterns=None, repo_dir=None,
 899               limit_to_heads=False, limit_to_tags=False):
 900     """Yield (refname, hash) tuples for all repository refs unless
 901     patterns are specified.  In that case, only include tuples for
 902     refs matching those patterns (cf. git-show-ref(1)).  The limits
 903     restrict the result items to refs/heads or refs/tags.  If both
 904     limits are specified, items from both sources will be included.
 905
 906     """
 907     argv = ['git', 'show-ref']
 908     if limit_to_heads:
 909         argv.append('--heads')
 910     if limit_to_tags:
 911         argv.append('--tags')
 912     argv.append('--')
 913     if patterns:
 914         argv.extend(patterns)
 915     p = subprocess.Popen(argv,
 916                          preexec_fn = _gitenv(repo_dir),
 917                          stdout = subprocess.PIPE)
 918     out = p.stdout.read().strip()
 919     rv = p.wait()  # not fatal
 920     if rv:
 921         assert(not out)
 922     if out:
 923         for d in out.split('\n'):
 924             (sha, name) = d.split(' ', 1)
 925             yield (name, sha.decode('hex'))
 926
 927
 928 def read_ref(refname, repo_dir = None):
 929     """Get the commit id of the most recent commit made on a given ref."""
 930     refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
 931     l = tuple(islice(refs, 2))
 932     if l:
 933         assert(len(l) == 1)
 934         return l[0][1]
 935     else:
 936         return None
 937
 938
 939 def rev_list_invocation(ref_or_refs, count=None, format=None):
 940     if isinstance(ref_or_refs, compat.str_type):
 941         refs = (ref_or_refs,)
 942     else:
 943         refs = ref_or_refs
 944     argv = ['git', 'rev-list']
 945     if isinstance(count, Integral):
 946         argv.extend(['-n', str(count)])
 947     elif count:
 948         raise ValueError('unexpected count argument %r' % count)
 949
 950     if format:
 951         argv.append('--pretty=format:' + format)
 952     for ref in refs:
 953         assert not ref.startswith('-')
 954         argv.append(ref)
 955     argv.append('--')
 956     return argv
 957
 958
 959 def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
 960     """Yield information about commits as per "git rev-list".  If a format
 961     is not provided, yield one hex hash at a time.  If a format is
 962     provided, pass it to rev-list and call parse(git_stdout) for each
 963     commit with the stream positioned just after the rev-list "commit
 964     HASH" header line.  When a format is provided yield (oidx,
 965     parse(git_stdout)) for each commit.
 966
 967     """
 968     assert bool(parse) == bool(format)
 969     p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
 970                                              format=format),
 971                          preexec_fn = _gitenv(repo_dir),
 972                          stdout = subprocess.PIPE)
 973     if not format:
 974         for line in p.stdout:
 975             yield line.strip()
 976     else:
 977         line = p.stdout.readline()
 978         while line:
 979             s = line.strip()
 980             if not s.startswith('commit '):
 981                 raise Exception('unexpected line ' + s)
 982             s = s[7:]
 983             assert len(s) == 40
 984             yield s, parse(p.stdout)
 985             line = p.stdout.readline()
 986
 987     rv = p.wait()  # not fatal
 988     if rv:
 989         raise GitError, 'git rev-list returned error %d' % rv
 990
 991
 992 def get_commit_dates(refs, repo_dir=None):
 993     """Get the dates for the specified commit refs.  For now, every unique
 994        string in refs must resolve to a different commit or this
 995        function will fail."""
 996     result = []
 997     for ref in refs:
 998         commit = get_commit_items(ref, cp(repo_dir))
 999         result.append(commit.author_sec)
1000     return result
1001
1002
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    committish is first looked up as a ref via read_ref(); failing
    that, a 40 character hex string is accepted if the object exists
    in one of the repository's pack indexes.

    Returns the binary (not hex) value of the hash if it is found, None
    if 'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
        return head

    # Anything that is not 40 hex digits cannot name an object directly,
    # so bail out before paying for a scan of the pack indexes.
    if len(committish) != 40:
        return None
    try:
        oid = committish.decode('hex')
    except TypeError:
        return None

    pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
    if pL.exists(oid):
        return oid

    return None
1028
1029
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname at newval by running "git update-ref", passing
    oldval (or an empty string when oldval is false) as the expected
    old value.  newval and oldval are hex-encoded here.  refname must
    live under refs/heads/ or refs/tags/."""
    expected_old = oldval if oldval else ''
    assert(refname.startswith(('refs/heads/', 'refs/tags/')))
    argv = ['git', 'update-ref', refname,
            newval.encode('hex'), expected_old.encode('hex')]
    proc = subprocess.Popen(argv, preexec_fn=_gitenv(repo_dir))
    _git_wait('git update-ref', proc)
1040
1041
def delete_ref(refname, oldvalue=None):
    """Remove refname from the default repository with "git update-ref
    -d" (see git update-ref(1)); oldvalue, when given, is passed along
    as an extra argument.  refname must start with refs/."""
    assert(refname.startswith('refs/'))
    argv = ['git', 'update-ref', '-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv, preexec_fn=_gitenv())
    _git_wait('git update-ref', proc)
1049
1050
def guess_repo(path=None):
    """Decide which repository path to use and store it in the global
    "repodir", without checking that a repository exists there.

    A true path argument always wins.  Otherwise an already set repodir
    is kept; failing that the BUP_DIR environment variable is used, and
    finally ~/.bup.  Code that needs an existing repository would
    normally call check_repo_or_die() instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = os.environ.get('BUP_DIR') or os.path.expanduser('~/.bup')
1065
1066
def init_repo(path=None):
    """Create the bare Git repository that bup uses, at path or at the
    location chosen by guess_repo().  Raises GitError if the parent
    directory is missing or the target exists but is not a directory."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    steps = (
        ('git init', ['git', '--bare', 'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        ('git config', ['git', 'config', 'pack.indexVersion', '2']),
        # Enable the reflog
        ('git config', ['git', 'config', 'core.logAllRefUpdates', 'true']),
    )
    for label, argv in steps:
        proc = subprocess.Popen(argv, stdout=sys.stderr,
                                preexec_fn=_gitenv())
        _git_wait(label, proc)
1088
1089
def check_repo_or_die(path=None):
    """Return quietly if the repository has an objects/pack directory;
    otherwise log an error and exit: status 15 when the repository
    path itself does not exist, status 14 when it exists but does not
    look like a repository."""
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + '/objects/pack')
    if pack_st and stat.S_ISDIR(pack_st.st_mode):
        return
    if not pack_st and not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %r is not a repository\n' % top)
    sys.exit(14)
1105
1106
_ver = None

def _ver_key(version):
    """Turn a tuple of version strings into a tuple of ints for ordering.

    Only the leading digits of each component count, so ('1', '10',
    '2rc1') becomes (1, 10, 2); a component without leading digits
    counts as 0.
    """
    key = []
    for part in version:
        digits = re.match(r'\d*', part).group(0)
        key.append(int(digits) if digits else 0)
    return tuple(key)

def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')

    The result of running "git --version" is cached in the module
    level _ver.  Raises GitError if the output cannot be parsed or if
    the version is older than 1.5.3.1.
    """
    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        # The dot is escaped so that the match cannot run across
        # whitespace into trailing text such as "(Apple Git-143)".
        m = re.match(r'git version (\S+\.\S+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    # Compare numerically; as strings '10' would sort before '5'.
    if _ver_key(_ver) < _ver_key(needed):
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver
1132
1133
1134 class _AbortableIter:
1135     def __init__(self, it, onabort = None):
1136         self.it = it
1137         self.onabort = onabort
1138         self.done = None
1139
1140     def __iter__(self):
1141         return self
1142
1143     def next(self):
1144         try:
1145             return next(self.it)
1146         except StopIteration as e:
1147             self.done = True
1148             raise
1149         except:
1150             self.abort()
1151             raise
1152
1153     def abort(self):
1154         """Abort iteration and call the abortion callback, if needed."""
1155         if not self.done:
1156             self.done = True
1157             if self.onabort:
1158                 self.onabort()
1159
1160     def __del__(self):
1161         self.abort()
1162
1163
# NOTE(review): _ver_warned is declared global in CatPipe.__init__ but is
# not otherwise read or written in this part of the file.
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    A single 'git cat-file --batch' child process is started lazily by
    get() and reused for later requests; only one request may be in
    progress at a time (tracked in self.inprogress).
    """
    def __init__(self, repo_dir = None):
        """Remember repo_dir and check the git version; exits the program
        with status 1 if ver() compares lower than ('1','5','6').  No
        child process is started here."""
        global _ver_warned
        self.repo_dir = repo_dir
        wanted = ('1','5','6')
        # Both sides are tuples of strings, so this is a string-wise
        # comparison of the version components.
        if ver() < wanted:
            log('error: git version must be at least 1.5.6\n')
            sys.exit(1)
        self.p = self.inprogress = None

    def _abort(self):
        """Close both pipes to the child, if there is one, and forget it
        along with any request in progress.  The child is not waited
        for here."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        """Drop any existing child and start a fresh 'git cat-file --batch'
        process for self.repo_dir."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        ref must not contain newlines or carriage returns or start with
        '-'.  Raises GitError if the header line from git does not have
        the expected three fields.  This is a generator: nothing is
        sent to git until the first item is requested.
        """
        # (Re)start the child if there is none or it has exited.
        if not self.p or self.p.poll() != None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result == None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # The request is a single line on the child's stdin, so the ref
        # must not contain line breaks or look like an option.
        assert(ref.find('\n') < 0)
        assert(ref.find('\r') < 0)
        assert(not ref.startswith('-'))
        self.inprogress = ref
        self.p.stdin.write('%s\n' % ref)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        # Otherwise the header should be "<40 hex chars> <type> <size>".
        info = hdr.split(' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        # If reading the data fails or is abandoned part way, _abort()
        # discards the child, whose output stream would otherwise be
        # left positioned in the middle of an object.
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self._abort)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception as e:
            it.abort()
            raise

    def _join(self, it):
        """Yield the blob data reachable from the object produced by 'it'
        (an iterator as returned by get()): the data itself for a blob,
        every entry of a tree in turn, or the tree named on the first
        line of a commit.  Raises GitError for any other type."""
        _, typ, _ = next(it)
        if typ == 'blob':
            for blob in it:
                yield blob
        elif typ == 'tree':
            # The whole tree is read before recursing, so the current
            # get() request has finished before join() issues the next.
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif typ == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            # NOTE(review): presumably a leftover debugging trap for a
            # StopIteration escaping from _join()/get() -- confirm
            # whether it can still trigger.
            log('booger!\n')
1264
1265
# CatPipe instances created by cp(), keyed by absolute repository path.
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating and caching one on first
    use.  Without a repo_dir, the global repodir is used, or repo()
    when that is unset."""
    key = os.path.abspath(repo_dir or repodir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1279
1280
def tags(repo_dir = None):
    """Build and return a dictionary mapping each tagged hash to the list
    of tag names (without the refs/tags/ prefix) that point at it."""
    prefix = 'refs/tags/'
    by_hash = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(refname.startswith(prefix))
        # setdefault, since more than one tag can point at the same hash
        by_hash.setdefault(oid, []).append(refname[len(prefix):])
    return by_hash
1291
1292
class MissingObject(KeyError):
    """KeyError raised for an object id that is not in the repository.
    The id is kept in the oid attribute, and its hex encoding appears
    in the message."""
    def __init__(self, oid):
        self.oid = oid
        message = 'object %r is missing' % oid.encode('hex')
        super(MissingObject, self).__init__(message)
1297
1298
# One item produced while walking the object graph.
#
# path holds the mangled path components.  When an item is a fragment
# of a chunked file, chunk_path holds the path within that file's
# chunk subtree, e.g. ['', '2d3115e', ...]; the top-level item for a
# chunked file has a chunk_path of [''].  So some chunk subtree of the
# file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
WalkItem = namedtuple('WalkItem',
                      ('oid', 'type', 'mode', 'path', 'chunk_path', 'data'))
1311
1312
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    oidx is a hex object id.  An object for which stop_at() is true is
    neither yielded nor descended into.  An Exception is raised for
    any object whose type is not blob, commit or tree.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (hex id, path so far, chunk path so far, mode); the
    # starting object has no known mode.
    pending = [(oidx, [], [], None)]
    while len(pending):
        # pop() takes the most recently pushed entry, so the walk is
        # depth first.
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = oidx.decode('hex')
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type='blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        # The first item from get_ref() is the (oidx, type, size) header;
        # a false oidx there means the object does not exist.
        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(oidx.decode('hex'))
        if typ not in ('blob', 'commit', 'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == 'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full, since their
            # content is needed below to find what they refer to.
            data = ''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == 'commit':
            commit_items = parse_commit(data)
            # Parents inherit the commit's own path, chunk path and mode.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            # The tree is pushed last, so it is visited before the parents.
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == 'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the file's path stays
                    # fixed and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Entering a chunked file: start its chunk path.
                        sub_chunk_path = ['']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,
                                mode))