lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12 from numbers import Integral
  13
  14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  15 from bup.compat import (buffer,
  16                         byte_int, bytes_from_byte, bytes_from_uint,
  17                         environ,
  18                         items,
  19                         range,
  20                         reraise)
  21 from bup.io import path_msg
  22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  23                          fdatasync,
  24                          hostname, localtime, log,
  25                          merge_dict,
  26                          merge_iter,
  27                          mmap_read, mmap_readwrite,
  28                          parse_num,
  29                          progress, qprogress, stat_if_exists,
  30                          unlink,
  31                          utc_offset_str)
  32 from bup.pwdgrp import username, userfullname
  33
  34
# Verbosity flag; when nonzero, PackWriter._write logs a '>' per object.
verbose = 0
repodir = None  # The default repository, once initialized

# Git object type names mapped to the numeric codes stored in pack object
# headers, and the reverse mapping used when decoding them.
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}


# Counters incremented by the pack index lookups in this module.
_total_searches = 0
_total_steps = 0
  44
  45
  46 class GitError(Exception):
  47     pass
  48
  49
  50 def _gitenv(repo_dir=None):
  51     if not repo_dir:
  52         repo_dir = repo()
  53     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  54
  55 def _git_wait(cmd, p):
  56     rv = p.wait()
  57     if rv != 0:
  58         raise GitError('%r returned %d' % (cmd, rv))
  59
  60 def _git_capture(argv):
  61     p = subprocess.Popen(argv, stdout=subprocess.PIPE, env=_gitenv())
  62     r = p.stdout.read()
  63     _git_wait(argv, p)
  64     return r
  65
  66 def git_config_get(option, repo_dir=None):
  67     cmd = (b'git', b'config', b'--get', option)
  68     p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
  69                          env=_gitenv(repo_dir=repo_dir))
  70     r = p.stdout.read()
  71     rc = p.wait()
  72     if rc == 0:
  73         return r
  74     if rc != 1:
  75         raise GitError('%r returned %d' % (cmd, rc))
  76     return None
  77
  78
  79 def parse_tz_offset(s):
  80     """UTC offset in seconds."""
  81     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  82     if bytes_from_byte(s[0]) == b'-':
  83         return - tz_off
  84     return tz_off
  85
  86
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
# Building blocks for the commit pattern below.  A "safe string" (used for
# the name and mail fields) is either one or two start/end characters, or
# a start/end character, any run of content characters, and a closing
# start/end character.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# A sign followed by four digits (HHMM), with the minutes tens digit 0-5.
_tz_rx = br'[-+]\d\d[0-5]\d'
# One "parent <40 hex digits>" line, including its newline.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Whole commit object: tree line, zero or more parent lines, author line,
# committer line, optional mergetag, a blank line, then the message.
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Pulls the hex id out of each parent line captured by the 'parents' group.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 108
 109 # Note that the author_sec and committer_sec values are (UTC) epoch
 110 # seconds, and for now the mergetag is not included.
 111 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 112                                        'author_name', 'author_mail',
 113                                        'author_sec', 'author_offset',
 114                                        'committer_name', 'committer_mail',
 115                                        'committer_sec', 'committer_offset',
 116                                        'message'])
 117
 118 def parse_commit(content):
 119     commit_match = re.match(_commit_rx, content)
 120     if not commit_match:
 121         raise Exception('cannot parse commit %r' % content)
 122     matches = commit_match.groupdict()
 123     return CommitInfo(tree=matches['tree'],
 124                       parents=re.findall(_parent_hash_rx, matches['parents']),
 125                       author_name=matches['author_name'],
 126                       author_mail=matches['author_mail'],
 127                       author_sec=int(matches['asec']),
 128                       author_offset=parse_tz_offset(matches['atz']),
 129                       committer_name=matches['committer_name'],
 130                       committer_mail=matches['committer_mail'],
 131                       committer_sec=int(matches['csec']),
 132                       committer_offset=parse_tz_offset(matches['ctz']),
 133                       message=matches['message'])
 134
 135
 136 def get_cat_data(cat_iterator, expected_type):
 137     _, kind, _ = next(cat_iterator)
 138     if kind != expected_type:
 139         raise Exception('expected %r, saw %r' % (expected_type, kind))
 140     return b''.join(cat_iterator)
 141
 142 def get_commit_items(id, cp):
 143     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 144
 145 def _local_git_date_str(epoch_sec):
 146     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 147
 148
 149 def _git_date_str(epoch_sec, tz_offset_sec):
 150     offs =  tz_offset_sec // 60
 151     return b'%d %s%02d%02d' \
 152         % (epoch_sec,
 153            b'+' if offs >= 0 else b'-',
 154            abs(offs) // 60,
 155            abs(offs) % 60)
 156
 157
 158 def repo(sub = b'', repo_dir=None):
 159     """Get the path to the git repository or one of its subdirectories."""
 160     repo_dir = repo_dir or repodir
 161     if not repo_dir:
 162         raise GitError('You should call check_repo_or_die()')
 163
 164     # If there's a .git subdirectory, then the actual repo is in there.
 165     gd = os.path.join(repo_dir, b'.git')
 166     if os.path.exists(gd):
 167         repo_dir = gd
 168
 169     return os.path.join(repo_dir, sub)
 170
 171
 172 _shorten_hash_rx = \
 173     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 174
 175 def shorten_hash(s):
 176     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 177
 178
 179 def repo_rel(path):
 180     full = os.path.abspath(path)
 181     fullrepo = os.path.abspath(repo(b''))
 182     if not fullrepo.endswith(b'/'):
 183         fullrepo += b'/'
 184     if full.startswith(fullrepo):
 185         path = full[len(fullrepo):]
 186     if path.startswith(b'index-cache/'):
 187         path = path[len(b'index-cache/'):]
 188     return shorten_hash(path)
 189
 190
 191 def all_packdirs():
 192     paths = [repo(b'objects/pack')]
 193     paths += glob.glob(repo(b'index-cache/*/.'))
 194     return paths
 195
 196
 197 def auto_midx(objdir):
 198     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 199     try:
 200         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 201     except OSError as e:
 202         # make sure 'args' gets printed to help with debugging
 203         add_error('%r: exception: %s' % (args, e))
 204         raise
 205     if rv:
 206         add_error('%r: returned %d' % (args, rv))
 207
 208     args = [path.exe(), b'bloom', b'--dir', objdir]
 209     try:
 210         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 211     except OSError as e:
 212         # make sure 'args' gets printed to help with debugging
 213         add_error('%r: exception: %s' % (args, e))
 214         raise
 215     if rv:
 216         add_error('%r: returned %d' % (args, rv))
 217
 218
 219 def mangle_name(name, mode, gitmode):
 220     """Mangle a file name to present an abstract name for segmented files.
 221     Mangled file names will have the ".bup" extension added to them. If a
 222     file's name already ends with ".bup", a ".bupl" extension is added to
 223     disambiguate normal files from segmented ones.
 224     """
 225     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 226         assert(stat.S_ISDIR(gitmode))
 227         return name + b'.bup'
 228     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 229         return name + b'.bupl'
 230     else:
 231         return name
 232
 233
 234 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 235 def demangle_name(name, mode):
 236     """Remove name mangling from a file name, if necessary.
 237
 238     The return value is a tuple (demangled_filename,mode), where mode is one of
 239     the following:
 240
 241     * BUP_NORMAL  : files that should be read as-is from the repository
 242     * BUP_CHUNKED : files that were chunked and need to be reassembled
 243
 244     For more information on the name mangling algorithm, see mangle_name()
 245     """
 246     if name.endswith(b'.bupl'):
 247         return (name[:-5], BUP_NORMAL)
 248     elif name.endswith(b'.bup'):
 249         return (name[:-4], BUP_CHUNKED)
 250     elif name.endswith(b'.bupm'):
 251         return (name[:-5],
 252                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 253     else:
 254         return (name, BUP_NORMAL)
 255
 256
 257 def calc_hash(type, content):
 258     """Calculate some content's hash in the Git fashion."""
 259     header = b'%s %d\0' % (type, len(content))
 260     sum = Sha1(header)
 261     sum.update(content)
 262     return sum.digest()
 263
 264
 265 def shalist_item_sort_key(ent):
 266     (mode, name, id) = ent
 267     assert(mode+0 == mode)
 268     if stat.S_ISDIR(mode):
 269         return name + b'/'
 270     else:
 271         return name
 272
 273
 274 def tree_encode(shalist):
 275     """Generate a git tree object from (mode,name,hash) tuples."""
 276     shalist = sorted(shalist, key = shalist_item_sort_key)
 277     l = []
 278     for (mode,name,bin) in shalist:
 279         assert(mode)
 280         assert(mode+0 == mode)
 281         assert(name)
 282         assert(len(bin) == 20)
 283         s = b'%o %s\0%s' % (mode,name,bin)
 284         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 285         l.append(s)
 286     return b''.join(l)
 287
 288
 289 def tree_decode(buf):
 290     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 291     ofs = 0
 292     while ofs < len(buf):
 293         z = buf.find(b'\0', ofs)
 294         assert(z > ofs)
 295         spl = buf[ofs:z].split(b' ', 1)
 296         assert(len(spl) == 2)
 297         mode,name = spl
 298         sha = buf[z+1:z+1+20]
 299         ofs = z+1+20
 300         yield (int(mode, 8), name, sha)
 301
 302
 303 def _encode_packobj(type, content, compression_level=1):
 304     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 305         raise ValueError('invalid compression level %s' % compression_level)
 306     szout = b''
 307     sz = len(content)
 308     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 309     sz >>= 4
 310     while 1:
 311         if sz: szbits |= 0x80
 312         szout += bytes_from_uint(szbits)
 313         if not sz:
 314             break
 315         szbits = sz & 0x7f
 316         sz >>= 7
 317     z = zlib.compressobj(compression_level)
 318     yield szout
 319     yield z.compress(content)
 320     yield z.flush()
 321
 322
 323 def _encode_looseobj(type, content, compression_level=1):
 324     z = zlib.compressobj(compression_level)
 325     yield z.compress(b'%s %d\0' % (type, len(content)))
 326     yield z.compress(content)
 327     yield z.flush()
 328
 329
 330 def _decode_looseobj(buf):
 331     assert(buf);
 332     s = zlib.decompress(buf)
 333     i = s.find(b'\0')
 334     assert(i > 0)
 335     l = s[:i].split(b' ')
 336     type = l[0]
 337     sz = int(l[1])
 338     content = s[i+1:]
 339     assert(type in _typemap)
 340     assert(sz == len(content))
 341     return (type, content)
 342
 343
 344 def _decode_packobj(buf):
 345     assert(buf)
 346     c = byte_int(buf[0])
 347     type = _typermap[(c & 0x70) >> 4]
 348     sz = c & 0x0f
 349     shift = 4
 350     i = 0
 351     while c & 0x80:
 352         i += 1
 353         c = byte_int(buf[i])
 354         sz |= (c & 0x7f) << shift
 355         shift += 7
 356         if not (c & 0x80):
 357             break
 358     return (type, zlib.decompress(buf[i+1:]))
 359
 360
 361 class PackIdx:
 362     def __init__(self):
 363         assert(0)
 364
 365     def find_offset(self, hash):
 366         """Get the offset of an object inside the index file."""
 367         idx = self._idx_from_hash(hash)
 368         if idx != None:
 369             return self._ofs_from_idx(idx)
 370         return None
 371
 372     def exists(self, hash, want_source=False):
 373         """Return nonempty if the object exists in this index."""
 374         if hash and (self._idx_from_hash(hash) != None):
 375             return want_source and os.path.basename(self.name) or True
 376         return None
 377
 378     def _idx_from_hash(self, hash):
 379         global _total_searches, _total_steps
 380         _total_searches += 1
 381         assert(len(hash) == 20)
 382         b1 = byte_int(hash[0])
 383         start = self.fanout[b1-1] # range -1..254
 384         end = self.fanout[b1] # range 0..255
 385         want = hash
 386         _total_steps += 1  # lookup table is a step
 387         while start < end:
 388             _total_steps += 1
 389             mid = start + (end - start) // 2
 390             v = self._idx_to_hash(mid)
 391             if v < want:
 392                 start = mid+1
 393             elif v > want:
 394                 end = mid
 395             else: # got it!
 396                 return mid
 397         return None
 398
 399
 400 class PackIdxV1(PackIdx):
 401     """Object representation of a Git pack index (version 1) file."""
 402     def __init__(self, filename, f):
 403         self.name = filename
 404         self.idxnames = [self.name]
 405         self.map = mmap_read(f)
 406         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 407         self.fanout = array('L', struct.unpack('!256I', self.map))
 408         self.fanout.append(0)  # entry "-1"
 409         self.nsha = self.fanout[255]
 410         self.sha_ofs = 256 * 4
 411         # Avoid slicing shatable for individual hashes (very high overhead)
 412         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 413
 414     def __len__(self):
 415         return int(self.nsha)  # int() from long for python 2
 416
 417     def _ofs_from_idx(self, idx):
 418         if idx >= self.nsha or idx < 0:
 419             raise IndexError('invalid pack index index %d' % idx)
 420         ofs = self.sha_ofs + idx * 24
 421         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 422
 423     def _idx_to_hash(self, idx):
 424         if idx >= self.nsha or idx < 0:
 425             raise IndexError('invalid pack index index %d' % idx)
 426         ofs = self.sha_ofs + idx * 24 + 4
 427         return self.map[ofs : ofs + 20]
 428
 429     def __iter__(self):
 430         start = self.sha_ofs + 4
 431         for ofs in range(start, start + 24 * self.nsha, 24):
 432             yield self.map[ofs : ofs + 20]
 433
 434
 435 class PackIdxV2(PackIdx):
 436     """Object representation of a Git pack index (version 2) file."""
 437     def __init__(self, filename, f):
 438         self.name = filename
 439         self.idxnames = [self.name]
 440         self.map = mmap_read(f)
 441         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 442         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 443         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 444         self.fanout.append(0)
 445         self.nsha = self.fanout[255]
 446         self.sha_ofs = 8 + 256*4
 447         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 448         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 449         # Avoid slicing this for individual hashes (very high overhead)
 450         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 451
 452     def __len__(self):
 453         return int(self.nsha)  # int() from long for python 2
 454
 455     def _ofs_from_idx(self, idx):
 456         if idx >= self.nsha or idx < 0:
 457             raise IndexError('invalid pack index index %d' % idx)
 458         ofs_ofs = self.ofstable_ofs + idx * 4
 459         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 460         if ofs & 0x80000000:
 461             idx64 = ofs & 0x7fffffff
 462             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 463             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 464         return ofs
 465
 466     def _idx_to_hash(self, idx):
 467         if idx >= self.nsha or idx < 0:
 468             raise IndexError('invalid pack index index %d' % idx)
 469         ofs = self.sha_ofs + idx * 20
 470         return self.map[ofs : ofs + 20]
 471
 472     def __iter__(self):
 473         start = self.sha_ofs
 474         for ofs in range(start, start + 20 * self.nsha, 20):
 475             yield self.map[ofs : ofs + 20]
 476
 477
# Number of live PackIdxList instances; the constructor asserts it is 0.
_mpi_count = 0
class PackIdxList:
    """Searches all pack indexes (.idx and .midx) of one directory as a
    single collection, consulting a bup.bloom filter first when one is
    present and usable."""
    def __init__(self, dir, ignore_midx=False):
        """Load the indexes found in dir via refresh().

        ignore_midx makes every refresh() behave as if skip_midx was True.
        """
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        # Hashes registered through add(); exists() reports them as found.
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Possible hit: search the packs, and skip the bloom
                # filter until a lookup misses again.
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        # Maps index path -> index object for everything already loaded.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # Record which .idx files the loaded midxes already cover.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in glob.glob(os.path.join(self.dir,b'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            # A midx that refers to a missing index is
                            # deleted rather than loaded.
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Largest midx first; ties broken by most recent mtime.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            # Open every .idx that is neither loaded nor covered by a midx.
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # Several keys of d may map to the same midx; keep each once.
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            # Use the bloom filter only if it is valid and holds at least
            # as many entries as the indexes do.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 609
 610
 611 def open_idx(filename):
 612     if filename.endswith(b'.idx'):
 613         f = open(filename, 'rb')
 614         header = f.read(8)
 615         if header[0:4] == b'\377tOc':
 616             version = struct.unpack('!I', header[4:8])[0]
 617             if version == 2:
 618                 return PackIdxV2(filename, f)
 619             else:
 620                 raise GitError('%s: expected idx file version 2, got %d'
 621                                % (path_msg(filename), version))
 622         elif len(header) == 8 and header[0:4] < b'\377tOc':
 623             return PackIdxV1(filename, f)
 624         else:
 625             raise GitError('%s: unrecognized idx file header'
 626                            % path_msg(filename))
 627     elif filename.endswith(b'.midx'):
 628         return midx.PackMidx(filename)
 629     else:
 630         raise GitError('idx filenames must end with .idx or .midx')
 631
 632
 633 def idxmerge(idxlist, final_progress=True):
 634     """Generate a list of all the objects reachable in a PackIdxList."""
 635     def pfunc(count, total):
 636         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 637                   % (count*100.0/total, count, total))
 638     def pfinal(count, total):
 639         if final_progress:
 640             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 641                      % (100, total, total))
 642     return merge_iter(idxlist, 10024, pfunc, pfinal)
 643
 644
 645 def _make_objcache():
 646     return PackIdxList(repo(b'objects/pack'))
 647
 648 # bup-gc assumes that it can disable all PackWriter activities
 649 # (bloom/midx/cache) via the constructor and close() arguments.
 650
 651 class PackWriter:
 652     """Writes Git objects inside a pack file."""
 653     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 654                  run_midx=True, on_pack_finish=None,
 655                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 656         self.repo_dir = repo_dir or repo()
 657         self.file = None
 658         self.parentfd = None
 659         self.count = 0
 660         self.outbytes = 0
 661         self.filename = None
 662         self.idx = None
 663         self.objcache_maker = objcache_maker
 664         self.objcache = None
 665         self.compression_level = compression_level
 666         self.run_midx=run_midx
 667         self.on_pack_finish = on_pack_finish
 668         if not max_pack_size:
 669             max_pack_size = git_config_get(b'pack.packSizeLimit',
 670                                            repo_dir=self.repo_dir)
 671             if max_pack_size is not None:
 672                 max_pack_size = parse_num(max_pack_size)
 673             if not max_pack_size:
 674                 # larger packs slow down pruning
 675                 max_pack_size = 1000 * 1000 * 1000
 676         self.max_pack_size = max_pack_size
 677         # cache memory usage is about 83 bytes per object
 678         self.max_pack_objects = max_pack_objects if max_pack_objects \
 679                                 else max(1, self.max_pack_size // 5000)
 680
 681     def __del__(self):
 682         self.close()
 683
 684     def __enter__(self):
 685         return self
 686
 687     def __exit__(self, type, value, traceback):
 688         self.close()
 689
 690     def _open(self):
 691         if not self.file:
 692             objdir = dir = os.path.join(self.repo_dir, b'objects')
 693             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 694             try:
 695                 self.file = os.fdopen(fd, 'w+b')
 696             except:
 697                 os.close(fd)
 698                 raise
 699             try:
 700                 self.parentfd = os.open(objdir, os.O_RDONLY)
 701             except:
 702                 f = self.file
 703                 self.file = None
 704                 f.close()
 705                 raise
 706             assert name.endswith(b'.pack')
 707             self.filename = name[:-5]
 708             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 709             self.idx = list(list() for i in range(256))
 710
 711     def _raw_write(self, datalist, sha):
 712         self._open()
 713         f = self.file
 714         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 715         # the file never has a *partial* blob.  So let's make sure it's
 716         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 717         # to our hashsplit algorithm.)  f.write() does its own buffering,
 718         # but that's okay because we'll flush it in _end().
 719         oneblob = b''.join(datalist)
 720         try:
 721             f.write(oneblob)
 722         except IOError as e:
 723             reraise(GitError(e))
 724         nw = len(oneblob)
 725         crc = zlib.crc32(oneblob) & 0xffffffff
 726         self._update_idx(sha, crc, nw)
 727         self.outbytes += nw
 728         self.count += 1
 729         return nw, crc
 730
 731     def _update_idx(self, sha, crc, size):
 732         assert(sha)
 733         if self.idx:
 734             self.idx[byte_int(sha[0])].append((sha, crc,
 735                                                self.file.tell() - size))
 736
 737     def _write(self, sha, type, content):
 738         if verbose:
 739             log('>')
 740         if not sha:
 741             sha = calc_hash(type, content)
 742         size, crc = self._raw_write(_encode_packobj(type, content,
 743                                                     self.compression_level),
 744                                     sha=sha)
 745         if self.outbytes >= self.max_pack_size \
 746            or self.count >= self.max_pack_objects:
 747             self.breakpoint()
 748         return sha
 749
 750     def breakpoint(self):
 751         """Clear byte and object counts and return the last processed id."""
 752         id = self._end(self.run_midx)
 753         self.outbytes = self.count = 0
 754         return id
 755
 756     def _require_objcache(self):
 757         if self.objcache is None and self.objcache_maker:
 758             self.objcache = self.objcache_maker()
 759         if self.objcache is None:
 760             raise GitError(
 761                     "PackWriter not opened or can't check exists w/o objcache")
 762
 763     def exists(self, id, want_source=False):
 764         """Return non-empty if an object is found in the object cache."""
 765         self._require_objcache()
 766         return self.objcache.exists(id, want_source=want_source)
 767
 768     def just_write(self, sha, type, content):
 769         """Write an object to the pack file without checking for duplication."""
 770         self._write(sha, type, content)
 771         # If nothing else, gc doesn't have/want an objcache
 772         if self.objcache is not None:
 773             self.objcache.add(sha)
 774
 775     def maybe_write(self, type, content):
 776         """Write an object to the pack file if not present and return its id."""
 777         sha = calc_hash(type, content)
 778         if not self.exists(sha):
 779             self._require_objcache()
 780             self.just_write(sha, type, content)
 781         return sha
 782
 783     def new_blob(self, blob):
 784         """Create a blob object in the pack with the supplied content."""
 785         return self.maybe_write(b'blob', blob)
 786
 787     def new_tree(self, shalist):
 788         """Create a tree object in the pack."""
 789         content = tree_encode(shalist)
 790         return self.maybe_write(b'tree', content)
 791
 792     def new_commit(self, tree, parent,
 793                    author, adate_sec, adate_tz,
 794                    committer, cdate_sec, cdate_tz,
 795                    msg):
 796         """Create a commit object in the pack.  The date_sec values must be
 797         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 798         if adate_tz:
 799             adate_str = _git_date_str(adate_sec, adate_tz)
 800         else:
 801             adate_str = _local_git_date_str(adate_sec)
 802         if cdate_tz:
 803             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 804         else:
 805             cdate_str = _local_git_date_str(cdate_sec)
 806         l = []
 807         if tree: l.append(b'tree %s' % hexlify(tree))
 808         if parent: l.append(b'parent %s' % hexlify(parent))
 809         if author: l.append(b'author %s %s' % (author, adate_str))
 810         if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 811         l.append(b'')
 812         l.append(msg)
 813         return self.maybe_write(b'commit', b'\n'.join(l))
 814
 815     def abort(self):
 816         """Remove the pack file from disk."""
 817         f = self.file
 818         if f:
 819             pfd = self.parentfd
 820             self.file = None
 821             self.parentfd = None
 822             self.idx = None
 823             try:
 824                 try:
 825                     os.unlink(self.filename + b'.pack')
 826                 finally:
 827                     f.close()
 828             finally:
 829                 if pfd is not None:
 830                     os.close(pfd)
 831
    def _end(self, run_midx=True):
        """Finish the pack being written and move it into the repository.

        Stores self.count in the pack file at offset 8, hashes the file
        and writes the digest back to it, writes the matching .idx via
        _write_pack_idx_v2(), then renames both files to
        objects/pack/pack-<name>.{pack,idx} under self.repo_dir.
        Afterwards auto_midx() is run if run_midx is true, and
        self.on_pack_finish (if set) is called with the new path prefix.

        Returns that path prefix (without extension), or None when no
        pack file is open.
        """
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count (a 4-byte big-endian value at offset 8)
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum over the file from the start and
            # write the digest at the position reached after reading
            # (presumably the end of the file -- depends on chunkyreader)
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        # The pack's final name comes from the value returned by the
        # index writer, not from the pack checksum computed above.
        obj_list_sha = self._write_pack_idx_v2(self.filename + b'.idx', idx,
                                               packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' +  obj_list_sha)
        # Remove a leftover .map file belonging to the temporary pack.
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        # NOTE(review): parentfd is presumably an fd for the directory
        # holding the pack, synced so the renames are durable -- confirm
        # against the code that opens it.
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix
 878
 879     def close(self, run_midx=True):
 880         """Close the pack file and move it to its definitive path."""
 881         return self._end(run_midx=run_midx)
 882
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write the pack index file at filename and return the pack name.

        idx is iterated as a sequence of sections, each a sequence of
        entries; entry[2] is compared against 2**31 to count the
        entries that need an extra 8-byte slot.  The table itself is
        produced by _helpers.write_idx(); packbin is then appended,
        followed by a SHA-1 of the file's preceding content.

        Returns the hex SHA-1 of the 20 * self.count bytes that follow
        the 8-byte header and the 256-entry fan-out table.
        """
        # Count the entries whose entry[2] does not fit in 31 bits.
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            # Size the file first so it can be mapped and filled in place.
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, idx, self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        # Reopen in append mode: both writes below land at the end of the
        # file no matter where the read position currently is.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            # Header plus fan-out table.
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            # The next 20 * count bytes feed both the whole-file checksum
            # and the separate digest that becomes the pack's name.
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = hexlify(obj_list_sum.digest())

            # Presumably consumes the rest of the file (including packbin)
            # -- depends on chunkyreader's behavior without a count.
            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()
 928
 929
 930 def list_refs(patterns=None, repo_dir=None,
 931               limit_to_heads=False, limit_to_tags=False):
 932     """Yield (refname, hash) tuples for all repository refs unless
 933     patterns are specified.  In that case, only include tuples for
 934     refs matching those patterns (cf. git-show-ref(1)).  The limits
 935     restrict the result items to refs/heads or refs/tags.  If both
 936     limits are specified, items from both sources will be included.
 937
 938     """
 939     argv = [b'git', b'show-ref']
 940     if limit_to_heads:
 941         argv.append(b'--heads')
 942     if limit_to_tags:
 943         argv.append(b'--tags')
 944     argv.append(b'--')
 945     if patterns:
 946         argv.extend(patterns)
 947     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
 948     out = p.stdout.read().strip()
 949     rv = p.wait()  # not fatal
 950     if rv:
 951         assert(not out)
 952     if out:
 953         for d in out.split(b'\n'):
 954             sha, name = d.split(b' ', 1)
 955             yield name, unhexlify(sha)
 956
 957
 958 def read_ref(refname, repo_dir = None):
 959     """Get the commit id of the most recent commit made on a given ref."""
 960     refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
 961     l = tuple(islice(refs, 2))
 962     if l:
 963         assert(len(l) == 1)
 964         return l[0][1]
 965     else:
 966         return None
 967
 968
 969 def rev_list_invocation(ref_or_refs, count=None, format=None):
 970     if isinstance(ref_or_refs, bytes):
 971         refs = (ref_or_refs,)
 972     else:
 973         refs = ref_or_refs
 974     argv = [b'git', b'rev-list']
 975     if isinstance(count, Integral):
 976         argv.extend([b'-n', b'%d' % count])
 977     elif count:
 978         raise ValueError('unexpected count argument %r' % count)
 979
 980     if format:
 981         argv.append(b'--pretty=format:' + format)
 982     for ref in refs:
 983         assert not ref.startswith(b'-')
 984         argv.append(ref)
 985     argv.append(b'--')
 986     return argv
 987
 988
 989 def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
 990     """Yield information about commits as per "git rev-list".  If a format
 991     is not provided, yield one hex hash at a time.  If a format is
 992     provided, pass it to rev-list and call parse(git_stdout) for each
 993     commit with the stream positioned just after the rev-list "commit
 994     HASH" header line.  When a format is provided yield (oidx,
 995     parse(git_stdout)) for each commit.
 996
 997     """
 998     assert bool(parse) == bool(format)
 999     p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
1000                                              format=format),
1001                          env=_gitenv(repo_dir),
1002                          stdout = subprocess.PIPE)
1003     if not format:
1004         for line in p.stdout:
1005             yield line.strip()
1006     else:
1007         line = p.stdout.readline()
1008         while line:
1009             s = line.strip()
1010             if not s.startswith(b'commit '):
1011                 raise Exception('unexpected line ' + repr(s))
1012             s = s[7:]
1013             assert len(s) == 40
1014             yield s, parse(p.stdout)
1015             line = p.stdout.readline()
1016
1017     rv = p.wait()  # not fatal
1018     if rv:
1019         raise GitError('git rev-list returned error %d' % rv)
1020
1021
def get_commit_dates(refs, repo_dir=None):
    """Return a list holding the author_sec of the commit found for each
       of the given refs, in the order the refs were supplied.  Each
       ref is looked up with get_commit_items() through the CatPipe
       for repo_dir."""
    return [get_commit_items(ref, cp(repo_dir)).author_sec for ref in refs]
1031
1032
def rev_parse(committish, repo_dir=None):
    """Resolve 'committish' to an object id, if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    The name is first looked up as a ref via read_ref().  Failing that,
    a 40-character value is treated as a hex object id and checked
    against the pack indexes.

    Returns the binary (unhexlified) hash if it is found, None if
    'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Non-hex input raises TypeError on Python 2 but
            # binascii.Error (a ValueError subclass) on Python 3.
            return None

        if pL.exists(oid):
            return oid

    return None
1058
1059
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname at newval by running 'git update-ref'.

    newval and oldval are hexlified before being handed to git; a
    false oldval is passed as an empty string.  refname must live
    under refs/heads/ or refs/tags/.
    """
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    expected = oldval or b''
    argv = [b'git', b'update-ref', refname,
            hexlify(newval), hexlify(expected)]
    proc = subprocess.Popen(argv, env=_gitenv(repo_dir))
    _git_wait(b'git update-ref', proc)
1070
1071
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference with 'git update-ref -d' (see git
    update-ref(1)).  A true oldvalue is passed along as the final
    argument; refname must start with refs/."""
    assert refname.startswith(b'refs/')
    argv = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv, env=_gitenv())
    _git_wait('git update-ref', proc)
1079
1080
def guess_repo(path=None):
    """Decide which repository the global variable "repodir" names.

    A true path always wins.  Otherwise an already-set repodir is left
    alone, and an unset one falls back to the BUP_DIR environment
    variable and then to ~/.bup.  Nothing here checks that the
    repository exists; code that needs an existing repository would
    usually call check_repo_or_die() instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1095
1096
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The repository location is chosen by guess_repo(path).  Raises
    GitError if the parent directory does not exist, if the target
    exists but is not a directory, or (through _git_wait) if one of
    the git commands fails.  After 'git --bare init' the repository is
    configured with pack.indexVersion=2 and core.logAllRefUpdates=true.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv())
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.  (All argv
    # elements are bytes, like every other git invocation in this file.)
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
    # Enable the reflog
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
1119
1120
def check_repo_or_die(path=None):
    """Exit the process unless a bup repository probably exists.

    Returns normally when <repo>/objects/pack is a directory.  Exits
    with status 15 when the repository path itself does not exist and
    with status 14 when something is there but is not a repository.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1136
1137
_ver = None
def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        (1, 6, 6, 9)

    Only the leading dotted run of numbers in the 'git --version'
    output is used, so trailing vendor or pre-release decorations are
    ignored.  The result is cached in the module-level _ver.  Raises
    GitError if the output cannot be parsed or the version is older
    than 1.5.3.1.
    """
    global _ver
    if not _ver:
        p = subprocess.Popen([b'git', b'--version'], stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        # Capture digits and dots only.  A looser pattern would also
        # swallow suffixes such as " (Apple Git-117)", ".windows.1" or
        # ".rc1", and the int() conversion below would then fail.
        m = re.match(br'git version (\d+(?:\.\d+)+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(int(x) for x in m.group(1).split(b'.'))
    needed = (1, 5, 3, 1)
    if _ver < needed:
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(str(x) for x in needed),
                          '.'.join(str(x) for x in _ver)))
    return _ver
1163
1164
class _AbortableIter:
    """Iterator wrapper that runs a callback if iteration ends early.

    Items come from the wrapped iterator 'it'.  Once it is exhausted
    the wrapper is marked done.  If fetching an item fails with
    anything else, or abort() is called (explicitly or when the
    wrapper is deleted) before that point, 'onabort' is invoked -- at
    most once.
    """
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            item = next(self.it)
        except StopIteration:
            # Normal exhaustion: nothing to abort any more.
            self.done = True
            raise
        except:
            # Deliberately catches everything: any other failure aborts.
            self.abort()
            raise
        else:
            return item

    next = __next__

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if self.done:
            return
        self.done = True
        if self.onabort:
            self.onabort()

    def __del__(self):
        self.abort()
1195
1196
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self, repo_dir = None):
        """Remember repo_dir and check the git version.

        Logs an error and exits with status 1 if ver() reports a git
        older than 1.5.6.  The subprocess itself is not started here;
        get() starts it on demand.
        """
        self.repo_dir = repo_dir
        wanted = (1, 5, 6)
        if ver() < wanted:
            log('error: git version must be at least 1.5.6\n')
            sys.exit(1)
        # p is the running subprocess (if any); inprogress is the ref
        # whose data is currently being read via get().
        self.p = self.inprogress = None

    def _abort(self):
        """Close the subprocess pipes (if any) and forget the process
        and any read in progress."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        """Drop any current subprocess and start a new
        'git cat-file --batch' for self.repo_dir."""
        self._abort()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        """
        # (Re)start the subprocess if it was never started or has exited.
        if not self.p or self.p.poll() != None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result == None)
        # Only one object may be read at a time: the previous get() must
        # have been consumed completely before the next one starts.
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # The ref is sent as a single line on stdin, so it must not
        # contain line breaks, and it must not look like an option.
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        # Otherwise the header must be three space-separated fields, the
        # first being a 40-character id.
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        # If reading the data fails or is abandoned part-way, the pipe is
        # torn down via _abort rather than left in an unknown state.
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self._abort)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The data is expected to be followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception as e:
            it.abort()
            raise

    def _join(self, it):
        """Yield the blob data reachable from the object produced by 'it'.

        'it' is an iterator as returned by get(): first an
        (oidx, type, size) tuple, then the object's data.  Blobs are
        yielded as-is; trees and commits are followed via join().
        Raises GitError for any other object type.
        """
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of the commit is expected to name its tree.
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        # NOTE(review): on Python 3.7+ a StopIteration escaping a
        # generator is turned into RuntimeError (PEP 479), so this
        # handler presumably only fires on older interpreters -- confirm.
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1295
1296
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating it on first use.

    Instances are cached in the module-level _cp dictionary, keyed by
    the absolute repository path.  Without a repo_dir the default
    repository (repodir, or else repo()) is used.
    """
    if not repo_dir:
        repo_dir = repodir or repo()
    key = os.path.abspath(repo_dir)
    pipe = _cp.get(key)
    if pipe:
        return pipe
    pipe = CatPipe(key)
    _cp[key] = pipe
    return pipe
1310
1311
def tags(repo_dir = None):
    """Map each tagged hash to the names of the tags pointing at it.

    Returns a dictionary of the form {hash: [tag_name, ...]}, with the
    leading 'refs/tags/' removed from every name.
    """
    prefix = b'refs/tags/'
    names_by_hash = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(prefix)
        # Several tags may share one target, hence a list per hash.
        names_by_hash.setdefault(oid, []).append(refname[len(prefix):])
    return names_by_hash
1322
1323
class MissingObject(KeyError):
    """Raised when an object id cannot be found in the repository.

    The id that was passed in is kept, unchanged, in the oid attribute;
    the message shows it in hex.
    """
    def __init__(self, oid):
        self.oid = oid
        # bytes has no .encode('hex') on Python 3; hexlify works on
        # both Python 2 and 3.
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
1328
1329
# Record describing one object visited by walk_object().
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1342
1343
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (oidx, parent_path, chunk_path, mode).
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        # stop_at prunes this object and everything only reachable via it.
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        # The first item from get_ref is the (oidx, type, size) header;
        # a false oidx there means the object is missing.
        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        # Commit and tree content is always read (it is parsed below),
        # but only handed to the caller when include_data is set.
        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            # Queue the parents and the commit's tree; the tree is
            # appended last, so it is the next entry popped.
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the path stays put
                    # and only the chunk_path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Entering a chunked file: start its chunk_path.
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))