lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12 from numbers import Integral
  13
  14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  15 from bup.compat import (buffer,
  16                         byte_int, bytes_from_byte, bytes_from_uint,
  17                         environ,
  18                         items,
  19                         range,
  20                         reraise)
  21 from bup.io import path_msg
  22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  23                          fdatasync,
  24                          hostname, localtime, log,
  25                          merge_dict,
  26                          merge_iter,
  27                          mmap_read, mmap_readwrite,
  28                          parse_num,
  29                          progress, qprogress, stat_if_exists,
  30                          unlink,
  31                          utc_offset_str)
  32 from bup.pwdgrp import username, userfullname
  33
  34
# Verbosity level; when nonzero, PackWriter._write() logs a '>' per object.
verbose = 0
repodir = None  # The default repository, once initialized

# Git object type names mapped to the numeric type code stored in the
# first byte of a pack object header (see _encode_packobj).
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
# Reverse mapping (numeric code -> type name), used by _decode_packobj.
_typermap = {v: k for k, v in items(_typemap)}


# Running counters updated by the index lookup code
# (PackIdx._idx_from_hash and PackIdxList.exists).
_total_searches = 0
_total_steps = 0
  44
  45
  46 class GitError(Exception):
  47     pass
  48
  49
  50 def _gitenv(repo_dir=None):
  51     if not repo_dir:
  52         repo_dir = repo()
  53     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  54
  55 def _git_wait(cmd, p):
  56     rv = p.wait()
  57     if rv != 0:
  58         raise GitError('%r returned %d' % (cmd, rv))
  59
  60 def git_config_get(option, repo_dir=None):
  61     cmd = (b'git', b'config', b'--get', option)
  62     p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
  63                          env=_gitenv(repo_dir=repo_dir))
  64     r = p.stdout.read()
  65     rc = p.wait()
  66     if rc == 0:
  67         return r
  68     if rc != 1:
  69         raise GitError('%r returned %d' % (cmd, rc))
  70     return None
  71
  72
  73 def parse_tz_offset(s):
  74     """UTC offset in seconds."""
  75     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  76     if bytes_from_byte(s[0]) == b'-':
  77         return - tz_off
  78     return tz_off
  79
  80
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Building blocks for _commit_rx, which matches the raw bytes of a
# commit object.  A "safe string" (used for the name and mail fields)
# is either one or two start/end characters, or a run of content
# characters with a start/end character on each side.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone offset: a sign, two hour digits and two minute digits (00-59).
_tz_rx = br'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Whole-commit pattern: tree line, zero or more parent lines, author and
# committer lines, an optional mergetag, a blank line, then the message.
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Captures the 40-digit hex id of each parent line within the "parents"
# group matched by _commit_rx.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 102
 103 # Note that the author_sec and committer_sec values are (UTC) epoch
 104 # seconds, and for now the mergetag is not included.
 105 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 106                                        'author_name', 'author_mail',
 107                                        'author_sec', 'author_offset',
 108                                        'committer_name', 'committer_mail',
 109                                        'committer_sec', 'committer_offset',
 110                                        'message'])
 111
 112 def parse_commit(content):
 113     commit_match = re.match(_commit_rx, content)
 114     if not commit_match:
 115         raise Exception('cannot parse commit %r' % content)
 116     matches = commit_match.groupdict()
 117     return CommitInfo(tree=matches['tree'],
 118                       parents=re.findall(_parent_hash_rx, matches['parents']),
 119                       author_name=matches['author_name'],
 120                       author_mail=matches['author_mail'],
 121                       author_sec=int(matches['asec']),
 122                       author_offset=parse_tz_offset(matches['atz']),
 123                       committer_name=matches['committer_name'],
 124                       committer_mail=matches['committer_mail'],
 125                       committer_sec=int(matches['csec']),
 126                       committer_offset=parse_tz_offset(matches['ctz']),
 127                       message=matches['message'])
 128
 129
 130 def get_cat_data(cat_iterator, expected_type):
 131     _, kind, _ = next(cat_iterator)
 132     if kind != expected_type:
 133         raise Exception('expected %r, saw %r' % (expected_type, kind))
 134     return b''.join(cat_iterator)
 135
 136 def get_commit_items(id, cp):
 137     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 138
 139 def _local_git_date_str(epoch_sec):
 140     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 141
 142
 143 def _git_date_str(epoch_sec, tz_offset_sec):
 144     offs =  tz_offset_sec // 60
 145     return b'%d %s%02d%02d' \
 146         % (epoch_sec,
 147            b'+' if offs >= 0 else b'-',
 148            abs(offs) // 60,
 149            abs(offs) % 60)
 150
 151
 152 def repo(sub = b'', repo_dir=None):
 153     """Get the path to the git repository or one of its subdirectories."""
 154     repo_dir = repo_dir or repodir
 155     if not repo_dir:
 156         raise GitError('You should call check_repo_or_die()')
 157
 158     # If there's a .git subdirectory, then the actual repo is in there.
 159     gd = os.path.join(repo_dir, b'.git')
 160     if os.path.exists(gd):
 161         repo_dir = gd
 162
 163     return os.path.join(repo_dir, sub)
 164
 165
 166 _shorten_hash_rx = \
 167     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 168
 169 def shorten_hash(s):
 170     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 171
 172
 173 def repo_rel(path):
 174     full = os.path.abspath(path)
 175     fullrepo = os.path.abspath(repo(b''))
 176     if not fullrepo.endswith(b'/'):
 177         fullrepo += b'/'
 178     if full.startswith(fullrepo):
 179         path = full[len(fullrepo):]
 180     if path.startswith(b'index-cache/'):
 181         path = path[len(b'index-cache/'):]
 182     return shorten_hash(path)
 183
 184
 185 def all_packdirs():
 186     paths = [repo(b'objects/pack')]
 187     paths += glob.glob(repo(b'index-cache/*/.'))
 188     return paths
 189
 190
 191 def auto_midx(objdir):
 192     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 193     try:
 194         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 195     except OSError as e:
 196         # make sure 'args' gets printed to help with debugging
 197         add_error('%r: exception: %s' % (args, e))
 198         raise
 199     if rv:
 200         add_error('%r: returned %d' % (args, rv))
 201
 202     args = [path.exe(), b'bloom', b'--dir', objdir]
 203     try:
 204         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 205     except OSError as e:
 206         # make sure 'args' gets printed to help with debugging
 207         add_error('%r: exception: %s' % (args, e))
 208         raise
 209     if rv:
 210         add_error('%r: returned %d' % (args, rv))
 211
 212
 213 def mangle_name(name, mode, gitmode):
 214     """Mangle a file name to present an abstract name for segmented files.
 215     Mangled file names will have the ".bup" extension added to them. If a
 216     file's name already ends with ".bup", a ".bupl" extension is added to
 217     disambiguate normal files from segmented ones.
 218     """
 219     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 220         assert(stat.S_ISDIR(gitmode))
 221         return name + b'.bup'
 222     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 223         return name + b'.bupl'
 224     else:
 225         return name
 226
 227
 228 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 229 def demangle_name(name, mode):
 230     """Remove name mangling from a file name, if necessary.
 231
 232     The return value is a tuple (demangled_filename,mode), where mode is one of
 233     the following:
 234
 235     * BUP_NORMAL  : files that should be read as-is from the repository
 236     * BUP_CHUNKED : files that were chunked and need to be reassembled
 237
 238     For more information on the name mangling algorithm, see mangle_name()
 239     """
 240     if name.endswith(b'.bupl'):
 241         return (name[:-5], BUP_NORMAL)
 242     elif name.endswith(b'.bup'):
 243         return (name[:-4], BUP_CHUNKED)
 244     elif name.endswith(b'.bupm'):
 245         return (name[:-5],
 246                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 247     else:
 248         return (name, BUP_NORMAL)
 249
 250
 251 def calc_hash(type, content):
 252     """Calculate some content's hash in the Git fashion."""
 253     header = b'%s %d\0' % (type, len(content))
 254     sum = Sha1(header)
 255     sum.update(content)
 256     return sum.digest()
 257
 258
 259 def shalist_item_sort_key(ent):
 260     (mode, name, id) = ent
 261     assert(mode+0 == mode)
 262     if stat.S_ISDIR(mode):
 263         return name + b'/'
 264     else:
 265         return name
 266
 267
 268 def tree_encode(shalist):
 269     """Generate a git tree object from (mode,name,hash) tuples."""
 270     shalist = sorted(shalist, key = shalist_item_sort_key)
 271     l = []
 272     for (mode,name,bin) in shalist:
 273         assert(mode)
 274         assert(mode+0 == mode)
 275         assert(name)
 276         assert(len(bin) == 20)
 277         s = b'%o %s\0%s' % (mode,name,bin)
 278         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 279         l.append(s)
 280     return b''.join(l)
 281
 282
 283 def tree_decode(buf):
 284     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 285     ofs = 0
 286     while ofs < len(buf):
 287         z = buf.find(b'\0', ofs)
 288         assert(z > ofs)
 289         spl = buf[ofs:z].split(b' ', 1)
 290         assert(len(spl) == 2)
 291         mode,name = spl
 292         sha = buf[z+1:z+1+20]
 293         ofs = z+1+20
 294         yield (int(mode, 8), name, sha)
 295
 296
 297 def _encode_packobj(type, content, compression_level=1):
 298     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 299         raise ValueError('invalid compression level %s' % compression_level)
 300     szout = b''
 301     sz = len(content)
 302     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 303     sz >>= 4
 304     while 1:
 305         if sz: szbits |= 0x80
 306         szout += bytes_from_uint(szbits)
 307         if not sz:
 308             break
 309         szbits = sz & 0x7f
 310         sz >>= 7
 311     z = zlib.compressobj(compression_level)
 312     yield szout
 313     yield z.compress(content)
 314     yield z.flush()
 315
 316
 317 def _encode_looseobj(type, content, compression_level=1):
 318     z = zlib.compressobj(compression_level)
 319     yield z.compress(b'%s %d\0' % (type, len(content)))
 320     yield z.compress(content)
 321     yield z.flush()
 322
 323
 324 def _decode_looseobj(buf):
 325     assert(buf);
 326     s = zlib.decompress(buf)
 327     i = s.find(b'\0')
 328     assert(i > 0)
 329     l = s[:i].split(b' ')
 330     type = l[0]
 331     sz = int(l[1])
 332     content = s[i+1:]
 333     assert(type in _typemap)
 334     assert(sz == len(content))
 335     return (type, content)
 336
 337
 338 def _decode_packobj(buf):
 339     assert(buf)
 340     c = byte_int(buf[0])
 341     type = _typermap[(c & 0x70) >> 4]
 342     sz = c & 0x0f
 343     shift = 4
 344     i = 0
 345     while c & 0x80:
 346         i += 1
 347         c = byte_int(buf[i])
 348         sz |= (c & 0x7f) << shift
 349         shift += 7
 350         if not (c & 0x80):
 351             break
 352     return (type, zlib.decompress(buf[i+1:]))
 353
 354
 355 class PackIdx:
 356     def __init__(self):
 357         assert(0)
 358
 359     def find_offset(self, hash):
 360         """Get the offset of an object inside the index file."""
 361         idx = self._idx_from_hash(hash)
 362         if idx != None:
 363             return self._ofs_from_idx(idx)
 364         return None
 365
 366     def exists(self, hash, want_source=False):
 367         """Return nonempty if the object exists in this index."""
 368         if hash and (self._idx_from_hash(hash) != None):
 369             return want_source and os.path.basename(self.name) or True
 370         return None
 371
 372     def _idx_from_hash(self, hash):
 373         global _total_searches, _total_steps
 374         _total_searches += 1
 375         assert(len(hash) == 20)
 376         b1 = byte_int(hash[0])
 377         start = self.fanout[b1-1] # range -1..254
 378         end = self.fanout[b1] # range 0..255
 379         want = hash
 380         _total_steps += 1  # lookup table is a step
 381         while start < end:
 382             _total_steps += 1
 383             mid = start + (end - start) // 2
 384             v = self._idx_to_hash(mid)
 385             if v < want:
 386                 start = mid+1
 387             elif v > want:
 388                 end = mid
 389             else: # got it!
 390                 return mid
 391         return None
 392
 393
 394 class PackIdxV1(PackIdx):
 395     """Object representation of a Git pack index (version 1) file."""
 396     def __init__(self, filename, f):
 397         self.name = filename
 398         self.idxnames = [self.name]
 399         self.map = mmap_read(f)
 400         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 401         self.fanout = array('L', struct.unpack('!256I', self.map))
 402         self.fanout.append(0)  # entry "-1"
 403         self.nsha = self.fanout[255]
 404         self.sha_ofs = 256 * 4
 405         # Avoid slicing shatable for individual hashes (very high overhead)
 406         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 407
 408     def __len__(self):
 409         return int(self.nsha)  # int() from long for python 2
 410
 411     def _ofs_from_idx(self, idx):
 412         if idx >= self.nsha or idx < 0:
 413             raise IndexError('invalid pack index index %d' % idx)
 414         ofs = self.sha_ofs + idx * 24
 415         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 416
 417     def _idx_to_hash(self, idx):
 418         if idx >= self.nsha or idx < 0:
 419             raise IndexError('invalid pack index index %d' % idx)
 420         ofs = self.sha_ofs + idx * 24 + 4
 421         return self.map[ofs : ofs + 20]
 422
 423     def __iter__(self):
 424         start = self.sha_ofs + 4
 425         for ofs in range(start, start + 24 * self.nsha, 24):
 426             yield self.map[ofs : ofs + 20]
 427
 428
 429 class PackIdxV2(PackIdx):
 430     """Object representation of a Git pack index (version 2) file."""
 431     def __init__(self, filename, f):
 432         self.name = filename
 433         self.idxnames = [self.name]
 434         self.map = mmap_read(f)
 435         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 436         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 437         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 438         self.fanout.append(0)
 439         self.nsha = self.fanout[255]
 440         self.sha_ofs = 8 + 256*4
 441         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 442         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 443         # Avoid slicing this for individual hashes (very high overhead)
 444         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 445
 446     def __len__(self):
 447         return int(self.nsha)  # int() from long for python 2
 448
 449     def _ofs_from_idx(self, idx):
 450         if idx >= self.nsha or idx < 0:
 451             raise IndexError('invalid pack index index %d' % idx)
 452         ofs_ofs = self.ofstable_ofs + idx * 4
 453         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 454         if ofs & 0x80000000:
 455             idx64 = ofs & 0x7fffffff
 456             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 457             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 458         return ofs
 459
 460     def _idx_to_hash(self, idx):
 461         if idx >= self.nsha or idx < 0:
 462             raise IndexError('invalid pack index index %d' % idx)
 463         ofs = self.sha_ofs + idx * 20
 464         return self.map[ofs : ofs + 20]
 465
 466     def __iter__(self):
 467         start = self.sha_ofs
 468         for ofs in range(start, start + 20 * self.nsha, 20):
 469             yield self.map[ofs : ofs + 20]
 470
 471
 472 _mpi_count = 0
 473 class PackIdxList:
 474     def __init__(self, dir, ignore_midx=False):
 475         global _mpi_count
 476         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 477         _mpi_count += 1
 478         self.dir = dir
 479         self.also = set()
 480         self.packs = []
 481         self.do_bloom = False
 482         self.bloom = None
 483         self.ignore_midx = ignore_midx
 484         self.refresh()
 485
 486     def __del__(self):
 487         global _mpi_count
 488         _mpi_count -= 1
 489         assert(_mpi_count == 0)
 490
 491     def __iter__(self):
 492         return iter(idxmerge(self.packs))
 493
 494     def __len__(self):
 495         return sum(len(pack) for pack in self.packs)
 496
 497     def exists(self, hash, want_source=False):
 498         """Return nonempty if the object exists in the index files."""
 499         global _total_searches
 500         _total_searches += 1
 501         if hash in self.also:
 502             return True
 503         if self.do_bloom and self.bloom:
 504             if self.bloom.exists(hash):
 505                 self.do_bloom = False
 506             else:
 507                 _total_searches -= 1  # was counted by bloom
 508                 return None
 509         for i in range(len(self.packs)):
 510             p = self.packs[i]
 511             _total_searches -= 1  # will be incremented by sub-pack
 512             ix = p.exists(hash, want_source=want_source)
 513             if ix:
 514                 # reorder so most recently used packs are searched first
 515                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 516                 return ix
 517         self.do_bloom = True
 518         return None
 519
 520     def refresh(self, skip_midx = False):
 521         """Refresh the index list.
 522         This method verifies if .midx files were superseded (e.g. all of its
 523         contents are in another, bigger .midx file) and removes the superseded
 524         files.
 525
 526         If skip_midx is True, all work on .midx files will be skipped and .midx
 527         files will be removed from the list.
 528
 529         The instance variable 'ignore_midx' can force this function to
 530         always act as if skip_midx was True.
 531         """
 532         if self.bloom is not None:
 533             self.bloom.close()
 534         self.bloom = None # Always reopen the bloom as it may have been relaced
 535         self.do_bloom = False
 536         skip_midx = skip_midx or self.ignore_midx
 537         d = dict((p.name, p) for p in self.packs
 538                  if not skip_midx or not isinstance(p, midx.PackMidx))
 539         if os.path.exists(self.dir):
 540             if not skip_midx:
 541                 midxl = []
 542                 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
 543                 # remove any *.midx files from our list that no longer exist
 544                 for ix in list(d.values()):
 545                     if not isinstance(ix, midx.PackMidx):
 546                         continue
 547                     if ix.name in midxes:
 548                         continue
 549                     # remove the midx
 550                     del d[ix.name]
 551                     ix.close()
 552                     self.packs.remove(ix)
 553                 for ix in self.packs:
 554                     if isinstance(ix, midx.PackMidx):
 555                         for name in ix.idxnames:
 556                             d[os.path.join(self.dir, name)] = ix
 557                 for full in midxes:
 558                     if not d.get(full):
 559                         mx = midx.PackMidx(full)
 560                         (mxd, mxf) = os.path.split(mx.name)
 561                         broken = False
 562                         for n in mx.idxnames:
 563                             if not os.path.exists(os.path.join(mxd, n)):
 564                                 log(('warning: index %s missing\n'
 565                                      '  used by %s\n')
 566                                     % (path_msg(n), path_msg(mxf)))
 567                                 broken = True
 568                         if broken:
 569                             mx.close()
 570                             del mx
 571                             unlink(full)
 572                         else:
 573                             midxl.append(mx)
 574                 midxl.sort(key=lambda ix:
 575                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 576                 for ix in midxl:
 577                     any_needed = False
 578                     for sub in ix.idxnames:
 579                         found = d.get(os.path.join(self.dir, sub))
 580                         if not found or isinstance(found, PackIdx):
 581                             # doesn't exist, or exists but not in a midx
 582                             any_needed = True
 583                             break
 584                     if any_needed:
 585                         d[ix.name] = ix
 586                         for name in ix.idxnames:
 587                             d[os.path.join(self.dir, name)] = ix
 588                     elif not ix.force_keep:
 589                         debug1('midx: removing redundant: %s\n'
 590                                % path_msg(os.path.basename(ix.name)))
 591                         ix.close()
 592                         unlink(ix.name)
 593             for full in glob.glob(os.path.join(self.dir, b'*.idx')):
 594                 if not d.get(full):
 595                     try:
 596                         ix = open_idx(full)
 597                     except GitError as e:
 598                         add_error(e)
 599                         continue
 600                     d[full] = ix
 601             bfull = os.path.join(self.dir, b'bup.bloom')
 602             if self.bloom is None and os.path.exists(bfull):
 603                 self.bloom = bloom.ShaBloom(bfull)
 604             self.packs = list(set(d.values()))
 605             self.packs.sort(reverse=True, key=lambda x: len(x))
 606             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 607                 self.do_bloom = True
 608             else:
 609                 self.bloom = None
 610         debug1('PackIdxList: using %d index%s.\n'
 611             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 612
 613     def add(self, hash):
 614         """Insert an additional object in the list."""
 615         self.also.add(hash)
 616
 617
 618 def open_idx(filename):
 619     if filename.endswith(b'.idx'):
 620         f = open(filename, 'rb')
 621         header = f.read(8)
 622         if header[0:4] == b'\377tOc':
 623             version = struct.unpack('!I', header[4:8])[0]
 624             if version == 2:
 625                 return PackIdxV2(filename, f)
 626             else:
 627                 raise GitError('%s: expected idx file version 2, got %d'
 628                                % (path_msg(filename), version))
 629         elif len(header) == 8 and header[0:4] < b'\377tOc':
 630             return PackIdxV1(filename, f)
 631         else:
 632             raise GitError('%s: unrecognized idx file header'
 633                            % path_msg(filename))
 634     elif filename.endswith(b'.midx'):
 635         return midx.PackMidx(filename)
 636     else:
 637         raise GitError('idx filenames must end with .idx or .midx')
 638
 639
 640 def idxmerge(idxlist, final_progress=True):
 641     """Generate a list of all the objects reachable in a PackIdxList."""
 642     def pfunc(count, total):
 643         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 644                   % (count*100.0/total, count, total))
 645     def pfinal(count, total):
 646         if final_progress:
 647             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 648                      % (100, total, total))
 649     return merge_iter(idxlist, 10024, pfunc, pfinal)
 650
 651
 652 def _make_objcache():
 653     return PackIdxList(repo(b'objects/pack'))
 654
 655 # bup-gc assumes that it can disable all PackWriter activities
 656 # (bloom/midx/cache) via the constructor and close() arguments.
 657
 658 class PackWriter:
 659     """Writes Git objects inside a pack file."""
 660     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 661                  run_midx=True, on_pack_finish=None,
 662                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 663         self.repo_dir = repo_dir or repo()
 664         self.file = None
 665         self.parentfd = None
 666         self.count = 0
 667         self.outbytes = 0
 668         self.filename = None
 669         self.idx = None
 670         self.objcache_maker = objcache_maker
 671         self.objcache = None
 672         self.compression_level = compression_level
 673         self.run_midx=run_midx
 674         self.on_pack_finish = on_pack_finish
 675         if not max_pack_size:
 676             max_pack_size = git_config_get(b'pack.packSizeLimit',
 677                                            repo_dir=self.repo_dir)
 678             if max_pack_size is not None:
 679                 max_pack_size = parse_num(max_pack_size)
 680             if not max_pack_size:
 681                 # larger packs slow down pruning
 682                 max_pack_size = 1000 * 1000 * 1000
 683         self.max_pack_size = max_pack_size
 684         # cache memory usage is about 83 bytes per object
 685         self.max_pack_objects = max_pack_objects if max_pack_objects \
 686                                 else max(1, self.max_pack_size // 5000)
 687
 688     def __del__(self):
 689         self.close()
 690
 691     def __enter__(self):
 692         return self
 693
 694     def __exit__(self, type, value, traceback):
 695         self.close()
 696
 697     def _open(self):
 698         if not self.file:
 699             objdir = dir = os.path.join(self.repo_dir, b'objects')
 700             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 701             try:
 702                 self.file = os.fdopen(fd, 'w+b')
 703             except:
 704                 os.close(fd)
 705                 raise
 706             try:
 707                 self.parentfd = os.open(objdir, os.O_RDONLY)
 708             except:
 709                 f = self.file
 710                 self.file = None
 711                 f.close()
 712                 raise
 713             assert name.endswith(b'.pack')
 714             self.filename = name[:-5]
 715             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 716             self.idx = list(list() for i in range(256))
 717
 718     def _raw_write(self, datalist, sha):
 719         self._open()
 720         f = self.file
 721         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 722         # the file never has a *partial* blob.  So let's make sure it's
 723         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 724         # to our hashsplit algorithm.)  f.write() does its own buffering,
 725         # but that's okay because we'll flush it in _end().
 726         oneblob = b''.join(datalist)
 727         try:
 728             f.write(oneblob)
 729         except IOError as e:
 730             reraise(GitError(e))
 731         nw = len(oneblob)
 732         crc = zlib.crc32(oneblob) & 0xffffffff
 733         self._update_idx(sha, crc, nw)
 734         self.outbytes += nw
 735         self.count += 1
 736         return nw, crc
 737
 738     def _update_idx(self, sha, crc, size):
 739         assert(sha)
 740         if self.idx:
 741             self.idx[byte_int(sha[0])].append((sha, crc,
 742                                                self.file.tell() - size))
 743
 744     def _write(self, sha, type, content):
 745         if verbose:
 746             log('>')
 747         if not sha:
 748             sha = calc_hash(type, content)
 749         size, crc = self._raw_write(_encode_packobj(type, content,
 750                                                     self.compression_level),
 751                                     sha=sha)
 752         if self.outbytes >= self.max_pack_size \
 753            or self.count >= self.max_pack_objects:
 754             self.breakpoint()
 755         return sha
 756
 757     def breakpoint(self):
 758         """Clear byte and object counts and return the last processed id."""
 759         id = self._end(self.run_midx)
 760         self.outbytes = self.count = 0
 761         return id
 762
 763     def _require_objcache(self):
 764         if self.objcache is None and self.objcache_maker:
 765             self.objcache = self.objcache_maker()
 766         if self.objcache is None:
 767             raise GitError(
 768                     "PackWriter not opened or can't check exists w/o objcache")
 769
 770     def exists(self, id, want_source=False):
 771         """Return non-empty if an object is found in the object cache."""
 772         self._require_objcache()
 773         return self.objcache.exists(id, want_source=want_source)
 774
 775     def just_write(self, sha, type, content):
 776         """Write an object to the pack file without checking for duplication."""
 777         self._write(sha, type, content)
 778         # If nothing else, gc doesn't have/want an objcache
 779         if self.objcache is not None:
 780             self.objcache.add(sha)
 781
 782     def maybe_write(self, type, content):
 783         """Write an object to the pack file if not present and return its id."""
 784         sha = calc_hash(type, content)
 785         if not self.exists(sha):
 786             self._require_objcache()
 787             self.just_write(sha, type, content)
 788         return sha
 789
 790     def new_blob(self, blob):
 791         """Create a blob object in the pack with the supplied content."""
 792         return self.maybe_write(b'blob', blob)
 793
 794     def new_tree(self, shalist):
 795         """Create a tree object in the pack."""
 796         content = tree_encode(shalist)
 797         return self.maybe_write(b'tree', content)
 798
 799     def new_commit(self, tree, parent,
 800                    author, adate_sec, adate_tz,
 801                    committer, cdate_sec, cdate_tz,
 802                    msg):
 803         """Create a commit object in the pack.  The date_sec values must be
 804         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 805         if adate_tz:
 806             adate_str = _git_date_str(adate_sec, adate_tz)
 807         else:
 808             adate_str = _local_git_date_str(adate_sec)
 809         if cdate_tz:
 810             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 811         else:
 812             cdate_str = _local_git_date_str(cdate_sec)
 813         l = []
 814         if tree: l.append(b'tree %s' % hexlify(tree))
 815         if parent: l.append(b'parent %s' % hexlify(parent))
 816         if author: l.append(b'author %s %s' % (author, adate_str))
 817         if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 818         l.append(b'')
 819         l.append(msg)
 820         return self.maybe_write(b'commit', b'\n'.join(l))
 821
 822     def abort(self):
 823         """Remove the pack file from disk."""
 824         f = self.file
 825         if f:
 826             pfd = self.parentfd
 827             self.file = None
 828             self.parentfd = None
 829             self.idx = None
 830             try:
 831                 try:
 832                     os.unlink(self.filename + b'.pack')
 833                 finally:
 834                     f.close()
 835             finally:
 836                 if pfd is not None:
 837                     os.close(pfd)
 838
    def _end(self, run_midx=True):
        """Finalize the open pack file and move it into objects/pack.

        Returns None when no pack file is open.  Otherwise patches the
        object count into the pack header, appends the pack's SHA-1,
        writes the matching .idx, renames both files to
        objects/pack/pack-<name>.{pack,idx} under self.repo_dir, and
        returns that path prefix (without extension).  When run_midx is
        true, auto_midx() is run on the pack directory; if
        self.on_pack_finish is set it is called with the prefix.
        """
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count: 4 bytes in network byte order at offset 8
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum over everything written so far
            # and append it as the trailer
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        # NOTE(review): if anything from here up to the fsync below
        # raises, self.parentfd is not closed by this method -- confirm
        # whether some caller cleans it up.
        obj_list_sha = self._write_pack_idx_v2(self.filename + b'.idx', idx,
                                               packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' +  obj_list_sha)
        # Remove a leftover .map file for the temporary name, if any
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        # fsync the parent directory descriptor (presumably so the
        # renames are durable), then release it
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix
 885
 886     def close(self, run_midx=True):
 887         """Close the pack file and move it to its definitive path."""
 888         return self._end(run_midx=run_midx)
 889
 890     def _write_pack_idx_v2(self, filename, idx, packbin):
 891         ofs64_count = 0
 892         for section in idx:
 893             for entry in section:
 894                 if entry[2] >= 2**31:
 895                     ofs64_count += 1
 896
 897         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 898         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 899         idx_map = None
 900         idx_f = open(filename, 'w+b')
 901         try:
 902             idx_f.truncate(index_len)
 903             fdatasync(idx_f.fileno())
 904             idx_map = mmap_readwrite(idx_f, close=False)
 905             try:
 906                 count = _helpers.write_idx(filename, idx_map, idx, self.count)
 907                 assert(count == self.count)
 908                 idx_map.flush()
 909             finally:
 910                 idx_map.close()
 911         finally:
 912             idx_f.close()
 913
 914         idx_f = open(filename, 'a+b')
 915         try:
 916             idx_f.write(packbin)
 917             idx_f.seek(0)
 918             idx_sum = Sha1()
 919             b = idx_f.read(8 + 4*256)
 920             idx_sum.update(b)
 921
 922             obj_list_sum = Sha1()
 923             for b in chunkyreader(idx_f, 20*self.count):
 924                 idx_sum.update(b)
 925                 obj_list_sum.update(b)
 926             namebase = hexlify(obj_list_sum.digest())
 927
 928             for b in chunkyreader(idx_f):
 929                 idx_sum.update(b)
 930             idx_f.write(idx_sum.digest())
 931             fdatasync(idx_f.fileno())
 932             return namebase
 933         finally:
 934             idx_f.close()
 935
 936
 937 def list_refs(patterns=None, repo_dir=None,
 938               limit_to_heads=False, limit_to_tags=False):
 939     """Yield (refname, hash) tuples for all repository refs unless
 940     patterns are specified.  In that case, only include tuples for
 941     refs matching those patterns (cf. git-show-ref(1)).  The limits
 942     restrict the result items to refs/heads or refs/tags.  If both
 943     limits are specified, items from both sources will be included.
 944
 945     """
 946     argv = [b'git', b'show-ref']
 947     if limit_to_heads:
 948         argv.append(b'--heads')
 949     if limit_to_tags:
 950         argv.append(b'--tags')
 951     argv.append(b'--')
 952     if patterns:
 953         argv.extend(patterns)
 954     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
 955     out = p.stdout.read().strip()
 956     rv = p.wait()  # not fatal
 957     if rv:
 958         assert(not out)
 959     if out:
 960         for d in out.split(b'\n'):
 961             sha, name = d.split(b' ', 1)
 962             yield name, unhexlify(sha)
 963
 964
 965 def read_ref(refname, repo_dir = None):
 966     """Get the commit id of the most recent commit made on a given ref."""
 967     refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
 968     l = tuple(islice(refs, 2))
 969     if l:
 970         assert(len(l) == 1)
 971         return l[0][1]
 972     else:
 973         return None
 974
 975
 976 def rev_list_invocation(ref_or_refs, format=None):
 977     if isinstance(ref_or_refs, bytes):
 978         refs = (ref_or_refs,)
 979     else:
 980         refs = ref_or_refs
 981     argv = [b'git', b'rev-list']
 982
 983     if format:
 984         argv.append(b'--pretty=format:' + format)
 985     for ref in refs:
 986         assert not ref.startswith(b'-')
 987         argv.append(ref)
 988     argv.append(b'--')
 989     return argv
 990
 991
 992 def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
 993     """Yield information about commits as per "git rev-list".  If a format
 994     is not provided, yield one hex hash at a time.  If a format is
 995     provided, pass it to rev-list and call parse(git_stdout) for each
 996     commit with the stream positioned just after the rev-list "commit
 997     HASH" header line.  When a format is provided yield (oidx,
 998     parse(git_stdout)) for each commit.
 999
1000     """
1001     assert bool(parse) == bool(format)
1002     p = subprocess.Popen(rev_list_invocation(ref_or_refs,
1003                                              format=format),
1004                          env=_gitenv(repo_dir),
1005                          stdout = subprocess.PIPE)
1006     if not format:
1007         for line in p.stdout:
1008             yield line.strip()
1009     else:
1010         line = p.stdout.readline()
1011         while line:
1012             s = line.strip()
1013             if not s.startswith(b'commit '):
1014                 raise Exception('unexpected line ' + repr(s))
1015             s = s[7:]
1016             assert len(s) == 40
1017             yield s, parse(p.stdout)
1018             line = p.stdout.readline()
1019
1020     rv = p.wait()  # not fatal
1021     if rv:
1022         raise GitError('git rev-list returned error %d' % rv)
1023
1024
def get_commit_dates(refs, repo_dir=None):
    """Return a list with the author_sec of the commit each entry in
    refs names, in the same order.  For now, every unique string in
    refs must resolve to a different commit or this function will
    fail."""
    return [get_commit_items(ref, cp(repo_dir)).author_sec for ref in refs]
1034
1035
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    'committish' is tried first as a ref name and then, if it is 40
    characters long, as a hex object id looked up in the pack indexes.

    Returns the binary (not hex) hash if it is found, None if
    'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Non-hex input: Python 2 raises TypeError, Python 3 raises
            # binascii.Error, which is a ValueError subclass.
            return None

        if pL.exists(oid):
            return oid

    return None
1061
1062
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname at newval by running "git update-ref".

    newval and oldval are binary ids passed to git in hex; a false
    oldval is passed as an empty string (see git-update-ref(1)).
    refname must be under refs/heads/ or refs/tags/.
    """
    previous = oldval or b''
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    cmd = [b'git', b'update-ref', refname,
           hexlify(newval), hexlify(previous)]
    proc = subprocess.Popen(cmd, env=_gitenv(repo_dir))
    _git_wait(b'git update-ref', proc)
1073
1074
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference with "git update-ref -d" (see
    git update-ref(1)).  A true oldvalue is appended to the command as
    given.  refname must start with refs/."""
    assert refname.startswith(b'refs/')
    cmd = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        cmd.append(oldvalue)
    proc = subprocess.Popen(cmd, env=_gitenv())
    _git_wait('git update-ref', proc)
1082
1083
def guess_repo(path=None):
    """Settle on a value for the global "repodir".

    A true path always wins.  Otherwise an already-set repodir is kept,
    and failing that BUP_DIR from the environment is used, falling back
    to ~/.bup.  This makes bup look for an existing repository without
    failing if there is none; code that actually needs a repository
    would normally call check_repo_or_die() instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1098
1099
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The repository location is settled via guess_repo(path).  Raises
    GitError if the location's parent directory does not exist or if
    the location exists but is not a directory.  After "git --bare
    init", pack.indexVersion is set to 2 and core.logAllRefUpdates to
    true.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv())
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.  Every
    # argv element is bytes, as in the other git invocations here.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
    # Enable the reflog
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
1122
1123
def check_repo_or_die(path=None):
    """Return quietly if a bup repository probably exists, i.e. if
    <repo>/objects/pack is a directory.  Otherwise log an error and
    exit: status 15 when the repository path itself does not exist,
    status 14 when it exists but does not look like a repository."""
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1139
1140
_ver = None
def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        (1, 6, 6, 9)

    Only the leading run of dot-separated numbers is used, so trailing
    vendor text such as ".windows.1", "-rc1" or " (Apple Git-143)" is
    ignored rather than breaking the integer conversion.  The parsed
    version is cached in the module-level _ver.

    Raises GitError if the "git --version" output cannot be parsed or
    if the version is older than 1.5.3.1.
    """
    global _ver
    if not _ver:
        p = subprocess.Popen([b'git', b'--version'], stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        # At least major.minor, numeric components only.
        m = re.match(br'git version (\d+(?:\.\d+)+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(int(x) for x in m.group(1).split(b'.'))
    needed = (1, 5, 3, 1)
    if _ver < needed:
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(str(x) for x in needed),
                          '.'.join(str(x) for x in _ver)))
    return _ver
1166
1167
class _AbortableIter:
    """Iterator wrapper with an abort callback.

    onabort, if given, is called at most once: when the wrapped
    iterator raises something other than StopIteration, when abort()
    is called, or when the wrapper is deleted -- but never after the
    wrapped iterator has been exhausted normally.
    """
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            item = next(self.it)
        except StopIteration:
            # Normal exhaustion: nothing left to abort.
            self.done = True
            raise
        except:
            self.abort()
            raise
        return item

    next = __next__

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if self.done:
            return
        self.done = True
        callback = self.onabort
        if callback:
            callback()

    def __del__(self):
        self.abort()
1198
1199
class CatPipe:
    """Wrapper around a long-running 'git cat-file --batch' process,
    used to retrieve object data."""
    def __init__(self, repo_dir = None):
        self.repo_dir = repo_dir
        if ver() < (1, 5, 6):
            log('error: git version must be at least 1.5.6\n')
            sys.exit(1)
        self.p = None
        self.inprogress = None

    def _abort(self):
        """Close the pipes of the current process, if any, and forget
        both the process and any in-progress request."""
        proc = self.p
        if proc:
            proc.stdout.close()
            proc.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        """Drop any current process and start a new cat-file --batch."""
        self._abort()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  env=_gitenv(self.repo_dir),
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  bufsize = 4096,
                                  close_fds = True)

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        """
        # (Re)start the helper when it was never started or has exited.
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        exit_status = self.p.poll()
        assert(exit_status is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        assert b'\n' not in ref
        assert b'\r' not in ref
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        header = self.p.stdout.readline()
        if header.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        fields = header.split(b' ')
        if len(fields) != 3 or len(fields[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % fields)
        oidx, typ, size = fields
        size = int(size)
        chunks = _AbortableIter(chunkyreader(self.p.stdout, size),
                                onabort=self._abort)
        try:
            yield oidx, typ, size
            for chunk in chunks:
                yield chunk
            # The object data is followed by a single newline.
            trailer = self.p.stdout.readline()
            assert trailer == b'\n'
            self.inprogress = None
        except Exception:
            chunks.abort()
            raise

    def _join(self, it):
        """Yield blob content for the object whose get() stream is it,
        descending through trees and commits."""
        _, typ, _ = next(it)
        if typ == b'blob':
            for chunk in it:
                yield chunk
        elif typ == b'tree':
            tree_data = b''.join(it)
            for (mode, name, sha) in tree_decode(tree_data):
                for chunk in self.join(hexlify(sha)):
                    yield chunk
        elif typ == b'commit':
            # The first line of the commit names its tree.
            first_line = b''.join(it).split(b'\n')[0]
            assert first_line.startswith(b'tree ')
            for chunk in self.join(first_line[5:]):
                yield chunk
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate the content of all blobs that can be reached from
        an object.  The hash given in 'id' must point to a blob, a tree
        or a commit; for trees and commits the content of every blob
        reachable from them is generated.
        """
        for chunk in self._join(self.get(id)):
            yield chunk
1295
1296
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating and caching it on
    first use.  Without a repo_dir, the global repodir (or else repo())
    is used.  Instances are cached by absolute repository path."""
    global _cp, repodir
    key = os.path.abspath(repo_dir or repodir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = CatPipe(key)
        _cp[key] = pipe
    return pipe
1310
1311
def tags(repo_dir = None):
    """Return a dictionary mapping each tagged hash to the list of tag
    names (without the refs/tags/ prefix) that point at it."""
    by_hash = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        # Several tags may point at the same hash.
        by_hash.setdefault(oid, []).append(refname[10:])
    return by_hash
1322
1323
class MissingObject(KeyError):
    """Raised when an object is missing from the repository.

    The binary object id is kept in the oid attribute; its hex form
    appears in the exception message.
    """
    def __init__(self, oid):
        self.oid = oid
        # bytes has no .encode('hex') on Python 3; hexlify works on both.
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
1328
1329
WalkItem = namedtuple('WalkItem',
                      ('oid', 'type', 'mode', 'path', 'chunk_path', 'data'))
# A WalkItem's path is the mangled path.  When an item represents a
# fragment of a chunked file, chunk_path is the chunked subtree path
# for that chunk, e.g. ['', '2d3115e', ...]; the top-level item for a
# chunked file has a chunk_path of [''].  So some chunk subtree of the
# file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1342
1343
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Raise MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    Objects are taken from a LIFO stack, so the entries pushed last
    are visited first.  An object for which stop_at returns true is
    neither yielded nor descended into.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (hex oid, path, chunk_path, mode).
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        # The first item from get_ref is the (oidx, type, size) header;
        # a false oidx there means the object does not exist.
        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full, since their
            # content is needed below to find further objects.
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            # Parents are pushed before the tree, so the tree is popped
            # (and walked) first.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the path stays put
                    # and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Entering a chunked file: start its chunk path.
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))