lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12
  13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  14 from bup.compat import (buffer,
  15                         byte_int, bytes_from_byte, bytes_from_uint,
  16                         environ,
  17                         items,
  18                         range,
  19                         reraise)
  20 from bup.io import path_msg
  21 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  22                          exo,
  23                          fdatasync,
  24                          log,
  25                          merge_dict,
  26                          merge_iter,
  27                          mmap_read, mmap_readwrite,
  28                          progress, qprogress, stat_if_exists,
  29                          unlink,
  30                          utc_offset_str)
  31
  32
  33 verbose = 0
  34 repodir = None  # The default repository, once initialized
  35
  36 _typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
  37 _typermap = {v: k for k, v in items(_typemap)}
  38
  39
  40 _total_searches = 0
  41 _total_steps = 0
  42
  43
  44 class GitError(Exception):
  45     pass
  46
  47
  48 def _gitenv(repo_dir=None):
  49     if not repo_dir:
  50         repo_dir = repo()
  51     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  52
  53 def _git_wait(cmd, p):
  54     rv = p.wait()
  55     if rv != 0:
  56         raise GitError('%r returned %d' % (cmd, rv))
  57
  58 def _git_exo(cmd, **kwargs):
  59     kwargs['check'] = False
  60     result = exo(cmd, **kwargs)
  61     _, _, proc = result
  62     if proc.returncode != 0:
  63         raise GitError('%r returned %d' % (cmd, proc.returncode))
  64     return result
  65
  66 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  67     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  68     cmd = [b'git', b'config', b'--null']
  69     if cfg_file:
  70         cmd.extend([b'--file', cfg_file])
  71     if opttype == 'int':
  72         cmd.extend([b'--int'])
  73     elif opttype == 'bool':
  74         cmd.extend([b'--bool'])
  75     else:
  76         assert opttype is None
  77     cmd.extend([b'--get', option])
  78     env=None
  79     if repo_dir:
  80         env = _gitenv(repo_dir=repo_dir)
  81     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  82                          close_fds=True)
  83     # with --null, git writes out a trailing \0 after the value
  84     r = p.stdout.read()[:-1]
  85     rc = p.wait()
  86     if rc == 0:
  87         if opttype == 'int':
  88             return int(r)
  89         elif opttype == 'bool':
  90             # git converts to 'true' or 'false'
  91             return r == b'true'
  92         return r
  93     if rc != 1:
  94         raise GitError('%r returned %d' % (cmd, rc))
  95     return None
  96
  97
  98 def parse_tz_offset(s):
  99     """UTC offset in seconds."""
 100     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 101     if bytes_from_byte(s[0]) == b'-':
 102         return - tz_off
 103     return tz_off
 104
 105
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# A single byte allowed at either end of a name or mail string.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# A single byte allowed in the interior of a name or mail string.
_content_char = br'[^\0\n<>]'
# Either one or two start/end bytes, or a start/end byte, any number
# of interior bytes, and a closing start/end byte.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# A sign, two digits, then two digits in the range 00-59.
_tz_rx = br'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" line, including its newline.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Matches a whole commit object: the tree line, zero or more parent
# lines, the author and committer lines, an optional mergetag, a
# blank line, and then everything else as the message.
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts the hash from a single parent line; used with findall() on
# the text captured by the 'parents' group above.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 127
 128 # Note that the author_sec and committer_sec values are (UTC) epoch
 129 # seconds, and for now the mergetag is not included.
 130 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 131                                        'author_name', 'author_mail',
 132                                        'author_sec', 'author_offset',
 133                                        'committer_name', 'committer_mail',
 134                                        'committer_sec', 'committer_offset',
 135                                        'message'])
 136
 137 def parse_commit(content):
 138     commit_match = re.match(_commit_rx, content)
 139     if not commit_match:
 140         raise Exception('cannot parse commit %r' % content)
 141     matches = commit_match.groupdict()
 142     return CommitInfo(tree=matches['tree'],
 143                       parents=re.findall(_parent_hash_rx, matches['parents']),
 144                       author_name=matches['author_name'],
 145                       author_mail=matches['author_mail'],
 146                       author_sec=int(matches['asec']),
 147                       author_offset=parse_tz_offset(matches['atz']),
 148                       committer_name=matches['committer_name'],
 149                       committer_mail=matches['committer_mail'],
 150                       committer_sec=int(matches['csec']),
 151                       committer_offset=parse_tz_offset(matches['ctz']),
 152                       message=matches['message'])
 153
 154
 155 def get_cat_data(cat_iterator, expected_type):
 156     _, kind, _ = next(cat_iterator)
 157     if kind != expected_type:
 158         raise Exception('expected %r, saw %r' % (expected_type, kind))
 159     return b''.join(cat_iterator)
 160
 161 def get_commit_items(id, cp):
 162     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 163
 164 def _local_git_date_str(epoch_sec):
 165     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 166
 167
 168 def _git_date_str(epoch_sec, tz_offset_sec):
 169     offs =  tz_offset_sec // 60
 170     return b'%d %s%02d%02d' \
 171         % (epoch_sec,
 172            b'+' if offs >= 0 else b'-',
 173            abs(offs) // 60,
 174            abs(offs) % 60)
 175
 176
 177 def repo(sub = b'', repo_dir=None):
 178     """Get the path to the git repository or one of its subdirectories."""
 179     repo_dir = repo_dir or repodir
 180     if not repo_dir:
 181         raise GitError('You should call check_repo_or_die()')
 182
 183     # If there's a .git subdirectory, then the actual repo is in there.
 184     gd = os.path.join(repo_dir, b'.git')
 185     if os.path.exists(gd):
 186         repo_dir = gd
 187
 188     return os.path.join(repo_dir, sub)
 189
 190
 191 _shorten_hash_rx = \
 192     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 193
 194 def shorten_hash(s):
 195     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 196
 197
 198 def repo_rel(path):
 199     full = os.path.abspath(path)
 200     fullrepo = os.path.abspath(repo(b''))
 201     if not fullrepo.endswith(b'/'):
 202         fullrepo += b'/'
 203     if full.startswith(fullrepo):
 204         path = full[len(fullrepo):]
 205     if path.startswith(b'index-cache/'):
 206         path = path[len(b'index-cache/'):]
 207     return shorten_hash(path)
 208
 209
 210 def auto_midx(objdir):
 211     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 212     try:
 213         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 214     except OSError as e:
 215         # make sure 'args' gets printed to help with debugging
 216         add_error('%r: exception: %s' % (args, e))
 217         raise
 218     if rv:
 219         add_error('%r: returned %d' % (args, rv))
 220
 221     args = [path.exe(), b'bloom', b'--dir', objdir]
 222     try:
 223         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 224     except OSError as e:
 225         # make sure 'args' gets printed to help with debugging
 226         add_error('%r: exception: %s' % (args, e))
 227         raise
 228     if rv:
 229         add_error('%r: returned %d' % (args, rv))
 230
 231
 232 def mangle_name(name, mode, gitmode):
 233     """Mangle a file name to present an abstract name for segmented files.
 234     Mangled file names will have the ".bup" extension added to them. If a
 235     file's name already ends with ".bup", a ".bupl" extension is added to
 236     disambiguate normal files from segmented ones.
 237     """
 238     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 239         assert(stat.S_ISDIR(gitmode))
 240         return name + b'.bup'
 241     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 242         return name + b'.bupl'
 243     else:
 244         return name
 245
 246
 247 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 248 def demangle_name(name, mode):
 249     """Remove name mangling from a file name, if necessary.
 250
 251     The return value is a tuple (demangled_filename,mode), where mode is one of
 252     the following:
 253
 254     * BUP_NORMAL  : files that should be read as-is from the repository
 255     * BUP_CHUNKED : files that were chunked and need to be reassembled
 256
 257     For more information on the name mangling algorithm, see mangle_name()
 258     """
 259     if name.endswith(b'.bupl'):
 260         return (name[:-5], BUP_NORMAL)
 261     elif name.endswith(b'.bup'):
 262         return (name[:-4], BUP_CHUNKED)
 263     elif name.endswith(b'.bupm'):
 264         return (name[:-5],
 265                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 266     return (name, BUP_NORMAL)
 267
 268
 269 def calc_hash(type, content):
 270     """Calculate some content's hash in the Git fashion."""
 271     header = b'%s %d\0' % (type, len(content))
 272     sum = Sha1(header)
 273     sum.update(content)
 274     return sum.digest()
 275
 276
 277 def shalist_item_sort_key(ent):
 278     (mode, name, id) = ent
 279     assert(mode+0 == mode)
 280     if stat.S_ISDIR(mode):
 281         return name + b'/'
 282     else:
 283         return name
 284
 285
 286 def tree_encode(shalist):
 287     """Generate a git tree object from (mode,name,hash) tuples."""
 288     shalist = sorted(shalist, key = shalist_item_sort_key)
 289     l = []
 290     for (mode,name,bin) in shalist:
 291         assert(mode)
 292         assert(mode+0 == mode)
 293         assert(name)
 294         assert(len(bin) == 20)
 295         s = b'%o %s\0%s' % (mode,name,bin)
 296         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 297         l.append(s)
 298     return b''.join(l)
 299
 300
 301 def tree_decode(buf):
 302     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 303     ofs = 0
 304     while ofs < len(buf):
 305         z = buf.find(b'\0', ofs)
 306         assert(z > ofs)
 307         spl = buf[ofs:z].split(b' ', 1)
 308         assert(len(spl) == 2)
 309         mode,name = spl
 310         sha = buf[z+1:z+1+20]
 311         ofs = z+1+20
 312         yield (int(mode, 8), name, sha)
 313
 314
 315 def _encode_packobj(type, content, compression_level=1):
 316     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 317         raise ValueError('invalid compression level %s' % compression_level)
 318     szout = b''
 319     sz = len(content)
 320     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 321     sz >>= 4
 322     while 1:
 323         if sz: szbits |= 0x80
 324         szout += bytes_from_uint(szbits)
 325         if not sz:
 326             break
 327         szbits = sz & 0x7f
 328         sz >>= 7
 329     z = zlib.compressobj(compression_level)
 330     yield szout
 331     yield z.compress(content)
 332     yield z.flush()
 333
 334
 335 def _decode_packobj(buf):
 336     assert(buf)
 337     c = byte_int(buf[0])
 338     type = _typermap[(c & 0x70) >> 4]
 339     sz = c & 0x0f
 340     shift = 4
 341     i = 0
 342     while c & 0x80:
 343         i += 1
 344         c = byte_int(buf[i])
 345         sz |= (c & 0x7f) << shift
 346         shift += 7
 347         if not (c & 0x80):
 348             break
 349     return (type, zlib.decompress(buf[i+1:]))
 350
 351
 352 class PackIdx:
 353     def __init__(self):
 354         assert(0)
 355
 356     def find_offset(self, hash):
 357         """Get the offset of an object inside the index file."""
 358         idx = self._idx_from_hash(hash)
 359         if idx != None:
 360             return self._ofs_from_idx(idx)
 361         return None
 362
 363     def exists(self, hash, want_source=False):
 364         """Return nonempty if the object exists in this index."""
 365         if hash and (self._idx_from_hash(hash) != None):
 366             return want_source and os.path.basename(self.name) or True
 367         return None
 368
 369     def _idx_from_hash(self, hash):
 370         global _total_searches, _total_steps
 371         _total_searches += 1
 372         assert(len(hash) == 20)
 373         b1 = byte_int(hash[0])
 374         start = self.fanout[b1-1] # range -1..254
 375         end = self.fanout[b1] # range 0..255
 376         want = hash
 377         _total_steps += 1  # lookup table is a step
 378         while start < end:
 379             _total_steps += 1
 380             mid = start + (end - start) // 2
 381             v = self._idx_to_hash(mid)
 382             if v < want:
 383                 start = mid+1
 384             elif v > want:
 385                 end = mid
 386             else: # got it!
 387                 return mid
 388         return None
 389
 390
 391 class PackIdxV1(PackIdx):
 392     """Object representation of a Git pack index (version 1) file."""
 393     def __init__(self, filename, f):
 394         self.name = filename
 395         self.idxnames = [self.name]
 396         self.map = mmap_read(f)
 397         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 398         self.fanout = array('L', struct.unpack('!256I', self.map))
 399         self.fanout.append(0)  # entry "-1"
 400         self.nsha = self.fanout[255]
 401         self.sha_ofs = 256 * 4
 402         # Avoid slicing shatable for individual hashes (very high overhead)
 403         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 404
 405     def __enter__(self):
 406         return self
 407
 408     def __exit__(self, type, value, traceback):
 409         self.close()
 410
 411     def __len__(self):
 412         return int(self.nsha)  # int() from long for python 2
 413
 414     def _ofs_from_idx(self, idx):
 415         if idx >= self.nsha or idx < 0:
 416             raise IndexError('invalid pack index index %d' % idx)
 417         ofs = self.sha_ofs + idx * 24
 418         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 419
 420     def _idx_to_hash(self, idx):
 421         if idx >= self.nsha or idx < 0:
 422             raise IndexError('invalid pack index index %d' % idx)
 423         ofs = self.sha_ofs + idx * 24 + 4
 424         return self.map[ofs : ofs + 20]
 425
 426     def __iter__(self):
 427         start = self.sha_ofs + 4
 428         for ofs in range(start, start + 24 * self.nsha, 24):
 429             yield self.map[ofs : ofs + 20]
 430
 431     def close(self):
 432         if self.map is not None:
 433             self.shatable = None
 434             self.map.close()
 435             self.map = None
 436
 437
 438 class PackIdxV2(PackIdx):
 439     """Object representation of a Git pack index (version 2) file."""
 440     def __init__(self, filename, f):
 441         self.name = filename
 442         self.idxnames = [self.name]
 443         self.map = mmap_read(f)
 444         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 445         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 446         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 447         self.fanout.append(0)
 448         self.nsha = self.fanout[255]
 449         self.sha_ofs = 8 + 256*4
 450         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 451         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 452         # Avoid slicing this for individual hashes (very high overhead)
 453         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 454
 455     def __enter__(self):
 456         return self
 457
 458     def __exit__(self, type, value, traceback):
 459         self.close()
 460
 461     def __len__(self):
 462         return int(self.nsha)  # int() from long for python 2
 463
 464     def _ofs_from_idx(self, idx):
 465         if idx >= self.nsha or idx < 0:
 466             raise IndexError('invalid pack index index %d' % idx)
 467         ofs_ofs = self.ofstable_ofs + idx * 4
 468         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 469         if ofs & 0x80000000:
 470             idx64 = ofs & 0x7fffffff
 471             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 472             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 473         return ofs
 474
 475     def _idx_to_hash(self, idx):
 476         if idx >= self.nsha or idx < 0:
 477             raise IndexError('invalid pack index index %d' % idx)
 478         ofs = self.sha_ofs + idx * 20
 479         return self.map[ofs : ofs + 20]
 480
 481     def __iter__(self):
 482         start = self.sha_ofs
 483         for ofs in range(start, start + 20 * self.nsha, 20):
 484             yield self.map[ofs : ofs + 20]
 485
 486     def close(self):
 487         if self.map is not None:
 488             self.shatable = None
 489             self.map.close()
 490             self.map = None
 491
 492
 493 _mpi_count = 0
 494 class PackIdxList:
 495     def __init__(self, dir, ignore_midx=False):
 496         global _mpi_count
 497         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 498         _mpi_count += 1
 499         self.dir = dir
 500         self.also = set()
 501         self.packs = []
 502         self.do_bloom = False
 503         self.bloom = None
 504         self.ignore_midx = ignore_midx
 505         self.refresh()
 506
 507     def __del__(self):
 508         global _mpi_count
 509         _mpi_count -= 1
 510         assert(_mpi_count == 0)
 511
 512     def __iter__(self):
 513         return iter(idxmerge(self.packs))
 514
 515     def __len__(self):
 516         return sum(len(pack) for pack in self.packs)
 517
 518     def exists(self, hash, want_source=False):
 519         """Return nonempty if the object exists in the index files."""
 520         global _total_searches
 521         _total_searches += 1
 522         if hash in self.also:
 523             return True
 524         if self.do_bloom and self.bloom:
 525             if self.bloom.exists(hash):
 526                 self.do_bloom = False
 527             else:
 528                 _total_searches -= 1  # was counted by bloom
 529                 return None
 530         for i in range(len(self.packs)):
 531             p = self.packs[i]
 532             _total_searches -= 1  # will be incremented by sub-pack
 533             ix = p.exists(hash, want_source=want_source)
 534             if ix:
 535                 # reorder so most recently used packs are searched first
 536                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 537                 return ix
 538         self.do_bloom = True
 539         return None
 540
 541     def refresh(self, skip_midx = False):
 542         """Refresh the index list.
 543         This method verifies if .midx files were superseded (e.g. all of its
 544         contents are in another, bigger .midx file) and removes the superseded
 545         files.
 546
 547         If skip_midx is True, all work on .midx files will be skipped and .midx
 548         files will be removed from the list.
 549
 550         The instance variable 'ignore_midx' can force this function to
 551         always act as if skip_midx was True.
 552         """
 553         if self.bloom is not None:
 554             self.bloom.close()
 555         self.bloom = None # Always reopen the bloom as it may have been relaced
 556         self.do_bloom = False
 557         skip_midx = skip_midx or self.ignore_midx
 558         d = dict((p.name, p) for p in self.packs
 559                  if not skip_midx or not isinstance(p, midx.PackMidx))
 560         if os.path.exists(self.dir):
 561             if not skip_midx:
 562                 midxl = []
 563                 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
 564                 # remove any *.midx files from our list that no longer exist
 565                 for ix in list(d.values()):
 566                     if not isinstance(ix, midx.PackMidx):
 567                         continue
 568                     if ix.name in midxes:
 569                         continue
 570                     # remove the midx
 571                     del d[ix.name]
 572                     ix.close()
 573                     self.packs.remove(ix)
 574                 for ix in self.packs:
 575                     if isinstance(ix, midx.PackMidx):
 576                         for name in ix.idxnames:
 577                             d[os.path.join(self.dir, name)] = ix
 578                 for full in midxes:
 579                     if not d.get(full):
 580                         mx = midx.PackMidx(full)
 581                         (mxd, mxf) = os.path.split(mx.name)
 582                         broken = False
 583                         for n in mx.idxnames:
 584                             if not os.path.exists(os.path.join(mxd, n)):
 585                                 log(('warning: index %s missing\n'
 586                                      '  used by %s\n')
 587                                     % (path_msg(n), path_msg(mxf)))
 588                                 broken = True
 589                         if broken:
 590                             mx.close()
 591                             del mx
 592                             unlink(full)
 593                         else:
 594                             midxl.append(mx)
 595                 midxl.sort(key=lambda ix:
 596                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 597                 for ix in midxl:
 598                     any_needed = False
 599                     for sub in ix.idxnames:
 600                         found = d.get(os.path.join(self.dir, sub))
 601                         if not found or isinstance(found, PackIdx):
 602                             # doesn't exist, or exists but not in a midx
 603                             any_needed = True
 604                             break
 605                     if any_needed:
 606                         d[ix.name] = ix
 607                         for name in ix.idxnames:
 608                             d[os.path.join(self.dir, name)] = ix
 609                     elif not ix.force_keep:
 610                         debug1('midx: removing redundant: %s\n'
 611                                % path_msg(os.path.basename(ix.name)))
 612                         ix.close()
 613                         unlink(ix.name)
 614             for full in glob.glob(os.path.join(self.dir, b'*.idx')):
 615                 if not d.get(full):
 616                     try:
 617                         ix = open_idx(full)
 618                     except GitError as e:
 619                         add_error(e)
 620                         continue
 621                     d[full] = ix
 622             bfull = os.path.join(self.dir, b'bup.bloom')
 623             if self.bloom is None and os.path.exists(bfull):
 624                 self.bloom = bloom.ShaBloom(bfull)
 625             self.packs = list(set(d.values()))
 626             self.packs.sort(reverse=True, key=lambda x: len(x))
 627             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 628                 self.do_bloom = True
 629             else:
 630                 self.bloom = None
 631         debug1('PackIdxList: using %d index%s.\n'
 632             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 633
 634     def add(self, hash):
 635         """Insert an additional object in the list."""
 636         self.also.add(hash)
 637
 638
 639 def open_idx(filename):
 640     if filename.endswith(b'.idx'):
 641         f = open(filename, 'rb')
 642         header = f.read(8)
 643         if header[0:4] == b'\377tOc':
 644             version = struct.unpack('!I', header[4:8])[0]
 645             if version == 2:
 646                 return PackIdxV2(filename, f)
 647             else:
 648                 raise GitError('%s: expected idx file version 2, got %d'
 649                                % (path_msg(filename), version))
 650         elif len(header) == 8 and header[0:4] < b'\377tOc':
 651             return PackIdxV1(filename, f)
 652         else:
 653             raise GitError('%s: unrecognized idx file header'
 654                            % path_msg(filename))
 655     elif filename.endswith(b'.midx'):
 656         return midx.PackMidx(filename)
 657     else:
 658         raise GitError('idx filenames must end with .idx or .midx')
 659
 660
 661 def idxmerge(idxlist, final_progress=True):
 662     """Generate a list of all the objects reachable in a PackIdxList."""
 663     def pfunc(count, total):
 664         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 665                   % (count*100.0/total, count, total))
 666     def pfinal(count, total):
 667         if final_progress:
 668             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 669                      % (100, total, total))
 670     return merge_iter(idxlist, 10024, pfunc, pfinal)
 671
 672
 673 def create_commit_blob(tree, parent,
 674                        author, adate_sec, adate_tz,
 675                        committer, cdate_sec, cdate_tz,
 676                        msg):
 677     if adate_tz is not None:
 678         adate_str = _git_date_str(adate_sec, adate_tz)
 679     else:
 680         adate_str = _local_git_date_str(adate_sec)
 681     if cdate_tz is not None:
 682         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 683     else:
 684         cdate_str = _local_git_date_str(cdate_sec)
 685     l = []
 686     if tree: l.append(b'tree %s' % hexlify(tree))
 687     if parent: l.append(b'parent %s' % hexlify(parent))
 688     if author: l.append(b'author %s %s' % (author, adate_str))
 689     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 690     l.append(b'')
 691     l.append(msg)
 692     return b'\n'.join(l)
 693
 694
 695 def _make_objcache():
 696     return PackIdxList(repo(b'objects/pack'))
 697
 698 # bup-gc assumes that it can disable all PackWriter activities
 699 # (bloom/midx/cache) via the constructor and close() arguments.
 700
 701 class PackWriter:
 702     """Writes Git objects inside a pack file."""
 703     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 704                  run_midx=True, on_pack_finish=None,
 705                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 706         self.repo_dir = repo_dir or repo()
 707         self.file = None
 708         self.parentfd = None
 709         self.count = 0
 710         self.outbytes = 0
 711         self.filename = None
 712         self.idx = None
 713         self.objcache_maker = objcache_maker
 714         self.objcache = None
 715         self.compression_level = compression_level
 716         self.run_midx=run_midx
 717         self.on_pack_finish = on_pack_finish
 718         if not max_pack_size:
 719             max_pack_size = git_config_get(b'pack.packSizeLimit',
 720                                            repo_dir=self.repo_dir,
 721                                            opttype='int')
 722             if not max_pack_size:
 723                 # larger packs slow down pruning
 724                 max_pack_size = 1000 * 1000 * 1000
 725         self.max_pack_size = max_pack_size
 726         # cache memory usage is about 83 bytes per object
 727         self.max_pack_objects = max_pack_objects if max_pack_objects \
 728                                 else max(1, self.max_pack_size // 5000)
 729
 730     def __del__(self):
 731         self.close()
 732
 733     def __enter__(self):
 734         return self
 735
 736     def __exit__(self, type, value, traceback):
 737         self.close()
 738
 739     def _open(self):
 740         if not self.file:
 741             objdir = dir = os.path.join(self.repo_dir, b'objects')
 742             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 743             try:
 744                 self.file = os.fdopen(fd, 'w+b')
 745             except:
 746                 os.close(fd)
 747                 raise
 748             try:
 749                 self.parentfd = os.open(objdir, os.O_RDONLY)
 750             except:
 751                 f = self.file
 752                 self.file = None
 753                 f.close()
 754                 raise
 755             assert name.endswith(b'.pack')
 756             self.filename = name[:-5]
 757             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 758             self.idx = PackIdxV2Writer()
 759
 760     def _raw_write(self, datalist, sha):
 761         self._open()
 762         f = self.file
 763         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 764         # the file never has a *partial* blob.  So let's make sure it's
 765         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 766         # to our hashsplit algorithm.)  f.write() does its own buffering,
 767         # but that's okay because we'll flush it in _end().
 768         oneblob = b''.join(datalist)
 769         try:
 770             f.write(oneblob)
 771         except IOError as e:
 772             reraise(GitError(e))
 773         nw = len(oneblob)
 774         crc = zlib.crc32(oneblob) & 0xffffffff
 775         self._update_idx(sha, crc, nw)
 776         self.outbytes += nw
 777         self.count += 1
 778         return nw, crc
 779
 780     def _update_idx(self, sha, crc, size):
 781         assert(sha)
 782         if self.idx:
 783             self.idx.add(sha, crc, self.file.tell() - size)
 784
 785     def _write(self, sha, type, content):
 786         if verbose:
 787             log('>')
 788         if not sha:
 789             sha = calc_hash(type, content)
 790         size, crc = self._raw_write(_encode_packobj(type, content,
 791                                                     self.compression_level),
 792                                     sha=sha)
 793         if self.outbytes >= self.max_pack_size \
 794            or self.count >= self.max_pack_objects:
 795             self.breakpoint()
 796         return sha
 797
 798     def breakpoint(self):
 799         """Clear byte and object counts and return the last processed id."""
 800         id = self._end(self.run_midx)
 801         self.outbytes = self.count = 0
 802         return id
 803
 804     def _require_objcache(self):
 805         if self.objcache is None and self.objcache_maker:
 806             self.objcache = self.objcache_maker()
 807         if self.objcache is None:
 808             raise GitError(
 809                     "PackWriter not opened or can't check exists w/o objcache")
 810
 811     def exists(self, id, want_source=False):
 812         """Return non-empty if an object is found in the object cache."""
 813         self._require_objcache()
 814         return self.objcache.exists(id, want_source=want_source)
 815
 816     def just_write(self, sha, type, content):
 817         """Write an object to the pack file without checking for duplication."""
 818         self._write(sha, type, content)
 819         # If nothing else, gc doesn't have/want an objcache
 820         if self.objcache is not None:
 821             self.objcache.add(sha)
 822
 823     def maybe_write(self, type, content):
 824         """Write an object to the pack file if not present and return its id."""
 825         sha = calc_hash(type, content)
 826         if not self.exists(sha):
 827             self._require_objcache()
 828             self.just_write(sha, type, content)
 829         return sha
 830
 831     def new_blob(self, blob):
 832         """Create a blob object in the pack with the supplied content."""
 833         return self.maybe_write(b'blob', blob)
 834
 835     def new_tree(self, shalist):
 836         """Create a tree object in the pack."""
 837         content = tree_encode(shalist)
 838         return self.maybe_write(b'tree', content)
 839
 840     def new_commit(self, tree, parent,
 841                    author, adate_sec, adate_tz,
 842                    committer, cdate_sec, cdate_tz,
 843                    msg):
 844         """Create a commit object in the pack.  The date_sec values must be
 845         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 846         content = create_commit_blob(tree, parent,
 847                                      author, adate_sec, adate_tz,
 848                                      committer, cdate_sec, cdate_tz,
 849                                      msg)
 850         return self.maybe_write(b'commit', content)
 851
 852     def abort(self):
 853         """Remove the pack file from disk."""
 854         f = self.file
 855         if f:
 856             pfd = self.parentfd
 857             self.file = None
 858             self.parentfd = None
 859             self.idx = None
 860             try:
 861                 try:
 862                     os.unlink(self.filename + b'.pack')
 863                 finally:
 864                     f.close()
 865             finally:
 866                 if pfd is not None:
 867                     os.close(pfd)
 868
    def _end(self, run_midx=True):
        """Finish the pack being written and move it into the repository.

        Patches the object count into the pack header, appends the
        SHA-1 of the pack's content, syncs and closes the file, writes
        the matching .idx, and renames both files to
        objects/pack/pack-<hex sha1> under self.repo_dir.  Then runs
        auto_midx() if run_midx is true and calls on_pack_finish, if
        set, with the new name prefix.

        Returns that name prefix (the path without the .pack/.idx
        suffix), or None when no pack file is open.
        """
        f = self.file
        if not f: return None
        self.file = None
        try:
            # Drop the cache and detach the index writer before doing
            # any I/O, so they're cleared even if something below raises.
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count (the 4 bytes following the 8-byte
            # b'PACK' + version prefix of the header)
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum over everything written so far
            # and append it as the trailer
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' +  hexlify(packbin))
        # Remove any .map file left next to the temporary pack
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        # fsync the parent directory descriptor after the renames, and
        # close it whether or not the fsync succeeds
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix
 914
 915     def close(self, run_midx=True):
 916         """Close the pack file and move it to its definitive path."""
 917         return self._end(run_midx=run_midx)
 918
 919
 920 class PackIdxV2Writer:
 921     def __init__(self):
 922         self.idx = list(list() for i in range(256))
 923         self.count = 0
 924
 925     def add(self, sha, crc, offs):
 926         assert(sha)
 927         self.count += 1
 928         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 929
 930     def write(self, filename, packbin):
 931         ofs64_count = 0
 932         for section in self.idx:
 933             for entry in section:
 934                 if entry[2] >= 2**31:
 935                     ofs64_count += 1
 936
 937         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 938         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 939         idx_map = None
 940         idx_f = open(filename, 'w+b')
 941         try:
 942             idx_f.truncate(index_len)
 943             fdatasync(idx_f.fileno())
 944             idx_map = mmap_readwrite(idx_f, close=False)
 945             try:
 946                 count = _helpers.write_idx(filename, idx_map, self.idx,
 947                                            self.count)
 948                 assert(count == self.count)
 949                 idx_map.flush()
 950             finally:
 951                 idx_map.close()
 952         finally:
 953             idx_f.close()
 954
 955         idx_f = open(filename, 'a+b')
 956         try:
 957             idx_f.write(packbin)
 958             idx_f.seek(0)
 959             idx_sum = Sha1()
 960             b = idx_f.read(8 + 4*256)
 961             idx_sum.update(b)
 962
 963             for b in chunkyreader(idx_f, 20 * self.count):
 964                 idx_sum.update(b)
 965
 966             for b in chunkyreader(idx_f):
 967                 idx_sum.update(b)
 968             idx_f.write(idx_sum.digest())
 969             fdatasync(idx_f.fileno())
 970         finally:
 971             idx_f.close()
 972
 973
 974 def list_refs(patterns=None, repo_dir=None,
 975               limit_to_heads=False, limit_to_tags=False):
 976     """Yield (refname, hash) tuples for all repository refs unless
 977     patterns are specified.  In that case, only include tuples for
 978     refs matching those patterns (cf. git-show-ref(1)).  The limits
 979     restrict the result items to refs/heads or refs/tags.  If both
 980     limits are specified, items from both sources will be included.
 981
 982     """
 983     argv = [b'git', b'show-ref']
 984     if limit_to_heads:
 985         argv.append(b'--heads')
 986     if limit_to_tags:
 987         argv.append(b'--tags')
 988     argv.append(b'--')
 989     if patterns:
 990         argv.extend(patterns)
 991     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
 992                          close_fds=True)
 993     out = p.stdout.read().strip()
 994     rv = p.wait()  # not fatal
 995     if rv:
 996         assert(not out)
 997     if out:
 998         for d in out.split(b'\n'):
 999             sha, name = d.split(b' ', 1)
1000             yield name, unhexlify(sha)
1001
1002
def read_ref(refname, repo_dir = None):
    """Return the binary hash that the head matching refname points to,
    or None if list_refs() reports no match.  More than one match is
    an assertion failure."""
    found = list_refs(patterns=[refname], repo_dir=repo_dir,
                      limit_to_heads=True)
    # Two items are enough to tell a unique match from an ambiguous one.
    matches = tuple(islice(found, 2))
    if not matches:
        return None
    assert(len(matches) == 1)
    _, oid = matches[0]
    return oid
1012
1013
def rev_list_invocation(ref_or_refs, format=None):
    """Return the argv for running "git rev-list" on the given ref(s).

    ref_or_refs is either a single bytes ref or an iterable of them,
    and no ref may start with b'-'.  A format, if given, is passed
    along as --pretty=format:<format>.
    """
    refs = (ref_or_refs,) if isinstance(ref_or_refs, bytes) else ref_or_refs
    argv = [b'git', b'rev-list']
    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        # A leading dash would be taken as an option
        assert not ref.startswith(b'-')
        argv.append(ref)
    argv.append(b'--')
    return argv
1028
1029
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".

    Without a format, yield one hex hash at a time.  With a format
    (parse and format must be given together), pass it to rev-list and
    yield (oidx, parse(git_stdout)) for each commit, where parse is
    called with the stream positioned just after the rev-list "commit
    HASH" header line.  Raise GitError once the output is exhausted if
    git exited with a nonzero status.

    """
    assert bool(parse) == bool(format)
    argv = rev_list_invocation(ref_or_refs, format=format)
    p = subprocess.Popen(argv,
                         env=_gitenv(repo_dir),
                         stdout = subprocess.PIPE,
                         close_fds=True)
    git_out = p.stdout
    if format:
        # parse() consumes the body of each commit, so every line read
        # here must be a "commit HASH" header.
        for line in iter(git_out.readline, b''):
            header = line.strip()
            if not header.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(header))
            oidx = header[7:]
            assert len(oidx) == 40
            yield oidx, parse(git_out)
    else:
        for line in git_out:
            yield line.strip()

    rv = p.wait()
    if rv:
        raise GitError('git rev-list returned error %d' % rv)
1062
1063
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    'committish' is first looked up as a head via read_ref(); failing
    that, a 40-character value is treated as a hex object id and
    checked against the pack indexes.

    Returns the binary value of the hash if it is found, None if
    'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    if len(committish) != 40:
        return None

    try:
        oid = unhexlify(committish)
    except (TypeError, ValueError):
        # Python 2 raises TypeError for non-hex input; Python 3 raises
        # binascii.Error, which is a ValueError.
        return None

    # Only open the pack indexes once there's an id to look for.
    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))
    if pL.exists(oid):
        return oid

    return None
1089
1090
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname at newval by running "git update-ref".

    refname must be under refs/heads/ or refs/tags/.  newval and
    oldval are hexlified before being handed to git, and a false
    oldval is passed as an empty argument.  Raises GitError if git
    exits with a nonzero status.
    """
    expected = oldval if oldval else b''
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    argv = [b'git', b'update-ref', refname,
            hexlify(newval), hexlify(expected)]
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', p)
1102
1103
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1)).

    refname must start with refs/.  A true oldvalue is passed along as
    the final update-ref argument.  Raises GitError if git exits with
    a nonzero status.
    """
    assert refname.startswith(b'refs/')
    argv = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    p = subprocess.Popen(argv, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', p)
1112
1113
def guess_repo(path=None):
    """Choose the default repository and store it in the global "repodir".

    A true path always wins.  Otherwise an already chosen repodir is
    kept, and failing that the BUP_DIR environment variable or, as a
    last resort, ~/.bup is used.  Nothing here checks that the
    repository exists; see check_repo_or_die() for that.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1128
1129
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The repository location is chosen by guess_repo(path).  Raises
    GitError if the location's parent directory doesn't exist, if the
    location exists but isn't a directory, or if any of the git
    commands fails.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    settings = (
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        (b'pack.indexVersion', b'2'),
        # Enable the reflog
        (b'core.logAllRefUpdates', b'true'),
    )
    # All argv elements are bytes, like every other git invocation here.
    for key, value in settings:
        p = subprocess.Popen([b'git', b'config', key, value],
                             stdout=sys.stderr, env=_gitenv(),
                             close_fds=True)
        _git_wait('git config', p)
1153
1154
def check_repo_or_die(path=None):
    """Return if a bup repository probably exists; otherwise log an
    error and exit.

    The repository counts as present when its objects/pack is a
    directory.  Exits with status 15 if the repository path itself is
    missing, and with status 14 in every other case.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1170
1171
1172 def is_suitable_git(ver_str):
1173     if not ver_str.startswith(b'git version '):
1174         return 'unrecognized'
1175     ver_str = ver_str[len(b'git version '):]
1176     if ver_str.startswith(b'0.'):
1177         return 'insufficient'
1178     if ver_str.startswith(b'1.'):
1179         if re.match(br'1\.[012345]rc', ver_str):
1180             return 'insufficient'
1181         if re.match(br'1\.[01234]\.', ver_str):
1182             return 'insufficient'
1183         if re.match(br'1\.5\.[012345]($|\.)', ver_str):
1184             return 'insufficient'
1185         if re.match(br'1\.5\.6-rc', ver_str):
1186             return 'insufficient'
1187         return 'suitable'
1188     if re.match(br'[0-9]+(\.|$)?', ver_str):
1189         return 'suitable'
1190     sys.exit(13)
1191
_git_great = None

def require_suitable_git(ver_str=None):
    """Make sure the available git is new enough, checking only once.

    Rely on ver_str when provided, rather than invoking the git in the
    path.  The check is skipped when BUP_GIT_VERSION_IS_FINE is set to
    yes, true, or 1 (in any case).  Raises GitError if the version
    string isn't recognized, and logs an error and exits with status 1
    if the version is too old.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1220
1221
1222 class _AbortableIter:
1223     def __init__(self, it, onabort = None):
1224         self.it = it
1225         self.onabort = onabort
1226         self.done = None
1227
1228     def __iter__(self):
1229         return self
1230
1231     def __next__(self):
1232         try:
1233             return next(self.it)
1234         except StopIteration as e:
1235             self.done = True
1236             raise
1237         except:
1238             self.abort()
1239             raise
1240
1241     next = __next__
1242
1243     def abort(self):
1244         """Abort iteration and call the abortion callback, if needed."""
1245         if not self.done:
1246             self.done = True
1247             if self.onabort:
1248                 self.onabort()
1249
1250     def __del__(self):
1251         self.abort()
1252
1253
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self, repo_dir = None):
        """Remember repo_dir; no cat-file process is started until the
        first get()."""
        require_suitable_git()
        self.repo_dir = repo_dir
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the cat-file process, if any, and forget it.

        When wait is true and a process was running, wait for it and
        return its exit status.  Return None otherwise, including when
        there was no process to wait for.
        """
        p = self.p
        if p:
            p.stdout.close()
            p.stdin.close()
        self.p = None
        self.inprogress = None
        # There may be nothing to wait for: the pipe might never have
        # been used, or might already have been closed by an abort.
        if wait and p:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Close any current process and start a fresh
        'git cat-file --batch' for self.repo_dir."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        ref must not contain a newline or carriage return, nor start
        with b'-'.  Only one get() may be in progress at a time.  If
        reading the data fails or is abandoned partway, the cat-file
        process is closed and will be restarted by the next get().
        Raises GitError on an unexpected EOF or a malformed header.

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if not hdr:
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        # If the data isn't read to the end, the stream is out of sync
        # with the protocol, so aborting closes the whole process.
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self.close)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object's data is followed by a single newline
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _join(self, it):
        """Yield the blob data reachable from the object that 'it' (an
        iterator as returned by get()) describes, recursing through
        trees and following a commit to its tree."""
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of a commit names its tree
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1354
1355
_cp = {}

def cp(repo_dir=None):
    """Return the cached CatPipe for repo_dir, creating it on first use.

    Without a repo_dir, the default repository (repodir or repo()) is
    used.  Pipes are cached by the absolute path of the repository.
    """
    global _cp, repodir
    if not repo_dir:
        repo_dir = repodir or repo()
    key = os.path.abspath(repo_dir)
    pipe = _cp.get(key)
    if not pipe:
        pipe = CatPipe(key)
        _cp[key] = pipe
    return pipe
1369
1370
def close_catpipes():
    """Remove every CatPipe from the cache, closing each one and
    waiting for its process."""
    # FIXME: chain exceptions
    while _cp:
        pipe = _cp.popitem()[1]
        pipe.close(wait=True)
1376
1377
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    The tag names have their refs/tags/ prefix removed.
    """
    by_hash = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        # more than one tag can point at the same hash
        by_hash.setdefault(oid, []).append(refname[10:])
    return by_hash
1388
1389
class MissingObject(KeyError):
    """KeyError for an object id that could not be found; the id is
    kept in the oid attribute."""
    def __init__(self, oid):
        self.oid = oid
        message = 'object %r is missing' % hexlify(oid)
        super(MissingObject, self).__init__(message)
1394
1395
# One item produced by walk_object().
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. [b'', b'2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [b''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this (the path elements
# are presumably bytes too, as they come from tree_decode -- confirm):
#
#   item.path = [b'foo', b'bar', b'baz.bup']
#   item.chunk_path = [b'', b'2d3115e', b'016b097']
#   item.type = b'tree'
#   ...
1408
1409
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    Note that when include_data is not set, a tree entry whose mode
    says it is a regular file is yielded as a blob without asking
    get_ref for it, so a missing object of that kind is not detected.
    An object type other than blob, commit, or tree raises Exception.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (hex oid, path, chunk_path, mode); the starting
    # object has no mode.
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        # The first item is the (oidx, type, size) header; a false oidx
        # means the object wasn't found.
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full, since their
            # content is needed below to find what they refer to.
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            # Parents inherit the commit's path, chunk_path, and mode;
            # the commit's tree is queued with the tree mode.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the path stays put
                    # and only the chunk_path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))