lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12 from numbers import Integral
  13
  14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  15 from bup.compat import (buffer,
  16                         byte_int, bytes_from_byte, bytes_from_uint,
  17                         environ,
  18                         items,
  19                         range,
  20                         reraise)
  21 from bup.io import path_msg
  22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  23                          exo,
  24                          fdatasync,
  25                          hostname, localtime, log,
  26                          merge_dict,
  27                          merge_iter,
  28                          mmap_read, mmap_readwrite,
  29                          parse_num,
  30                          progress, qprogress, stat_if_exists,
  31                          unlink,
  32                          utc_offset_str)
  33 from bup.pwdgrp import username, userfullname
  34
  35
# Module-wide verbosity level; when non-zero, PackWriter._write() logs a
# '>' for every object it writes.
verbose = 0
repodir = None  # The default repository, once initialized

# Numeric object type codes, keyed by git type name.  _encode_packobj()
# stores the code in bits 4-6 of the first pack object header byte.
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
# Reverse mapping (type code -> type name), used by _decode_packobj().
_typermap = {v: k for k, v in items(_typemap)}


# Running counters maintained by the index lookup code
# (PackIdx._idx_from_hash() and PackIdxList.exists()).
_total_searches = 0
_total_steps = 0
  45
  46
  47 class GitError(Exception):
  48     pass
  49
  50
  51 def _gitenv(repo_dir=None):
  52     if not repo_dir:
  53         repo_dir = repo()
  54     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  55
  56 def _git_wait(cmd, p):
  57     rv = p.wait()
  58     if rv != 0:
  59         raise GitError('%r returned %d' % (cmd, rv))
  60
  61 def _git_exo(cmd, **kwargs):
  62     kwargs['check'] = False
  63     result = exo(cmd, **kwargs)
  64     _, _, proc = result
  65     if proc.returncode != 0:
  66         raise GitError('%r returned %d' % (cmd, proc.returncode))
  67     return result
  68
  69 def git_config_get(option, repo_dir=None):
  70     cmd = (b'git', b'config', b'--get', option)
  71     p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
  72                          env=_gitenv(repo_dir=repo_dir),
  73                          close_fds=True)
  74     r = p.stdout.read()
  75     rc = p.wait()
  76     if rc == 0:
  77         return r
  78     if rc != 1:
  79         raise GitError('%r returned %d' % (cmd, rc))
  80     return None
  81
  82
  83 def parse_tz_offset(s):
  84     """UTC offset in seconds."""
  85     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  86     if bytes_from_byte(s[0]) == b'-':
  87         return - tz_off
  88     return tz_off
  89
  90
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
# Character accepted at the start or end of a name/mail field.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# Character accepted in the interior of a name/mail field.
_content_char = br'[^\0\n<>]'
# A name/mail field: either one or two start/end characters, or a longer
# string whose first and last characters are start/end characters.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone field: a sign, two digits, then two more digits limited to 00-59.
_tz_rx = br'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Whole-commit pattern: the tree line, zero or more parent lines, the
# author and committer lines, an optional mergetag, a blank line, and then
# the message (everything that remains).  Used by parse_commit().
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts each 40-digit hash from the 'parents' group matched by _commit_rx.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 112
# Parsed form of a commit object, as produced by parse_commit().
# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
# The *_offset fields are UTC offsets in seconds (see parse_tz_offset()),
# and parents is the list of parent hashes found in the commit header.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
 121
 122 def parse_commit(content):
 123     commit_match = re.match(_commit_rx, content)
 124     if not commit_match:
 125         raise Exception('cannot parse commit %r' % content)
 126     matches = commit_match.groupdict()
 127     return CommitInfo(tree=matches['tree'],
 128                       parents=re.findall(_parent_hash_rx, matches['parents']),
 129                       author_name=matches['author_name'],
 130                       author_mail=matches['author_mail'],
 131                       author_sec=int(matches['asec']),
 132                       author_offset=parse_tz_offset(matches['atz']),
 133                       committer_name=matches['committer_name'],
 134                       committer_mail=matches['committer_mail'],
 135                       committer_sec=int(matches['csec']),
 136                       committer_offset=parse_tz_offset(matches['ctz']),
 137                       message=matches['message'])
 138
 139
 140 def get_cat_data(cat_iterator, expected_type):
 141     _, kind, _ = next(cat_iterator)
 142     if kind != expected_type:
 143         raise Exception('expected %r, saw %r' % (expected_type, kind))
 144     return b''.join(cat_iterator)
 145
 146 def get_commit_items(id, cp):
 147     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 148
 149 def _local_git_date_str(epoch_sec):
 150     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 151
 152
 153 def _git_date_str(epoch_sec, tz_offset_sec):
 154     offs =  tz_offset_sec // 60
 155     return b'%d %s%02d%02d' \
 156         % (epoch_sec,
 157            b'+' if offs >= 0 else b'-',
 158            abs(offs) // 60,
 159            abs(offs) % 60)
 160
 161
 162 def repo(sub = b'', repo_dir=None):
 163     """Get the path to the git repository or one of its subdirectories."""
 164     repo_dir = repo_dir or repodir
 165     if not repo_dir:
 166         raise GitError('You should call check_repo_or_die()')
 167
 168     # If there's a .git subdirectory, then the actual repo is in there.
 169     gd = os.path.join(repo_dir, b'.git')
 170     if os.path.exists(gd):
 171         repo_dir = gd
 172
 173     return os.path.join(repo_dir, sub)
 174
 175
 176 _shorten_hash_rx = \
 177     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 178
 179 def shorten_hash(s):
 180     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 181
 182
 183 def repo_rel(path):
 184     full = os.path.abspath(path)
 185     fullrepo = os.path.abspath(repo(b''))
 186     if not fullrepo.endswith(b'/'):
 187         fullrepo += b'/'
 188     if full.startswith(fullrepo):
 189         path = full[len(fullrepo):]
 190     if path.startswith(b'index-cache/'):
 191         path = path[len(b'index-cache/'):]
 192     return shorten_hash(path)
 193
 194
 195 def auto_midx(objdir):
 196     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 197     try:
 198         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 199     except OSError as e:
 200         # make sure 'args' gets printed to help with debugging
 201         add_error('%r: exception: %s' % (args, e))
 202         raise
 203     if rv:
 204         add_error('%r: returned %d' % (args, rv))
 205
 206     args = [path.exe(), b'bloom', b'--dir', objdir]
 207     try:
 208         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 209     except OSError as e:
 210         # make sure 'args' gets printed to help with debugging
 211         add_error('%r: exception: %s' % (args, e))
 212         raise
 213     if rv:
 214         add_error('%r: returned %d' % (args, rv))
 215
 216
 217 def mangle_name(name, mode, gitmode):
 218     """Mangle a file name to present an abstract name for segmented files.
 219     Mangled file names will have the ".bup" extension added to them. If a
 220     file's name already ends with ".bup", a ".bupl" extension is added to
 221     disambiguate normal files from segmented ones.
 222     """
 223     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 224         assert(stat.S_ISDIR(gitmode))
 225         return name + b'.bup'
 226     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 227         return name + b'.bupl'
 228     else:
 229         return name
 230
 231
 232 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 233 def demangle_name(name, mode):
 234     """Remove name mangling from a file name, if necessary.
 235
 236     The return value is a tuple (demangled_filename,mode), where mode is one of
 237     the following:
 238
 239     * BUP_NORMAL  : files that should be read as-is from the repository
 240     * BUP_CHUNKED : files that were chunked and need to be reassembled
 241
 242     For more information on the name mangling algorithm, see mangle_name()
 243     """
 244     if name.endswith(b'.bupl'):
 245         return (name[:-5], BUP_NORMAL)
 246     elif name.endswith(b'.bup'):
 247         return (name[:-4], BUP_CHUNKED)
 248     elif name.endswith(b'.bupm'):
 249         return (name[:-5],
 250                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 251     else:
 252         return (name, BUP_NORMAL)
 253
 254
 255 def calc_hash(type, content):
 256     """Calculate some content's hash in the Git fashion."""
 257     header = b'%s %d\0' % (type, len(content))
 258     sum = Sha1(header)
 259     sum.update(content)
 260     return sum.digest()
 261
 262
 263 def shalist_item_sort_key(ent):
 264     (mode, name, id) = ent
 265     assert(mode+0 == mode)
 266     if stat.S_ISDIR(mode):
 267         return name + b'/'
 268     else:
 269         return name
 270
 271
 272 def tree_encode(shalist):
 273     """Generate a git tree object from (mode,name,hash) tuples."""
 274     shalist = sorted(shalist, key = shalist_item_sort_key)
 275     l = []
 276     for (mode,name,bin) in shalist:
 277         assert(mode)
 278         assert(mode+0 == mode)
 279         assert(name)
 280         assert(len(bin) == 20)
 281         s = b'%o %s\0%s' % (mode,name,bin)
 282         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 283         l.append(s)
 284     return b''.join(l)
 285
 286
 287 def tree_decode(buf):
 288     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 289     ofs = 0
 290     while ofs < len(buf):
 291         z = buf.find(b'\0', ofs)
 292         assert(z > ofs)
 293         spl = buf[ofs:z].split(b' ', 1)
 294         assert(len(spl) == 2)
 295         mode,name = spl
 296         sha = buf[z+1:z+1+20]
 297         ofs = z+1+20
 298         yield (int(mode, 8), name, sha)
 299
 300
 301 def _encode_packobj(type, content, compression_level=1):
 302     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 303         raise ValueError('invalid compression level %s' % compression_level)
 304     szout = b''
 305     sz = len(content)
 306     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 307     sz >>= 4
 308     while 1:
 309         if sz: szbits |= 0x80
 310         szout += bytes_from_uint(szbits)
 311         if not sz:
 312             break
 313         szbits = sz & 0x7f
 314         sz >>= 7
 315     z = zlib.compressobj(compression_level)
 316     yield szout
 317     yield z.compress(content)
 318     yield z.flush()
 319
 320
 321 def _decode_packobj(buf):
 322     assert(buf)
 323     c = byte_int(buf[0])
 324     type = _typermap[(c & 0x70) >> 4]
 325     sz = c & 0x0f
 326     shift = 4
 327     i = 0
 328     while c & 0x80:
 329         i += 1
 330         c = byte_int(buf[i])
 331         sz |= (c & 0x7f) << shift
 332         shift += 7
 333         if not (c & 0x80):
 334             break
 335     return (type, zlib.decompress(buf[i+1:]))
 336
 337
 338 class PackIdx:
 339     def __init__(self):
 340         assert(0)
 341
 342     def find_offset(self, hash):
 343         """Get the offset of an object inside the index file."""
 344         idx = self._idx_from_hash(hash)
 345         if idx != None:
 346             return self._ofs_from_idx(idx)
 347         return None
 348
 349     def exists(self, hash, want_source=False):
 350         """Return nonempty if the object exists in this index."""
 351         if hash and (self._idx_from_hash(hash) != None):
 352             return want_source and os.path.basename(self.name) or True
 353         return None
 354
 355     def _idx_from_hash(self, hash):
 356         global _total_searches, _total_steps
 357         _total_searches += 1
 358         assert(len(hash) == 20)
 359         b1 = byte_int(hash[0])
 360         start = self.fanout[b1-1] # range -1..254
 361         end = self.fanout[b1] # range 0..255
 362         want = hash
 363         _total_steps += 1  # lookup table is a step
 364         while start < end:
 365             _total_steps += 1
 366             mid = start + (end - start) // 2
 367             v = self._idx_to_hash(mid)
 368             if v < want:
 369                 start = mid+1
 370             elif v > want:
 371                 end = mid
 372             else: # got it!
 373                 return mid
 374         return None
 375
 376
 377 class PackIdxV1(PackIdx):
 378     """Object representation of a Git pack index (version 1) file."""
 379     def __init__(self, filename, f):
 380         self.name = filename
 381         self.idxnames = [self.name]
 382         self.map = mmap_read(f)
 383         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 384         self.fanout = array('L', struct.unpack('!256I', self.map))
 385         self.fanout.append(0)  # entry "-1"
 386         self.nsha = self.fanout[255]
 387         self.sha_ofs = 256 * 4
 388         # Avoid slicing shatable for individual hashes (very high overhead)
 389         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 390
 391     def __enter__(self):
 392         return self
 393
 394     def __exit__(self, type, value, traceback):
 395         self.close()
 396
 397     def __len__(self):
 398         return int(self.nsha)  # int() from long for python 2
 399
 400     def _ofs_from_idx(self, idx):
 401         if idx >= self.nsha or idx < 0:
 402             raise IndexError('invalid pack index index %d' % idx)
 403         ofs = self.sha_ofs + idx * 24
 404         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 405
 406     def _idx_to_hash(self, idx):
 407         if idx >= self.nsha or idx < 0:
 408             raise IndexError('invalid pack index index %d' % idx)
 409         ofs = self.sha_ofs + idx * 24 + 4
 410         return self.map[ofs : ofs + 20]
 411
 412     def __iter__(self):
 413         start = self.sha_ofs + 4
 414         for ofs in range(start, start + 24 * self.nsha, 24):
 415             yield self.map[ofs : ofs + 20]
 416
 417     def close(self):
 418         if self.map is not None:
 419             self.shatable = None
 420             self.map.close()
 421             self.map = None
 422
 423
 424 class PackIdxV2(PackIdx):
 425     """Object representation of a Git pack index (version 2) file."""
 426     def __init__(self, filename, f):
 427         self.name = filename
 428         self.idxnames = [self.name]
 429         self.map = mmap_read(f)
 430         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 431         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 432         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 433         self.fanout.append(0)
 434         self.nsha = self.fanout[255]
 435         self.sha_ofs = 8 + 256*4
 436         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 437         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 438         # Avoid slicing this for individual hashes (very high overhead)
 439         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 440
 441     def __enter__(self):
 442         return self
 443
 444     def __exit__(self, type, value, traceback):
 445         self.close()
 446
 447     def __len__(self):
 448         return int(self.nsha)  # int() from long for python 2
 449
 450     def _ofs_from_idx(self, idx):
 451         if idx >= self.nsha or idx < 0:
 452             raise IndexError('invalid pack index index %d' % idx)
 453         ofs_ofs = self.ofstable_ofs + idx * 4
 454         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 455         if ofs & 0x80000000:
 456             idx64 = ofs & 0x7fffffff
 457             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 458             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 459         return ofs
 460
 461     def _idx_to_hash(self, idx):
 462         if idx >= self.nsha or idx < 0:
 463             raise IndexError('invalid pack index index %d' % idx)
 464         ofs = self.sha_ofs + idx * 20
 465         return self.map[ofs : ofs + 20]
 466
 467     def __iter__(self):
 468         start = self.sha_ofs
 469         for ofs in range(start, start + 20 * self.nsha, 20):
 470             yield self.map[ofs : ofs + 20]
 471
 472     def close(self):
 473         if self.map is not None:
 474             self.shatable = None
 475             self.map.close()
 476             self.map = None
 477
 478
 479 _mpi_count = 0
 480 class PackIdxList:
 481     def __init__(self, dir, ignore_midx=False):
 482         global _mpi_count
 483         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 484         _mpi_count += 1
 485         self.dir = dir
 486         self.also = set()
 487         self.packs = []
 488         self.do_bloom = False
 489         self.bloom = None
 490         self.ignore_midx = ignore_midx
 491         self.refresh()
 492
 493     def __del__(self):
 494         global _mpi_count
 495         _mpi_count -= 1
 496         assert(_mpi_count == 0)
 497
 498     def __iter__(self):
 499         return iter(idxmerge(self.packs))
 500
 501     def __len__(self):
 502         return sum(len(pack) for pack in self.packs)
 503
 504     def exists(self, hash, want_source=False):
 505         """Return nonempty if the object exists in the index files."""
 506         global _total_searches
 507         _total_searches += 1
 508         if hash in self.also:
 509             return True
 510         if self.do_bloom and self.bloom:
 511             if self.bloom.exists(hash):
 512                 self.do_bloom = False
 513             else:
 514                 _total_searches -= 1  # was counted by bloom
 515                 return None
 516         for i in range(len(self.packs)):
 517             p = self.packs[i]
 518             _total_searches -= 1  # will be incremented by sub-pack
 519             ix = p.exists(hash, want_source=want_source)
 520             if ix:
 521                 # reorder so most recently used packs are searched first
 522                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 523                 return ix
 524         self.do_bloom = True
 525         return None
 526
 527     def refresh(self, skip_midx = False):
 528         """Refresh the index list.
 529         This method verifies if .midx files were superseded (e.g. all of its
 530         contents are in another, bigger .midx file) and removes the superseded
 531         files.
 532
 533         If skip_midx is True, all work on .midx files will be skipped and .midx
 534         files will be removed from the list.
 535
 536         The instance variable 'ignore_midx' can force this function to
 537         always act as if skip_midx was True.
 538         """
 539         if self.bloom is not None:
 540             self.bloom.close()
 541         self.bloom = None # Always reopen the bloom as it may have been relaced
 542         self.do_bloom = False
 543         skip_midx = skip_midx or self.ignore_midx
 544         d = dict((p.name, p) for p in self.packs
 545                  if not skip_midx or not isinstance(p, midx.PackMidx))
 546         if os.path.exists(self.dir):
 547             if not skip_midx:
 548                 midxl = []
 549                 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
 550                 # remove any *.midx files from our list that no longer exist
 551                 for ix in list(d.values()):
 552                     if not isinstance(ix, midx.PackMidx):
 553                         continue
 554                     if ix.name in midxes:
 555                         continue
 556                     # remove the midx
 557                     del d[ix.name]
 558                     ix.close()
 559                     self.packs.remove(ix)
 560                 for ix in self.packs:
 561                     if isinstance(ix, midx.PackMidx):
 562                         for name in ix.idxnames:
 563                             d[os.path.join(self.dir, name)] = ix
 564                 for full in midxes:
 565                     if not d.get(full):
 566                         mx = midx.PackMidx(full)
 567                         (mxd, mxf) = os.path.split(mx.name)
 568                         broken = False
 569                         for n in mx.idxnames:
 570                             if not os.path.exists(os.path.join(mxd, n)):
 571                                 log(('warning: index %s missing\n'
 572                                      '  used by %s\n')
 573                                     % (path_msg(n), path_msg(mxf)))
 574                                 broken = True
 575                         if broken:
 576                             mx.close()
 577                             del mx
 578                             unlink(full)
 579                         else:
 580                             midxl.append(mx)
 581                 midxl.sort(key=lambda ix:
 582                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 583                 for ix in midxl:
 584                     any_needed = False
 585                     for sub in ix.idxnames:
 586                         found = d.get(os.path.join(self.dir, sub))
 587                         if not found or isinstance(found, PackIdx):
 588                             # doesn't exist, or exists but not in a midx
 589                             any_needed = True
 590                             break
 591                     if any_needed:
 592                         d[ix.name] = ix
 593                         for name in ix.idxnames:
 594                             d[os.path.join(self.dir, name)] = ix
 595                     elif not ix.force_keep:
 596                         debug1('midx: removing redundant: %s\n'
 597                                % path_msg(os.path.basename(ix.name)))
 598                         ix.close()
 599                         unlink(ix.name)
 600             for full in glob.glob(os.path.join(self.dir, b'*.idx')):
 601                 if not d.get(full):
 602                     try:
 603                         ix = open_idx(full)
 604                     except GitError as e:
 605                         add_error(e)
 606                         continue
 607                     d[full] = ix
 608             bfull = os.path.join(self.dir, b'bup.bloom')
 609             if self.bloom is None and os.path.exists(bfull):
 610                 self.bloom = bloom.ShaBloom(bfull)
 611             self.packs = list(set(d.values()))
 612             self.packs.sort(reverse=True, key=lambda x: len(x))
 613             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 614                 self.do_bloom = True
 615             else:
 616                 self.bloom = None
 617         debug1('PackIdxList: using %d index%s.\n'
 618             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 619
 620     def add(self, hash):
 621         """Insert an additional object in the list."""
 622         self.also.add(hash)
 623
 624
 625 def open_idx(filename):
 626     if filename.endswith(b'.idx'):
 627         f = open(filename, 'rb')
 628         header = f.read(8)
 629         if header[0:4] == b'\377tOc':
 630             version = struct.unpack('!I', header[4:8])[0]
 631             if version == 2:
 632                 return PackIdxV2(filename, f)
 633             else:
 634                 raise GitError('%s: expected idx file version 2, got %d'
 635                                % (path_msg(filename), version))
 636         elif len(header) == 8 and header[0:4] < b'\377tOc':
 637             return PackIdxV1(filename, f)
 638         else:
 639             raise GitError('%s: unrecognized idx file header'
 640                            % path_msg(filename))
 641     elif filename.endswith(b'.midx'):
 642         return midx.PackMidx(filename)
 643     else:
 644         raise GitError('idx filenames must end with .idx or .midx')
 645
 646
 647 def idxmerge(idxlist, final_progress=True):
 648     """Generate a list of all the objects reachable in a PackIdxList."""
 649     def pfunc(count, total):
 650         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 651                   % (count*100.0/total, count, total))
 652     def pfinal(count, total):
 653         if final_progress:
 654             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 655                      % (100, total, total))
 656     return merge_iter(idxlist, 10024, pfunc, pfinal)
 657
 658
 659 def create_commit_blob(tree, parent,
 660                        author, adate_sec, adate_tz,
 661                        committer, cdate_sec, cdate_tz,
 662                        msg):
 663     if adate_tz is not None:
 664         adate_str = _git_date_str(adate_sec, adate_tz)
 665     else:
 666         adate_str = _local_git_date_str(adate_sec)
 667     if cdate_tz is not None:
 668         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 669     else:
 670         cdate_str = _local_git_date_str(cdate_sec)
 671     l = []
 672     if tree: l.append(b'tree %s' % hexlify(tree))
 673     if parent: l.append(b'parent %s' % hexlify(parent))
 674     if author: l.append(b'author %s %s' % (author, adate_str))
 675     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 676     l.append(b'')
 677     l.append(msg)
 678     return b'\n'.join(l)
 679
 680
 681 def _make_objcache():
 682     return PackIdxList(repo(b'objects/pack'))
 683
 684 # bup-gc assumes that it can disable all PackWriter activities
 685 # (bloom/midx/cache) via the constructor and close() arguments.
 686
 687 class PackWriter:
 688     """Writes Git objects inside a pack file."""
 689     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 690                  run_midx=True, on_pack_finish=None,
 691                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 692         self.repo_dir = repo_dir or repo()
 693         self.file = None
 694         self.parentfd = None
 695         self.count = 0
 696         self.outbytes = 0
 697         self.filename = None
 698         self.idx = None
 699         self.objcache_maker = objcache_maker
 700         self.objcache = None
 701         self.compression_level = compression_level
 702         self.run_midx=run_midx
 703         self.on_pack_finish = on_pack_finish
 704         if not max_pack_size:
 705             max_pack_size = git_config_get(b'pack.packSizeLimit',
 706                                            repo_dir=self.repo_dir)
 707             if max_pack_size is not None:
 708                 max_pack_size = parse_num(max_pack_size)
 709             if not max_pack_size:
 710                 # larger packs slow down pruning
 711                 max_pack_size = 1000 * 1000 * 1000
 712         self.max_pack_size = max_pack_size
 713         # cache memory usage is about 83 bytes per object
 714         self.max_pack_objects = max_pack_objects if max_pack_objects \
 715                                 else max(1, self.max_pack_size // 5000)
 716
 717     def __del__(self):
 718         self.close()
 719
 720     def __enter__(self):
 721         return self
 722
 723     def __exit__(self, type, value, traceback):
 724         self.close()
 725
 726     def _open(self):
 727         if not self.file:
 728             objdir = dir = os.path.join(self.repo_dir, b'objects')
 729             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 730             try:
 731                 self.file = os.fdopen(fd, 'w+b')
 732             except:
 733                 os.close(fd)
 734                 raise
 735             try:
 736                 self.parentfd = os.open(objdir, os.O_RDONLY)
 737             except:
 738                 f = self.file
 739                 self.file = None
 740                 f.close()
 741                 raise
 742             assert name.endswith(b'.pack')
 743             self.filename = name[:-5]
 744             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 745             self.idx = PackIdxV2Writer()
 746
 747     def _raw_write(self, datalist, sha):
 748         self._open()
 749         f = self.file
 750         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 751         # the file never has a *partial* blob.  So let's make sure it's
 752         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 753         # to our hashsplit algorithm.)  f.write() does its own buffering,
 754         # but that's okay because we'll flush it in _end().
 755         oneblob = b''.join(datalist)
 756         try:
 757             f.write(oneblob)
 758         except IOError as e:
 759             reraise(GitError(e))
 760         nw = len(oneblob)
 761         crc = zlib.crc32(oneblob) & 0xffffffff
 762         self._update_idx(sha, crc, nw)
 763         self.outbytes += nw
 764         self.count += 1
 765         return nw, crc
 766
 767     def _update_idx(self, sha, crc, size):
 768         assert(sha)
 769         if self.idx:
 770             self.idx.add(sha, crc, self.file.tell() - size)
 771
 772     def _write(self, sha, type, content):
 773         if verbose:
 774             log('>')
 775         if not sha:
 776             sha = calc_hash(type, content)
 777         size, crc = self._raw_write(_encode_packobj(type, content,
 778                                                     self.compression_level),
 779                                     sha=sha)
 780         if self.outbytes >= self.max_pack_size \
 781            or self.count >= self.max_pack_objects:
 782             self.breakpoint()
 783         return sha
 784
 785     def breakpoint(self):
 786         """Clear byte and object counts and return the last processed id."""
 787         id = self._end(self.run_midx)
 788         self.outbytes = self.count = 0
 789         return id
 790
 791     def _require_objcache(self):
 792         if self.objcache is None and self.objcache_maker:
 793             self.objcache = self.objcache_maker()
 794         if self.objcache is None:
 795             raise GitError(
 796                     "PackWriter not opened or can't check exists w/o objcache")
 797
 798     def exists(self, id, want_source=False):
 799         """Return non-empty if an object is found in the object cache."""
 800         self._require_objcache()
 801         return self.objcache.exists(id, want_source=want_source)
 802
 803     def just_write(self, sha, type, content):
 804         """Write an object to the pack file without checking for duplication."""
 805         self._write(sha, type, content)
 806         # If nothing else, gc doesn't have/want an objcache
 807         if self.objcache is not None:
 808             self.objcache.add(sha)
 809
 810     def maybe_write(self, type, content):
 811         """Write an object to the pack file if not present and return its id."""
 812         sha = calc_hash(type, content)
 813         if not self.exists(sha):
 814             self._require_objcache()
 815             self.just_write(sha, type, content)
 816         return sha
 817
 818     def new_blob(self, blob):
 819         """Create a blob object in the pack with the supplied content."""
 820         return self.maybe_write(b'blob', blob)
 821
 822     def new_tree(self, shalist):
 823         """Create a tree object in the pack."""
 824         content = tree_encode(shalist)
 825         return self.maybe_write(b'tree', content)
 826
 827     def new_commit(self, tree, parent,
 828                    author, adate_sec, adate_tz,
 829                    committer, cdate_sec, cdate_tz,
 830                    msg):
 831         """Create a commit object in the pack.  The date_sec values must be
 832         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 833         content = create_commit_blob(tree, parent,
 834                                      author, adate_sec, adate_tz,
 835                                      committer, cdate_sec, cdate_tz,
 836                                      msg)
 837         return self.maybe_write(b'commit', content)
 838
 839     def abort(self):
 840         """Remove the pack file from disk."""
 841         f = self.file
 842         if f:
 843             pfd = self.parentfd
 844             self.file = None
 845             self.parentfd = None
 846             self.idx = None
 847             try:
 848                 try:
 849                     os.unlink(self.filename + b'.pack')
 850                 finally:
 851                     f.close()
 852             finally:
 853                 if pfd is not None:
 854                     os.close(pfd)
 855
    def _end(self, run_midx=True):
        """Finish the pack in progress and install it in the repository.

        Patches the object count into the pack file, appends a Sha1
        digest computed by reading the file back from the start,
        writes the matching .idx, and renames both files to
        objects/pack/pack-HEXDIGEST under self.repo_dir.  The parent
        directory descriptor is then synced and closed, auto_midx() is
        run when run_midx is true, and on_pack_finish(nameprefix) is
        called if set.

        Returns the new path prefix (without the .pack/.idx suffix),
        or None when no pack file is open.
        """
        f = self.file
        if not f: return None
        self.file = None
        try:
            # The cache and index writer belong to the pack being
            # finished; detach them from self before touching the file.
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count ('!i' is a 4-byte network-order
            # signed int, written at offset 8)
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum; the digest is written at the
            # position the read loop leaves the file at (presumably
            # the end of the file -- chunkyreader isn't visible here)
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        # NOTE(review): idx is whatever self.idx held; this would fail
        # if it were None -- presumably a writer always exists here.
        idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' +  hexlify(packbin))
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        # Sync the directory descriptor after the renames.
        # NOTE(review): self.parentfd is closed but not reset to None.
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix
 901
 902     def close(self, run_midx=True):
 903         """Close the pack file and move it to its definitive path."""
 904         return self._end(run_midx=run_midx)
 905
 906
 907 class PackIdxV2Writer:
 908     def __init__(self):
 909         self.idx = list(list() for i in range(256))
 910         self.count = 0
 911
 912     def add(self, sha, crc, offs):
 913         assert(sha)
 914         self.count += 1
 915         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 916
 917     def write(self, filename, packbin):
 918         ofs64_count = 0
 919         for section in self.idx:
 920             for entry in section:
 921                 if entry[2] >= 2**31:
 922                     ofs64_count += 1
 923
 924         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 925         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 926         idx_map = None
 927         idx_f = open(filename, 'w+b')
 928         try:
 929             idx_f.truncate(index_len)
 930             fdatasync(idx_f.fileno())
 931             idx_map = mmap_readwrite(idx_f, close=False)
 932             try:
 933                 count = _helpers.write_idx(filename, idx_map, self.idx,
 934                                            self.count)
 935                 assert(count == self.count)
 936                 idx_map.flush()
 937             finally:
 938                 idx_map.close()
 939         finally:
 940             idx_f.close()
 941
 942         idx_f = open(filename, 'a+b')
 943         try:
 944             idx_f.write(packbin)
 945             idx_f.seek(0)
 946             idx_sum = Sha1()
 947             b = idx_f.read(8 + 4*256)
 948             idx_sum.update(b)
 949
 950             for b in chunkyreader(idx_f, 20 * self.count):
 951                 idx_sum.update(b)
 952
 953             for b in chunkyreader(idx_f):
 954                 idx_sum.update(b)
 955             idx_f.write(idx_sum.digest())
 956             fdatasync(idx_f.fileno())
 957         finally:
 958             idx_f.close()
 959
 960
 961 def list_refs(patterns=None, repo_dir=None,
 962               limit_to_heads=False, limit_to_tags=False):
 963     """Yield (refname, hash) tuples for all repository refs unless
 964     patterns are specified.  In that case, only include tuples for
 965     refs matching those patterns (cf. git-show-ref(1)).  The limits
 966     restrict the result items to refs/heads or refs/tags.  If both
 967     limits are specified, items from both sources will be included.
 968
 969     """
 970     argv = [b'git', b'show-ref']
 971     if limit_to_heads:
 972         argv.append(b'--heads')
 973     if limit_to_tags:
 974         argv.append(b'--tags')
 975     argv.append(b'--')
 976     if patterns:
 977         argv.extend(patterns)
 978     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
 979                          close_fds=True)
 980     out = p.stdout.read().strip()
 981     rv = p.wait()  # not fatal
 982     if rv:
 983         assert(not out)
 984     if out:
 985         for d in out.split(b'\n'):
 986             sha, name = d.split(b' ', 1)
 987             yield name, unhexlify(sha)
 988
 989
 990 def read_ref(refname, repo_dir = None):
 991     """Get the commit id of the most recent commit made on a given ref."""
 992     refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
 993     l = tuple(islice(refs, 2))
 994     if l:
 995         assert(len(l) == 1)
 996         return l[0][1]
 997     else:
 998         return None
 999
1000
def rev_list_invocation(ref_or_refs, format=None):
    """Return the argv for "git rev-list" over a single ref (bytes) or
    an iterable of refs, with --pretty=format:FORMAT added when a
    format is given.  No ref may begin with a dash."""
    refs = (ref_or_refs,) if isinstance(ref_or_refs, bytes) else ref_or_refs
    argv = [b'git', b'rev-list']
    if format:
        argv += [b'--pretty=format:' + format]
    for ref in refs:
        # Don't let a ref be mistaken for an option
        assert not ref.startswith(b'-')
        argv.append(ref)
    argv.append(b'--')
    return argv
1015
1016
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".

    Without a format, yield one stripped output line (a hex hash) at a
    time.  With a format, which requires parse as well, yield
    (oidx, parse(git_stdout)) for each commit, calling parse with the
    stream positioned just after the rev-list "commit HASH" header
    line.  Raises GitError once the output is exhausted if git exited
    with a non-zero status.

    """
    assert bool(parse) == bool(format)
    argv = rev_list_invocation(ref_or_refs, format=format)
    p = subprocess.Popen(argv,
                         env=_gitenv(repo_dir),
                         stdout=subprocess.PIPE,
                         close_fds=True)
    git_out = p.stdout
    if format:
        while True:
            header = git_out.readline()
            if not header:
                break
            header = header.strip()
            if not header.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(header))
            oidx = header[7:]
            assert len(oidx) == 40
            yield oidx, parse(git_out)
    else:
        for line in git_out:
            yield line.strip()

    # Only reached when the consumer has drained the generator
    rv = p.wait()
    if rv:
        raise GitError('git rev-list returned error %d' % rv)
1049
1050
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    A matching ref (see read_ref()) wins; otherwise a 40-character
    committish is treated as a hex object id and looked up in the
    pack indexes.

    Returns the binary (unhexlified) hash if it is found, None if
    'committish' does not correspond to anything, including when a
    40-character committish is not valid hex.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Non-hex input: Python 2 raises TypeError, Python 3
            # raises binascii.Error, which is a ValueError.
            return None

        if pL.exists(oid):
            return oid

    return None
1076
1077
def update_ref(refname, newval, oldval, repo_dir=None):
    """Run "git update-ref" to set refname, which must be under
    refs/heads/ or refs/tags/, to newval.  The hexlified oldval (empty
    when oldval is false) is passed as the final argument."""
    previous = oldval if oldval else b''
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    argv = [b'git', b'update-ref', refname,
            hexlify(newval), hexlify(previous)]
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', p)
1089
1090
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1)); when
    oldvalue is given it is passed along as an extra argument."""
    assert refname.startswith(b'refs/')
    argv = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    p = subprocess.Popen(argv, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', p)
1099
1100
def guess_repo(path=None):
    """Settle the value of the global variable "repodir".

    A path argument, when given, always wins.  Otherwise an already
    set repodir is kept, and failing that BUP_DIR from the environment
    or finally ~/.bup is used.  This makes bup look for an existing
    bup repository, but not fail if a repository doesn't exist.
    Usually, if you are interacting with a bup repository, you would
    not be calling this function but using check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1115
1116
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The location is settled with guess_repo(path).  Raises GitError
    if the location's parent directory doesn't exist, or if the
    location exists but is not a directory.  Each git command that is
    run is handed to _git_wait().
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.  (Every
    # argv element is bytes, like the other git invocations here.)
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    # Enable the reflog
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
1140
1141
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not.

    Returns quietly when objects/pack under the repository is a
    directory.  Otherwise logs an error and exits: with status 15 when
    the repository path itself doesn't exist, with 14 in any other
    case.
    """
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + b'/objects/pack')
    if pst:
        if stat.S_ISDIR(pst.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1157
1158
def is_suitable_git(ver_str):
    """Classify the output of "git --version".

    Returns 'suitable', 'insufficient' (anything before 1.5.6,
    including the 1.5.6 release candidates), or 'unrecognized' when
    ver_str doesn't start with "git version " followed by something
    that looks like a version number.
    """
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    ver_str = ver_str[len(prefix):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    # A classifier shouldn't terminate the process; report the oddity
    # and let the caller decide what to do about it.
    return 'unrecognized'
1178
# None until a git version check has passed; True afterwards
_git_great = None

def require_suitable_git(ver_str=None):
    """Check, at most once, that the version of git is suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.  The check is skipped when BUP_GIT_VERSION_IS_FINE is set to
    yes, true, or 1.  Raises GitError if the version string is not
    recognized; logs an error and exits with status 1 if the version
    is too old.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1207
1208
class _AbortableIter:
    """Iterator wrapper that calls onabort, at most once, when the
    iteration ends any other way than by normal exhaustion: an
    exception from the wrapped iterator, an explicit abort(), or
    deletion of the wrapper before it has finished."""

    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            item = next(self.it)
        except StopIteration:
            # Normal exhaustion: nothing left to abort
            self.done = True
            raise
        except:
            # Deliberately catches everything; it is always re-raised
            self.abort()
            raise
        return item

    next = __next__

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if self.done:
            return
        self.done = True
        callback = self.onabort
        if callback:
            callback()

    def __del__(self):
        self.abort()
1239
1240
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    A "git cat-file --batch" subprocess is started on demand by get()
    and reused for later requests; only one get() may be in progress
    at a time.
    """
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Shut down the cat-file subprocess, if there is one.

        Forgets the process and any in-progress request and closes
        both of its pipes.  When wait is true and a process was
        running, wait for it and return its exit status.  Returns None
        otherwise, including when wait is true but no process had been
        started.
        """
        p = self.p
        self.p = None
        self.inprogress = None
        if not p:
            # Never started (or already closed): nothing to wait for
            return None
        try:
            p.stdout.close()
        finally:
            # Close stdin even if closing stdout failed
            p.stdin.close()
        if wait:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Close any current subprocess and start a new
        "git cat-file --batch" for self.repo_dir."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        ref must not contain a newline or carriage return, nor start
        with a dash.  Raises GitError if the reply header doesn't
        consist of a 40-character id, a type, and a size.  If reading
        the data fails, the subprocess is closed.

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        # Closing the pipe on abort discards any unread object data
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self.close)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object data is followed by a single newline
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _join(self, it):
        """Yield the blob data reachable from the object whose get()
        iterator is it: the blob itself, every entry of a tree, or
        the tree named on a commit's first line."""
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1337
1338
# CatPipe instances, keyed by absolute repository path
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating and caching it on
    first use.  Without a repo_dir the global repodir, or failing that
    repo(), is used."""
    if not repo_dir:
        repo_dir = repodir or repo()
    key = os.path.abspath(repo_dir)
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1352
1353
def close_catpipes():
    """Remove every cached CatPipe from _cp, closing each one and
    waiting for its subprocess."""
    # FIXME: chain exceptions
    while _cp:
        pipe = _cp.popitem()[1]
        pipe.close(wait=True)
1359
1360
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    The names have their refs/tags/ prefix removed, and the keys are
    the hashes as yielded by list_refs().
    """
    by_hash = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        # 10 == len(b'refs/tags/'); more than one tag can point at oid
        by_hash.setdefault(oid, []).append(refname[10:])
    return by_hash
1371
1372
class MissingObject(KeyError):
    """KeyError for an object id that could not be found; the id
    passed in is kept in the oid attribute, and the message shows it
    hexlified."""
    def __init__(self, oid):
        self.oid = oid
        message = 'object %r is missing' % hexlify(oid)
        KeyError.__init__(self, message)
1377
1378
WalkItem = namedtuple('WalkItem',
                      ('oid', 'type', 'mode', 'path', 'chunk_path', 'data'))
# The path is the mangled path.  When an item represents a fragment of
# a chunked file, chunk_path is the chunked subtree path for that
# chunk, i.e. ['', '2d3115e', ...]; the top-level path for a chunked
# file has a chunk_path of [''].  So some chunk subtree of the file
# '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1391
1392
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    oidx is a hex object id.  Commits lead to their tree and their
    parents, trees to their entries.  Entries whose mode is that of a
    regular file are reported as blobs without being fetched when
    include_data is false, so no missing-object check happens for
    them in that case.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (oidx, path, chunk_path, mode).
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        # The first item from get_ref is the (oidx, type, size) header;
        # the rest of the iterator is the object's content.
        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commit and tree content is always needed, to find children
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            # The tree is pushed last, so it is popped (visited) before
            # the parents.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the path stays
                    # put and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))