lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12
  13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  14 from bup.compat import (buffer,
  15                         byte_int, bytes_from_byte, bytes_from_uint,
  16                         environ,
  17                         ExitStack,
  18                         items,
  19                         pending_raise,
  20                         range,
  21                         reraise)
  22 from bup.io import path_msg
  23 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  24                          exo,
  25                          fdatasync,
  26                          finalized,
  27                          log,
  28                          merge_dict,
  29                          merge_iter,
  30                          mmap_read, mmap_readwrite,
  31                          nullcontext_if_not,
  32                          progress, qprogress, stat_if_exists,
  33                          unlink,
  34                          utc_offset_str)
  35
  36
verbose = 0
repodir = None  # The default repository, once initialized

# Maps an object type name to the numeric code that _encode_packobj()
# stores in an entry's header; _typermap is the inverse mapping, used
# by _decode_packobj().
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}


# Lookup statistics, updated by PackIdx._idx_from_hash() and
# PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  46
  47
  48 class GitError(Exception):
  49     pass
  50
  51
  52 def _gitenv(repo_dir=None):
  53     if not repo_dir:
  54         repo_dir = repo()
  55     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  56
  57 def _git_wait(cmd, p):
  58     rv = p.wait()
  59     if rv != 0:
  60         raise GitError('%r returned %d' % (cmd, rv))
  61
  62 def _git_exo(cmd, **kwargs):
  63     kwargs['check'] = False
  64     result = exo(cmd, **kwargs)
  65     _, _, proc = result
  66     if proc.returncode != 0:
  67         raise GitError('%r returned %d' % (cmd, proc.returncode))
  68     return result
  69
  70 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  71     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  72     cmd = [b'git', b'config', b'--null']
  73     if cfg_file:
  74         cmd.extend([b'--file', cfg_file])
  75     if opttype == 'int':
  76         cmd.extend([b'--int'])
  77     elif opttype == 'bool':
  78         cmd.extend([b'--bool'])
  79     else:
  80         assert opttype is None
  81     cmd.extend([b'--get', option])
  82     env=None
  83     if repo_dir:
  84         env = _gitenv(repo_dir=repo_dir)
  85     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  86                          close_fds=True)
  87     # with --null, git writes out a trailing \0 after the value
  88     r = p.stdout.read()[:-1]
  89     rc = p.wait()
  90     if rc == 0:
  91         if opttype == 'int':
  92             return int(r)
  93         elif opttype == 'bool':
  94             # git converts to 'true' or 'false'
  95             return r == b'true'
  96         return r
  97     if rc != 1:
  98         raise GitError('%r returned %d' % (cmd, rc))
  99     return None
 100
 101
 102 def parse_tz_offset(s):
 103     """UTC offset in seconds."""
 104     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 105     if bytes_from_byte(s[0]) == b'-':
 106         return - tz_off
 107     return tz_off
 108
 109 def parse_commit_gpgsig(sig):
 110     """Return the original signature bytes.
 111
 112     i.e. with the "gpgsig " header and the leading space character on
 113     each continuation line removed.
 114
 115     """
 116     if not sig:
 117         return None
 118     assert sig.startswith(b'gpgsig ')
 119     sig = sig[7:]
 120     return sig.replace(b'\n ', b'\n')
 121
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# See also
# https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
# The continuation lines have only one leading space.

# A "safe" name/mail string is either one or two _start_end_char
# bytes, or a _start_end_char, any number of _content_char bytes, and
# a final _start_end_char.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Sign, two digits, then two more digits with the first limited to 0-5.
_tz_rx = br'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" line, including its newline.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Matches a whole commit object.  The named groups (tree, parents,
# author_*, asec/atz, committer_*, csec/ctz, mergetag, gpgsig,
# message) are what parse_commit() reads.
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Pulls the hash out of each parent line captured by the 'parents' group.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 148
 149 # Note that the author_sec and committer_sec values are (UTC) epoch
 150 # seconds, and for now the mergetag is not included.
 151 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 152                                        'author_name', 'author_mail',
 153                                        'author_sec', 'author_offset',
 154                                        'committer_name', 'committer_mail',
 155                                        'committer_sec', 'committer_offset',
 156                                        'gpgsig',
 157                                        'message'])
 158
 159 def parse_commit(content):
 160     commit_match = re.match(_commit_rx, content)
 161     if not commit_match:
 162         raise Exception('cannot parse commit %r' % content)
 163     matches = commit_match.groupdict()
 164     return CommitInfo(tree=matches['tree'],
 165                       parents=re.findall(_parent_hash_rx, matches['parents']),
 166                       author_name=matches['author_name'],
 167                       author_mail=matches['author_mail'],
 168                       author_sec=int(matches['asec']),
 169                       author_offset=parse_tz_offset(matches['atz']),
 170                       committer_name=matches['committer_name'],
 171                       committer_mail=matches['committer_mail'],
 172                       committer_sec=int(matches['csec']),
 173                       committer_offset=parse_tz_offset(matches['ctz']),
 174                       gpgsig=parse_commit_gpgsig(matches['gpgsig']),
 175                       message=matches['message'])
 176
 177
 178 def get_cat_data(cat_iterator, expected_type):
 179     _, kind, _ = next(cat_iterator)
 180     if kind != expected_type:
 181         raise Exception('expected %r, saw %r' % (expected_type, kind))
 182     return b''.join(cat_iterator)
 183
 184 def get_commit_items(id, cp):
 185     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 186
 187 def _local_git_date_str(epoch_sec):
 188     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 189
 190
 191 def _git_date_str(epoch_sec, tz_offset_sec):
 192     offs =  tz_offset_sec // 60
 193     return b'%d %s%02d%02d' \
 194         % (epoch_sec,
 195            b'+' if offs >= 0 else b'-',
 196            abs(offs) // 60,
 197            abs(offs) % 60)
 198
 199
 200 def repo(sub = b'', repo_dir=None):
 201     """Get the path to the git repository or one of its subdirectories."""
 202     repo_dir = repo_dir or repodir
 203     if not repo_dir:
 204         raise GitError('You should call check_repo_or_die()')
 205
 206     # If there's a .git subdirectory, then the actual repo is in there.
 207     gd = os.path.join(repo_dir, b'.git')
 208     if os.path.exists(gd):
 209         repo_dir = gd
 210
 211     return os.path.join(repo_dir, sub)
 212
 213
 214 _shorten_hash_rx = \
 215     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 216
 217 def shorten_hash(s):
 218     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 219
 220
 221 def repo_rel(path):
 222     full = os.path.abspath(path)
 223     fullrepo = os.path.abspath(repo(b''))
 224     if not fullrepo.endswith(b'/'):
 225         fullrepo += b'/'
 226     if full.startswith(fullrepo):
 227         path = full[len(fullrepo):]
 228     if path.startswith(b'index-cache/'):
 229         path = path[len(b'index-cache/'):]
 230     return shorten_hash(path)
 231
 232
 233 def auto_midx(objdir):
 234     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 235     try:
 236         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 237     except OSError as e:
 238         # make sure 'args' gets printed to help with debugging
 239         add_error('%r: exception: %s' % (args, e))
 240         raise
 241     if rv:
 242         add_error('%r: returned %d' % (args, rv))
 243
 244     args = [path.exe(), b'bloom', b'--dir', objdir]
 245     try:
 246         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 247     except OSError as e:
 248         # make sure 'args' gets printed to help with debugging
 249         add_error('%r: exception: %s' % (args, e))
 250         raise
 251     if rv:
 252         add_error('%r: returned %d' % (args, rv))
 253
 254
 255 def mangle_name(name, mode, gitmode):
 256     """Mangle a file name to present an abstract name for segmented files.
 257     Mangled file names will have the ".bup" extension added to them. If a
 258     file's name already ends with ".bup", a ".bupl" extension is added to
 259     disambiguate normal files from segmented ones.
 260     """
 261     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 262         assert(stat.S_ISDIR(gitmode))
 263         return name + b'.bup'
 264     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 265         return name + b'.bupl'
 266     else:
 267         return name
 268
 269
 270 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 271 def demangle_name(name, mode):
 272     """Remove name mangling from a file name, if necessary.
 273
 274     The return value is a tuple (demangled_filename,mode), where mode is one of
 275     the following:
 276
 277     * BUP_NORMAL  : files that should be read as-is from the repository
 278     * BUP_CHUNKED : files that were chunked and need to be reassembled
 279
 280     For more information on the name mangling algorithm, see mangle_name()
 281     """
 282     if name.endswith(b'.bupl'):
 283         return (name[:-5], BUP_NORMAL)
 284     elif name.endswith(b'.bup'):
 285         return (name[:-4], BUP_CHUNKED)
 286     elif name.endswith(b'.bupm'):
 287         return (name[:-5],
 288                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 289     return (name, BUP_NORMAL)
 290
 291
 292 def calc_hash(type, content):
 293     """Calculate some content's hash in the Git fashion."""
 294     header = b'%s %d\0' % (type, len(content))
 295     sum = Sha1(header)
 296     sum.update(content)
 297     return sum.digest()
 298
 299
 300 def shalist_item_sort_key(ent):
 301     (mode, name, id) = ent
 302     assert(mode+0 == mode)
 303     if stat.S_ISDIR(mode):
 304         return name + b'/'
 305     else:
 306         return name
 307
 308
 309 def tree_encode(shalist):
 310     """Generate a git tree object from (mode,name,hash) tuples."""
 311     shalist = sorted(shalist, key = shalist_item_sort_key)
 312     l = []
 313     for (mode,name,bin) in shalist:
 314         assert(mode)
 315         assert(mode+0 == mode)
 316         assert(name)
 317         assert(len(bin) == 20)
 318         s = b'%o %s\0%s' % (mode,name,bin)
 319         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 320         l.append(s)
 321     return b''.join(l)
 322
 323
 324 def tree_decode(buf):
 325     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 326     ofs = 0
 327     while ofs < len(buf):
 328         z = buf.find(b'\0', ofs)
 329         assert(z > ofs)
 330         spl = buf[ofs:z].split(b' ', 1)
 331         assert(len(spl) == 2)
 332         mode,name = spl
 333         sha = buf[z+1:z+1+20]
 334         ofs = z+1+20
 335         yield (int(mode, 8), name, sha)
 336
 337
 338 def _encode_packobj(type, content, compression_level=1):
 339     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 340         raise ValueError('invalid compression level %s' % compression_level)
 341     szout = b''
 342     sz = len(content)
 343     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 344     sz >>= 4
 345     while 1:
 346         if sz: szbits |= 0x80
 347         szout += bytes_from_uint(szbits)
 348         if not sz:
 349             break
 350         szbits = sz & 0x7f
 351         sz >>= 7
 352     z = zlib.compressobj(compression_level)
 353     yield szout
 354     yield z.compress(content)
 355     yield z.flush()
 356
 357
 358 def _decode_packobj(buf):
 359     assert(buf)
 360     c = byte_int(buf[0])
 361     type = _typermap[(c & 0x70) >> 4]
 362     sz = c & 0x0f
 363     shift = 4
 364     i = 0
 365     while c & 0x80:
 366         i += 1
 367         c = byte_int(buf[i])
 368         sz |= (c & 0x7f) << shift
 369         shift += 7
 370         if not (c & 0x80):
 371             break
 372     return (type, zlib.decompress(buf[i+1:]))
 373
 374
 375 class PackIdx(object):
 376     def find_offset(self, hash):
 377         """Get the offset of an object inside the index file."""
 378         idx = self._idx_from_hash(hash)
 379         if idx != None:
 380             return self._ofs_from_idx(idx)
 381         return None
 382
 383     def exists(self, hash, want_source=False):
 384         """Return nonempty if the object exists in this index."""
 385         if hash and (self._idx_from_hash(hash) != None):
 386             return want_source and os.path.basename(self.name) or True
 387         return None
 388
 389     def _idx_from_hash(self, hash):
 390         global _total_searches, _total_steps
 391         _total_searches += 1
 392         assert(len(hash) == 20)
 393         b1 = byte_int(hash[0])
 394         start = self.fanout[b1-1] # range -1..254
 395         end = self.fanout[b1] # range 0..255
 396         want = hash
 397         _total_steps += 1  # lookup table is a step
 398         while start < end:
 399             _total_steps += 1
 400             mid = start + (end - start) // 2
 401             v = self._idx_to_hash(mid)
 402             if v < want:
 403                 start = mid+1
 404             elif v > want:
 405                 end = mid
 406             else: # got it!
 407                 return mid
 408         return None
 409
 410
 411 class PackIdxV1(PackIdx):
 412     """Object representation of a Git pack index (version 1) file."""
 413     def __init__(self, filename, f):
 414         super(PackIdxV1, self).__init__()
 415         self.closed = False
 416         self.name = filename
 417         self.idxnames = [self.name]
 418         self.map = mmap_read(f)
 419         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 420         self.fanout = array('L', struct.unpack('!256I', self.map))
 421         self.fanout.append(0)  # entry "-1"
 422         self.nsha = self.fanout[255]
 423         self.sha_ofs = 256 * 4
 424         # Avoid slicing shatable for individual hashes (very high overhead)
 425         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 426
 427     def __enter__(self):
 428         return self
 429
 430     def __exit__(self, type, value, traceback):
 431         with pending_raise(value, rethrow=False):
 432             self.close()
 433
 434     def __len__(self):
 435         return int(self.nsha)  # int() from long for python 2
 436
 437     def _ofs_from_idx(self, idx):
 438         if idx >= self.nsha or idx < 0:
 439             raise IndexError('invalid pack index index %d' % idx)
 440         ofs = self.sha_ofs + idx * 24
 441         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 442
 443     def _idx_to_hash(self, idx):
 444         if idx >= self.nsha or idx < 0:
 445             raise IndexError('invalid pack index index %d' % idx)
 446         ofs = self.sha_ofs + idx * 24 + 4
 447         return self.map[ofs : ofs + 20]
 448
 449     def __iter__(self):
 450         start = self.sha_ofs + 4
 451         for ofs in range(start, start + 24 * self.nsha, 24):
 452             yield self.map[ofs : ofs + 20]
 453
 454     def close(self):
 455         self.closed = True
 456         if self.map is not None:
 457             self.shatable = None
 458             self.map.close()
 459             self.map = None
 460
 461     def __del__(self):
 462         assert self.closed
 463
 464
 465 class PackIdxV2(PackIdx):
 466     """Object representation of a Git pack index (version 2) file."""
 467     def __init__(self, filename, f):
 468         super(PackIdxV2, self).__init__()
 469         self.closed = False
 470         self.name = filename
 471         self.idxnames = [self.name]
 472         self.map = mmap_read(f)
 473         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 474         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 475         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 476         self.fanout.append(0)
 477         self.nsha = self.fanout[255]
 478         self.sha_ofs = 8 + 256*4
 479         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 480         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 481         # Avoid slicing this for individual hashes (very high overhead)
 482         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 483
 484     def __enter__(self):
 485         return self
 486
 487     def __exit__(self, type, value, traceback):
 488         with pending_raise(value, rethrow=False):
 489             self.close()
 490
 491     def __len__(self):
 492         return int(self.nsha)  # int() from long for python 2
 493
 494     def _ofs_from_idx(self, idx):
 495         if idx >= self.nsha or idx < 0:
 496             raise IndexError('invalid pack index index %d' % idx)
 497         ofs_ofs = self.ofstable_ofs + idx * 4
 498         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 499         if ofs & 0x80000000:
 500             idx64 = ofs & 0x7fffffff
 501             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 502             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 503         return ofs
 504
 505     def _idx_to_hash(self, idx):
 506         if idx >= self.nsha or idx < 0:
 507             raise IndexError('invalid pack index index %d' % idx)
 508         ofs = self.sha_ofs + idx * 20
 509         return self.map[ofs : ofs + 20]
 510
 511     def __iter__(self):
 512         start = self.sha_ofs
 513         for ofs in range(start, start + 20 * self.nsha, 20):
 514             yield self.map[ofs : ofs + 20]
 515
 516     def close(self):
 517         self.closed = True
 518         if self.map is not None:
 519             self.shatable = None
 520             self.map.close()
 521             self.map = None
 522
 523     def __del__(self):
 524         assert self.closed
 525
 526
# Number of PackIdxList instances currently open; the constructor and
# close() assert that it never exceeds one.
_mpi_count = 0
class PackIdxList:
    """The set of pack indexes found in one directory.

    refresh() collects the *.idx files, the *.midx files (unless midx
    handling is skipped) and an optional bup.bloom filter from the
    directory; exists() then searches them for an object hash.
    """
    def __init__(self, dir, ignore_midx=False):
        """Open the indexes in dir.

        ignore_midx makes every refresh() behave as if skip_midx were
        true.  If the initial refresh() fails, the list is closed again
        before the exception propagates.
        """
        global _mpi_count
        # Q: was this also intended to prevent opening multiple repos?
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.open = True
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        try:
            self.refresh()
        except BaseException as ex:
            with pending_raise(ex):
                self.close()

    def close(self):
        """Close every index and the bloom filter; a no-op when the
        list is already closed."""
        global _mpi_count
        if not self.open:
            assert _mpi_count == 0
            return
        _mpi_count -= 1
        assert _mpi_count == 0
        self.also = None
        # Detach the resources from self before closing them.
        self.bloom, bloom = None, self.bloom
        self.packs, packs = None, self.packs
        self.open = False
        with ExitStack() as stack:
            # Each pack is entered as a context manager, so it is
            # exited when the stack unwinds, even if bloom.close()
            # raises.
            for pack in packs:
                stack.enter_context(pack)
            if bloom:
                bloom.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
            self.close()

    def __del__(self):
        assert not self.open

    def __iter__(self):
        """Iterate over the merged contents of all packs (see idxmerge())."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all packs."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        # Hashes registered through add() count as existing.
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Possible hit: skip the bloom check until a full
                # search comes up empty again (see the end of this
                # method).
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        # d maps a path to the index object that covers it; an .idx
        # that belongs to a midx is mapped to that midx further down.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                # Record which .idx files the remaining open midxes cover.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open the midx files that aren't loaded yet, deleting
                # any that refer to an .idx that no longer exists.
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the largest midx first; ties go to the newest.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            # Open each .idx that nothing in d covers yet; an index that
            # can't be opened is reported and skipped.
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            new_packs = set(d.values())
            # Close previously open packs that are no longer referenced.
            for p in self.packs:
                if not p in new_packs:
                    p.close()
            new_packs = list(new_packs)
            new_packs.sort(reverse=True, key=lambda x: len(x))
            self.packs = new_packs
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            try:
                # Only use the bloom filter if it is valid and holds at
                # least as many entries as the packs do; otherwise drop it.
                if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                    self.do_bloom = True
                else:
                    if self.bloom:
                        self.bloom, bloom_tmp = None, self.bloom
                        bloom_tmp.close()
            except BaseException as ex:
                with pending_raise(ex):
                    if self.bloom:
                        self.bloom.close()

        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 711
 712
 713 def open_idx(filename):
 714     if filename.endswith(b'.idx'):
 715         f = open(filename, 'rb')
 716         header = f.read(8)
 717         if header[0:4] == b'\377tOc':
 718             version = struct.unpack('!I', header[4:8])[0]
 719             if version == 2:
 720                 return PackIdxV2(filename, f)
 721             else:
 722                 raise GitError('%s: expected idx file version 2, got %d'
 723                                % (path_msg(filename), version))
 724         elif len(header) == 8 and header[0:4] < b'\377tOc':
 725             return PackIdxV1(filename, f)
 726         else:
 727             raise GitError('%s: unrecognized idx file header'
 728                            % path_msg(filename))
 729     elif filename.endswith(b'.midx'):
 730         return midx.PackMidx(filename)
 731     else:
 732         raise GitError('idx filenames must end with .idx or .midx')
 733
 734
 735 def idxmerge(idxlist, final_progress=True):
 736     """Generate a list of all the objects reachable in a PackIdxList."""
 737     def pfunc(count, total):
 738         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 739                   % (count*100.0/total, count, total))
 740     def pfinal(count, total):
 741         if final_progress:
 742             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 743                      % (100, total, total))
 744     return merge_iter(idxlist, 10024, pfunc, pfinal)
 745
 746
 747 def create_commit_blob(tree, parent,
 748                        author, adate_sec, adate_tz,
 749                        committer, cdate_sec, cdate_tz,
 750                        msg):
 751     if adate_tz is not None:
 752         adate_str = _git_date_str(adate_sec, adate_tz)
 753     else:
 754         adate_str = _local_git_date_str(adate_sec)
 755     if cdate_tz is not None:
 756         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 757     else:
 758         cdate_str = _local_git_date_str(cdate_sec)
 759     l = []
 760     if tree: l.append(b'tree %s' % hexlify(tree))
 761     if parent: l.append(b'parent %s' % hexlify(parent))
 762     if author: l.append(b'author %s %s' % (author, adate_str))
 763     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 764     l.append(b'')
 765     l.append(msg)
 766     return b'\n'.join(l)
 767
 768 def _make_objcache():
 769     return PackIdxList(repo(b'objects/pack'))
 770
 771 # bup-gc assumes that it can disable all PackWriter activities
 772 # (bloom/midx/cache) via the constructor and close() arguments.
 773
 774 class PackWriter(object):
 775     """Writes Git objects inside a pack file."""
 776     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 777                  run_midx=True, on_pack_finish=None,
 778                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 779         self.closed = False
 780         self.repo_dir = repo_dir or repo()
 781         self.file = None
 782         self.parentfd = None
 783         self.count = 0
 784         self.outbytes = 0
 785         self.filename = None
 786         self.idx = None
 787         self.objcache_maker = objcache_maker
 788         self.objcache = None
 789         self.compression_level = compression_level
 790         self.run_midx=run_midx
 791         self.on_pack_finish = on_pack_finish
 792         if not max_pack_size:
 793             max_pack_size = git_config_get(b'pack.packSizeLimit',
 794                                            repo_dir=self.repo_dir,
 795                                            opttype='int')
 796             if not max_pack_size:
 797                 # larger packs slow down pruning
 798                 max_pack_size = 1000 * 1000 * 1000
 799         self.max_pack_size = max_pack_size
 800         # cache memory usage is about 83 bytes per object
 801         self.max_pack_objects = max_pack_objects if max_pack_objects \
 802                                 else max(1, self.max_pack_size // 5000)
 803
 804     def __enter__(self):
 805         return self
 806
 807     def __exit__(self, type, value, traceback):
 808         with pending_raise(value, rethrow=False):
 809             self.close()
 810
 811     def _open(self):
 812         if not self.file:
 813             objdir = dir = os.path.join(self.repo_dir, b'objects')
 814             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 815             try:
 816                 self.file = os.fdopen(fd, 'w+b')
 817             except:
 818                 os.close(fd)
 819                 raise
 820             try:
 821                 self.parentfd = os.open(objdir, os.O_RDONLY)
 822             except:
 823                 f = self.file
 824                 self.file = None
 825                 f.close()
 826                 raise
 827             assert name.endswith(b'.pack')
 828             self.filename = name[:-5]
 829             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 830             self.idx = PackIdxV2Writer()
 831
 832     def _raw_write(self, datalist, sha):
 833         self._open()
 834         f = self.file
 835         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 836         # the file never has a *partial* blob.  So let's make sure it's
 837         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 838         # to our hashsplit algorithm.)  f.write() does its own buffering,
 839         # but that's okay because we'll flush it in _end().
 840         oneblob = b''.join(datalist)
 841         try:
 842             f.write(oneblob)
 843         except IOError as e:
 844             reraise(GitError(e))
 845         nw = len(oneblob)
 846         crc = zlib.crc32(oneblob) & 0xffffffff
 847         self._update_idx(sha, crc, nw)
 848         self.outbytes += nw
 849         self.count += 1
 850         return nw, crc
 851
 852     def _update_idx(self, sha, crc, size):
 853         assert(sha)
 854         if self.idx:
 855             self.idx.add(sha, crc, self.file.tell() - size)
 856
 857     def _write(self, sha, type, content):
 858         if verbose:
 859             log('>')
 860         if not sha:
 861             sha = calc_hash(type, content)
 862         size, crc = self._raw_write(_encode_packobj(type, content,
 863                                                     self.compression_level),
 864                                     sha=sha)
 865         if self.outbytes >= self.max_pack_size \
 866            or self.count >= self.max_pack_objects:
 867             self.breakpoint()
 868         return sha
 869
 870     def _require_objcache(self):
 871         if self.objcache is None and self.objcache_maker:
 872             self.objcache = self.objcache_maker()
 873         if self.objcache is None:
 874             raise GitError(
 875                     "PackWriter not opened or can't check exists w/o objcache")
 876
 877     def exists(self, id, want_source=False):
 878         """Return non-empty if an object is found in the object cache."""
 879         self._require_objcache()
 880         return self.objcache.exists(id, want_source=want_source)
 881
 882     def just_write(self, sha, type, content):
 883         """Write an object to the pack file without checking for duplication."""
 884         self._write(sha, type, content)
 885         # If nothing else, gc doesn't have/want an objcache
 886         if self.objcache is not None:
 887             self.objcache.add(sha)
 888
 889     def maybe_write(self, type, content):
 890         """Write an object to the pack file if not present and return its id."""
 891         sha = calc_hash(type, content)
 892         if not self.exists(sha):
 893             self._require_objcache()
 894             self.just_write(sha, type, content)
 895         return sha
 896
 897     def new_blob(self, blob):
 898         """Create a blob object in the pack with the supplied content."""
 899         return self.maybe_write(b'blob', blob)
 900
 901     def new_tree(self, shalist):
 902         """Create a tree object in the pack."""
 903         content = tree_encode(shalist)
 904         return self.maybe_write(b'tree', content)
 905
 906     def new_commit(self, tree, parent,
 907                    author, adate_sec, adate_tz,
 908                    committer, cdate_sec, cdate_tz,
 909                    msg):
 910         """Create a commit object in the pack.  The date_sec values must be
 911         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 912         content = create_commit_blob(tree, parent,
 913                                      author, adate_sec, adate_tz,
 914                                      committer, cdate_sec, cdate_tz,
 915                                      msg)
 916         return self.maybe_write(b'commit', content)
 917
 918     def _end(self, run_midx=True, abort=False):
 919         # Ignores run_midx during abort
 920         self.parentfd, pfd, = None, self.parentfd
 921         self.file, f = None, self.file
 922         self.idx, idx = None, self.idx
 923         try:
 924             with nullcontext_if_not(self.objcache), \
 925                  finalized(pfd, lambda x: x is not None and os.close(x)), \
 926                  nullcontext_if_not(f):
 927                 if not f:
 928                     return None
 929
 930                 if abort:
 931                     os.unlink(self.filename + b'.pack')
 932                     return None
 933
 934                 # update object count
 935                 f.seek(8)
 936                 cp = struct.pack('!i', self.count)
 937                 assert len(cp) == 4
 938                 f.write(cp)
 939
 940                 # calculate the pack sha1sum
 941                 f.seek(0)
 942                 sum = Sha1()
 943                 for b in chunkyreader(f):
 944                     sum.update(b)
 945                 packbin = sum.digest()
 946                 f.write(packbin)
 947                 f.flush()
 948                 fdatasync(f.fileno())
 949                 f.close()
 950
 951                 idx.write(self.filename + b'.idx', packbin)
 952                 nameprefix = os.path.join(self.repo_dir,
 953                                           b'objects/pack/pack-' +  hexlify(packbin))
 954                 if os.path.exists(self.filename + b'.map'):
 955                     os.unlink(self.filename + b'.map')
 956                 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
 957                 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
 958                 os.fsync(pfd)
 959                 if run_midx:
 960                     auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
 961                 if self.on_pack_finish:
 962                     self.on_pack_finish(nameprefix)
 963                 return nameprefix
 964         finally:
 965             # Must be last -- some of the code above depends on it
 966             self.objcache = None
 967
 968     def abort(self):
 969         """Remove the pack file from disk."""
 970         self.closed = True
 971         self._end(abort=True)
 972
 973     def breakpoint(self):
 974         """Clear byte and object counts and return the last processed id."""
 975         id = self._end(self.run_midx)
 976         self.outbytes = self.count = 0
 977         return id
 978
 979     def close(self, run_midx=True):
 980         """Close the pack file and move it to its definitive path."""
 981         self.closed = True
 982         return self._end(run_midx=run_midx)
 983
 984     def __del__(self):
 985         assert self.closed
 986
 987
 988 class PackIdxV2Writer:
 989     def __init__(self):
 990         self.idx = list(list() for i in range(256))
 991         self.count = 0
 992
 993     def add(self, sha, crc, offs):
 994         assert(sha)
 995         self.count += 1
 996         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 997
 998     def write(self, filename, packbin):
 999         ofs64_count = 0
1000         for section in self.idx:
1001             for entry in section:
1002                 if entry[2] >= 2**31:
1003                     ofs64_count += 1
1004
1005         # Length: header + fan-out + shas-and-crcs + overflow-offsets
1006         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
1007         idx_map = None
1008         idx_f = open(filename, 'w+b')
1009         try:
1010             idx_f.truncate(index_len)
1011             fdatasync(idx_f.fileno())
1012             idx_map = mmap_readwrite(idx_f, close=False)
1013             try:
1014                 count = _helpers.write_idx(filename, idx_map, self.idx,
1015                                            self.count)
1016                 assert(count == self.count)
1017                 idx_map.flush()
1018             finally:
1019                 idx_map.close()
1020         finally:
1021             idx_f.close()
1022
1023         idx_f = open(filename, 'a+b')
1024         try:
1025             idx_f.write(packbin)
1026             idx_f.seek(0)
1027             idx_sum = Sha1()
1028             b = idx_f.read(8 + 4*256)
1029             idx_sum.update(b)
1030
1031             for b in chunkyreader(idx_f, 20 * self.count):
1032                 idx_sum.update(b)
1033
1034             for b in chunkyreader(idx_f):
1035                 idx_sum.update(b)
1036             idx_f.write(idx_sum.digest())
1037             fdatasync(idx_f.fileno())
1038         finally:
1039             idx_f.close()
1040
1041
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield a (refname, hash) tuple for each ref reported by "git
    show-ref", where hash is the binary (unhexlified) id.  When
    patterns are given, they're passed along so that only matching
    refs are reported (cf. git-show-ref(1)).  limit_to_heads and
    limit_to_tags add --heads and --tags respectively; both may be
    specified.

    """
    cmd = [b'git', b'show-ref']
    if limit_to_heads:
        cmd.append(b'--heads')
    if limit_to_tags:
        cmd.append(b'--tags')
    cmd.append(b'--')
    if patterns:
        cmd.extend(patterns)
    proc = subprocess.Popen(cmd, env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE, close_fds=True)
    output = proc.stdout.read().strip()
    status = proc.wait()  # not fatal
    if status:
        # A failing show-ref is tolerated as long as it produced nothing
        assert(not output)
    if not output:
        return
    for line in output.split(b'\n'):
        oidx, refname = line.split(b' ', 1)
        yield refname, unhexlify(oidx)
1069
1070
def read_ref(refname, repo_dir = None):
    """Return the binary id that refname refers to, or None if
    list_refs() (limited to heads) reports no match.  More than one
    match is an assertion failure."""
    found = list_refs(patterns=[refname], repo_dir=repo_dir,
                      limit_to_heads=True)
    # Ask for at most two so that ambiguity can be detected cheaply
    matches = tuple(islice(found, 2))
    if not matches:
        return None
    assert(len(matches) == 1)
    _, oid = matches[0]
    return oid
1080
1081
def rev_list_invocation(ref_or_refs, format=None):
    """Return the "git rev-list" argv for a single ref (bytes) or an
    iterable of refs, adding --pretty=format:FORMAT when a format is
    given.  No ref may start with a dash."""
    refs = (ref_or_refs,) if isinstance(ref_or_refs, bytes) else ref_or_refs
    cmd = [b'git', b'rev-list']

    if format:
        cmd.append(b'--pretty=format:' + format)
    for ref in refs:
        # Don't let a ref be mistaken for an option
        assert not ref.startswith(b'-')
        cmd.append(ref)
    cmd.append(b'--')
    return cmd
1096
1097
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as reported by "git rev-list".

    Without a format, yield each (stripped) output line, i.e. one hex
    hash at a time.  With a format (which requires a parse function,
    and vice versa), pass the format to rev-list, and for each commit
    yield (oidx, parse(git_stdout)), calling parse with the stream
    positioned just after the rev-list "commit HASH" header line.
    Raise GitError if rev-list exits with a non-zero status.

    """
    assert bool(parse) == bool(format)
    cmd = rev_list_invocation(ref_or_refs, format=format)
    proc = subprocess.Popen(cmd,
                            env=_gitenv(repo_dir),
                            stdout = subprocess.PIPE,
                            close_fds=True)
    out = proc.stdout
    if format:
        # Each commit begins with a "commit HASH" line; parse() is
        # responsible for consuming the rest of that commit's output.
        for header in iter(out.readline, b''):
            header = header.strip()
            if not header.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(header))
            oidx = header[7:]
            assert len(oidx) == 40
            yield oidx, parse(out)
    else:
        for line in out:
            yield line.strip()

    status = proc.wait()  # not fatal
    if status:
        raise GitError('git rev-list returned error %d' % status)
1130
1131
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    First try committish as a ref name (via read_ref), and then, if
    it's 40 characters long, as a hex object id that must exist in the
    repository's pack indexes.

    Returns the binary (unhexlified) value of the hash if it is found,
    None if 'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Not valid hex.  Python 2 raises TypeError here; Python 3
            # raises binascii.Error, which is a ValueError.
            return None

        with PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) as pL:
            if pL.exists(oid):
                return oid

    return None
1156
1157
def update_ref(refname, newval, oldval, repo_dir=None, force=False):
    """Point refname (which must be under refs/heads/ or refs/tags/)
    at newval via "git update-ref".

    With force=True, the previous value isn't checked and oldval must
    be None.  With force=False, oldval must be either the sha1 the ref
    is expected to have, or None (for an entirely new branch).
    """
    if force:
        assert oldval is None
        old_args = []
    else:
        # An empty old value is passed along when there's no oldval
        old_args = [hexlify(oldval)] if oldval else [b'']
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    cmd = [b'git', b'update-ref', refname, hexlify(newval)] + old_args
    proc = subprocess.Popen(cmd, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', proc)
1179
1180
def delete_ref(refname, oldvalue=None):
    """Delete refname (which must be under refs/) from the default
    repository via "git update-ref -d", passing oldvalue along when
    given (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    cmd = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        cmd.append(oldvalue)
    proc = subprocess.Popen(cmd, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', proc)
1189
1190
def guess_repo(path=None):
    """Settle on a value for the global "repodir" without requiring
    that a repository actually exist there.

    A given path always wins.  Otherwise an already set repodir is
    kept, and failing that, the BUP_DIR environment variable is used,
    or finally ~/.bup.  Code that needs an existing repository would
    usually call check_repo_or_die() instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1205
1206
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The path is resolved via guess_repo().  Raise GitError if the
    repository's parent directory doesn't exist, or if the repository
    path exists but isn't a directory.  After "git init", force
    pack.indexVersion to 2 and enable core.logAllRefUpdates.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    # Two dirname() calls: one to drop the trailing /, one to go up
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    # (Every argv element is bytes, matching the other git invocations.)
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    # Enable the reflog
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
1230
1231
def check_repo_or_die(path=None):
    """Return quietly if a bup repository probably exists (i.e. its
    objects/pack is a directory).  Otherwise log an error and exit
    with status 15 if the repository path doesn't exist at all, or 14
    if it exists but doesn't look like a repository."""
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1247
1248
def is_suitable_git(ver_str):
    """Classify "git --version" output (bytes) as 'unrecognized' (no
    "git version " prefix), 'insufficient' (older than 1.5.6, or a
    1.5.6 release candidate), or 'suitable'.  Exit with status 13 if
    the prefix is followed by something that doesn't start with a
    digit."""
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    ver = ver_str[len(prefix):]
    if ver.startswith(b'0.'):
        return 'insufficient'
    if ver.startswith(b'1.'):
        # Every 1.x version that predates the 1.5.6 release
        too_old = (br'1\.[012345]rc',
                   br'1\.[01234]\.',
                   br'1\.5\.[012345]($|\.)',
                   br'1\.5\.6-rc')
        if any(re.match(pattern, ver) for pattern in too_old):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver):
        return 'suitable'
    sys.exit(13)
1268
1269 _git_great = None
1270
def require_suitable_git(ver_str=None):
    """Insist on a suitable version of git, remembering a successful
    check so that later calls return immediately.

    Rely on ver_str when provided, rather than invoking the git in the
    path.  The check is skipped entirely when BUP_GIT_VERSION_IS_FINE
    is set to yes, true, or 1 (in any case).  Raise GitError if the
    version output isn't recognized, and log an error and exit with
    status 1 if the version is too old.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    # is_suitable_git() returned something this code doesn't know about
    assert False
1297
1298
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    The "git cat-file --batch" subprocess is started lazily by get(),
    and restarted whenever it's found to have exited.  Only one get()
    may be in progress at a time.

    """
    def __init__(self, repo_dir = None):
        """Prepare a pipe for repo_dir (None for the default
        repository) after checking that git is suitable."""
        require_suitable_git()
        self.repo_dir = repo_dir
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the subprocess, if there is one.

        Return the subprocess exit status if wait is true and a
        subprocess was running, otherwise None.

        """
        self.p, p = None, self.p
        self.inprogress = None
        if not p:
            # Never started (or already closed), so there's nothing to
            # close or wait for.
            return None
        try:
            p.stdout.close()
        finally:
            # This will handle pending exceptions correctly once
            # we drop py2
            p.stdin.close()
        if wait:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Close any existing subprocess and start a new one."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Raise GitError if cat-file closes its output unexpectedly or
        its header line doesn't have the expected shape.  The ref must
        not contain a newline or carriage return, or start with a dash.

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if not hdr:
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        try:
            yield oidx, typ, size
            for blob in chunkyreader(self.p.stdout, size):
                yield blob
            # The object's content is followed by a single newline
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception as ex:
            # The stream position is no longer trustworthy, so drop the
            # subprocess; the next get() will start a new one.
            with pending_raise(ex):
                self.close()

    def _join(self, it):
        """Yield the blob content reachable from the object that the
        get() iterator it refers to, recursing through trees, and from
        a commit to its tree."""
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of a commit is "tree HEXID"
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1401
1402
1403 _cp = {}
1404
def cp(repo_dir=None):
    """Return the CatPipe cached for repo_dir (by absolute path),
    creating and caching one first if necessary.  Without a repo_dir,
    the default repository is used."""
    if not repo_dir:
        repo_dir = repodir or repo()
    key = os.path.abspath(repo_dir)
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1416
1417
def close_catpipes():
    """Remove every cached CatPipe from the cache, closing each one
    and waiting for its subprocess."""
    # FIXME: chain exceptions
    while _cp:
        _cp.popitem()[1].close(wait=True)
1423
1424
def tags(repo_dir = None):
    """Return a dictionary mapping each tagged hash to the list of tag
    names (without the refs/tags/ prefix) that refer to it, i.e.
    {hash: [tag_names, ...]}."""
    names_by_oid = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        # more than one tag can point at the same oid
        names_by_oid.setdefault(oid, []).append(refname[10:])
    return names_by_oid
1435
1436
class MissingObject(KeyError):
    """KeyError for an object id that's missing; the binary id is
    available as the oid attribute."""
    def __init__(self, oid):
        self.oid = oid
        message = 'object %r is missing' % hexlify(oid)
        super(MissingObject, self).__init__(message)
1441
1442
WalkItem = namedtuple('WalkItem', [
    'oid',
    'type',
    'mode',
    'path',
    'chunk_path',
    'data',
])
# The path is the mangled path.  When an item represents a fragment of
# a chunked file, chunk_path is the chunked subtree path for that
# chunk, i.e. ['', '2d3115e', ...], and the top-level path for a
# chunked file has a chunk_path of [''].  So some chunk subtree of the
# file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1455
1456
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    Commits lead to their parents and their tree, and trees lead to
    their entries.  An Exception is raised for any object whose type
    isn't blob, commit, or tree.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (oidx, parent_path, chunk_path, mode); the mode is
    # None for the starting object.
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        # The first item from get_ref is the (oidx, type, size) header;
        # the remaining items are the object's content.
        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commit and tree content is always needed in order to
            # find what they refer to, even if it won't be returned.
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            # Parents inherit this commit's path, chunk_path, and mode
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            # Note that this loop rebinds mode to each entry's mode
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the file's path
                    # stays put and only the chunk_path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))