lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12
  13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  14 from bup.compat import (buffer,
  15                         byte_int, bytes_from_byte, bytes_from_uint,
  16                         environ,
  17                         items,
  18                         range,
  19                         reraise)
  20 from bup.io import path_msg
  21 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  22                          exo,
  23                          fdatasync,
  24                          log,
  25                          merge_dict,
  26                          merge_iter,
  27                          mmap_read, mmap_readwrite,
  28                          progress, qprogress, stat_if_exists,
  29                          unlink,
  30                          utc_offset_str)
  31
  32
# Nonzero enables extra output (e.g. PackWriter._write logs '>' per object).
verbose = 0
repodir = None  # The default repository, once initialized

# Object type name -> numeric type code placed in a pack object header
# by _encode_packobj(), and the reverse mapping used by _decode_packobj().
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}


# Statistics counters maintained by the index lookup code
# (PackIdx._idx_from_hash and PackIdxList.exists).
_total_searches = 0
_total_steps = 0
  42
  43
  44 class GitError(Exception):
  45     pass
  46
  47
  48 def _gitenv(repo_dir=None):
  49     if not repo_dir:
  50         repo_dir = repo()
  51     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  52
  53 def _git_wait(cmd, p):
  54     rv = p.wait()
  55     if rv != 0:
  56         raise GitError('%r returned %d' % (cmd, rv))
  57
  58 def _git_exo(cmd, **kwargs):
  59     kwargs['check'] = False
  60     result = exo(cmd, **kwargs)
  61     _, _, proc = result
  62     if proc.returncode != 0:
  63         raise GitError('%r returned %d' % (cmd, proc.returncode))
  64     return result
  65
  66 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
  67     assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
  68     cmd = [b'git', b'config', b'--null']
  69     if cfg_file:
  70         cmd.extend([b'--file', cfg_file])
  71     if opttype == 'int':
  72         cmd.extend([b'--int'])
  73     elif opttype == 'bool':
  74         cmd.extend([b'--bool'])
  75     else:
  76         assert opttype is None
  77     cmd.extend([b'--get', option])
  78     env=None
  79     if repo_dir:
  80         env = _gitenv(repo_dir=repo_dir)
  81     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
  82                          close_fds=True)
  83     # with --null, git writes out a trailing \0 after the value
  84     r = p.stdout.read()[:-1]
  85     rc = p.wait()
  86     if rc == 0:
  87         if opttype == 'int':
  88             return int(r)
  89         elif opttype == 'bool':
  90             # git converts to 'true' or 'false'
  91             return r == b'true'
  92         return r
  93     if rc != 1:
  94         raise GitError('%r returned %d' % (cmd, rc))
  95     return None
  96
  97
  98 def parse_tz_offset(s):
  99     """UTC offset in seconds."""
 100     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
 101     if bytes_from_byte(s[0]) == b'-':
 102         return - tz_off
 103     return tz_off
 104
 105 def parse_commit_gpgsig(sig):
 106     """Return the original signature bytes.
 107
 108     i.e. with the "gpgsig " header and the leading space character on
 109     each continuation line removed.
 110
 111     """
 112     if not sig:
 113         return None
 114     assert sig.startswith(b'gpgsig ')
 115     sig = sig[7:]
 116     return sig.replace(b'\n ', b'\n')
 117
 118 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
 119 # Make sure that's authoritative.
 120
 121 # See also
 122 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
 123 # The continuation lines have only one leading space.
 124
# A byte permitted as the first or last character of a name/mail string.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# A byte permitted anywhere inside a name/mail string.
_content_char = br'[^\0\n<>]'
# Either one or two start/end characters, or a start/end character,
# any run of content characters, and a final start/end character.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# A sign, two hour digits and two minute digits (minutes 00-59).
_tz_rx = br'[-+]\d\d[0-5]\d'
# One "parent <40 hex digits>" header line.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Matches a whole commit object: the tree line, zero or more parent
# lines, the author and committer lines, an optional mergetag, an
# optional gpgsig header, then a blank line and the message.
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts the hex id from each parent line of the 'parents' group.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 144
 145 # Note that the author_sec and committer_sec values are (UTC) epoch
 146 # seconds, and for now the mergetag is not included.
 147 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 148                                        'author_name', 'author_mail',
 149                                        'author_sec', 'author_offset',
 150                                        'committer_name', 'committer_mail',
 151                                        'committer_sec', 'committer_offset',
 152                                        'gpgsig',
 153                                        'message'])
 154
 155 def parse_commit(content):
 156     commit_match = re.match(_commit_rx, content)
 157     if not commit_match:
 158         raise Exception('cannot parse commit %r' % content)
 159     matches = commit_match.groupdict()
 160     return CommitInfo(tree=matches['tree'],
 161                       parents=re.findall(_parent_hash_rx, matches['parents']),
 162                       author_name=matches['author_name'],
 163                       author_mail=matches['author_mail'],
 164                       author_sec=int(matches['asec']),
 165                       author_offset=parse_tz_offset(matches['atz']),
 166                       committer_name=matches['committer_name'],
 167                       committer_mail=matches['committer_mail'],
 168                       committer_sec=int(matches['csec']),
 169                       committer_offset=parse_tz_offset(matches['ctz']),
 170                       gpgsig=parse_commit_gpgsig(matches['gpgsig']),
 171                       message=matches['message'])
 172
 173
 174 def get_cat_data(cat_iterator, expected_type):
 175     _, kind, _ = next(cat_iterator)
 176     if kind != expected_type:
 177         raise Exception('expected %r, saw %r' % (expected_type, kind))
 178     return b''.join(cat_iterator)
 179
 180 def get_commit_items(id, cp):
 181     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 182
 183 def _local_git_date_str(epoch_sec):
 184     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 185
 186
 187 def _git_date_str(epoch_sec, tz_offset_sec):
 188     offs =  tz_offset_sec // 60
 189     return b'%d %s%02d%02d' \
 190         % (epoch_sec,
 191            b'+' if offs >= 0 else b'-',
 192            abs(offs) // 60,
 193            abs(offs) % 60)
 194
 195
 196 def repo(sub = b'', repo_dir=None):
 197     """Get the path to the git repository or one of its subdirectories."""
 198     repo_dir = repo_dir or repodir
 199     if not repo_dir:
 200         raise GitError('You should call check_repo_or_die()')
 201
 202     # If there's a .git subdirectory, then the actual repo is in there.
 203     gd = os.path.join(repo_dir, b'.git')
 204     if os.path.exists(gd):
 205         repo_dir = gd
 206
 207     return os.path.join(repo_dir, sub)
 208
 209
 210 _shorten_hash_rx = \
 211     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 212
 213 def shorten_hash(s):
 214     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 215
 216
 217 def repo_rel(path):
 218     full = os.path.abspath(path)
 219     fullrepo = os.path.abspath(repo(b''))
 220     if not fullrepo.endswith(b'/'):
 221         fullrepo += b'/'
 222     if full.startswith(fullrepo):
 223         path = full[len(fullrepo):]
 224     if path.startswith(b'index-cache/'):
 225         path = path[len(b'index-cache/'):]
 226     return shorten_hash(path)
 227
 228
 229 def auto_midx(objdir):
 230     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 231     try:
 232         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 233     except OSError as e:
 234         # make sure 'args' gets printed to help with debugging
 235         add_error('%r: exception: %s' % (args, e))
 236         raise
 237     if rv:
 238         add_error('%r: returned %d' % (args, rv))
 239
 240     args = [path.exe(), b'bloom', b'--dir', objdir]
 241     try:
 242         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 243     except OSError as e:
 244         # make sure 'args' gets printed to help with debugging
 245         add_error('%r: exception: %s' % (args, e))
 246         raise
 247     if rv:
 248         add_error('%r: returned %d' % (args, rv))
 249
 250
 251 def mangle_name(name, mode, gitmode):
 252     """Mangle a file name to present an abstract name for segmented files.
 253     Mangled file names will have the ".bup" extension added to them. If a
 254     file's name already ends with ".bup", a ".bupl" extension is added to
 255     disambiguate normal files from segmented ones.
 256     """
 257     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 258         assert(stat.S_ISDIR(gitmode))
 259         return name + b'.bup'
 260     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 261         return name + b'.bupl'
 262     else:
 263         return name
 264
 265
 266 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 267 def demangle_name(name, mode):
 268     """Remove name mangling from a file name, if necessary.
 269
 270     The return value is a tuple (demangled_filename,mode), where mode is one of
 271     the following:
 272
 273     * BUP_NORMAL  : files that should be read as-is from the repository
 274     * BUP_CHUNKED : files that were chunked and need to be reassembled
 275
 276     For more information on the name mangling algorithm, see mangle_name()
 277     """
 278     if name.endswith(b'.bupl'):
 279         return (name[:-5], BUP_NORMAL)
 280     elif name.endswith(b'.bup'):
 281         return (name[:-4], BUP_CHUNKED)
 282     elif name.endswith(b'.bupm'):
 283         return (name[:-5],
 284                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 285     return (name, BUP_NORMAL)
 286
 287
 288 def calc_hash(type, content):
 289     """Calculate some content's hash in the Git fashion."""
 290     header = b'%s %d\0' % (type, len(content))
 291     sum = Sha1(header)
 292     sum.update(content)
 293     return sum.digest()
 294
 295
 296 def shalist_item_sort_key(ent):
 297     (mode, name, id) = ent
 298     assert(mode+0 == mode)
 299     if stat.S_ISDIR(mode):
 300         return name + b'/'
 301     else:
 302         return name
 303
 304
 305 def tree_encode(shalist):
 306     """Generate a git tree object from (mode,name,hash) tuples."""
 307     shalist = sorted(shalist, key = shalist_item_sort_key)
 308     l = []
 309     for (mode,name,bin) in shalist:
 310         assert(mode)
 311         assert(mode+0 == mode)
 312         assert(name)
 313         assert(len(bin) == 20)
 314         s = b'%o %s\0%s' % (mode,name,bin)
 315         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 316         l.append(s)
 317     return b''.join(l)
 318
 319
 320 def tree_decode(buf):
 321     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 322     ofs = 0
 323     while ofs < len(buf):
 324         z = buf.find(b'\0', ofs)
 325         assert(z > ofs)
 326         spl = buf[ofs:z].split(b' ', 1)
 327         assert(len(spl) == 2)
 328         mode,name = spl
 329         sha = buf[z+1:z+1+20]
 330         ofs = z+1+20
 331         yield (int(mode, 8), name, sha)
 332
 333
 334 def _encode_packobj(type, content, compression_level=1):
 335     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 336         raise ValueError('invalid compression level %s' % compression_level)
 337     szout = b''
 338     sz = len(content)
 339     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 340     sz >>= 4
 341     while 1:
 342         if sz: szbits |= 0x80
 343         szout += bytes_from_uint(szbits)
 344         if not sz:
 345             break
 346         szbits = sz & 0x7f
 347         sz >>= 7
 348     z = zlib.compressobj(compression_level)
 349     yield szout
 350     yield z.compress(content)
 351     yield z.flush()
 352
 353
 354 def _decode_packobj(buf):
 355     assert(buf)
 356     c = byte_int(buf[0])
 357     type = _typermap[(c & 0x70) >> 4]
 358     sz = c & 0x0f
 359     shift = 4
 360     i = 0
 361     while c & 0x80:
 362         i += 1
 363         c = byte_int(buf[i])
 364         sz |= (c & 0x7f) << shift
 365         shift += 7
 366         if not (c & 0x80):
 367             break
 368     return (type, zlib.decompress(buf[i+1:]))
 369
 370
 371 class PackIdx:
 372     def __init__(self):
 373         assert(0)
 374
 375     def find_offset(self, hash):
 376         """Get the offset of an object inside the index file."""
 377         idx = self._idx_from_hash(hash)
 378         if idx != None:
 379             return self._ofs_from_idx(idx)
 380         return None
 381
 382     def exists(self, hash, want_source=False):
 383         """Return nonempty if the object exists in this index."""
 384         if hash and (self._idx_from_hash(hash) != None):
 385             return want_source and os.path.basename(self.name) or True
 386         return None
 387
 388     def _idx_from_hash(self, hash):
 389         global _total_searches, _total_steps
 390         _total_searches += 1
 391         assert(len(hash) == 20)
 392         b1 = byte_int(hash[0])
 393         start = self.fanout[b1-1] # range -1..254
 394         end = self.fanout[b1] # range 0..255
 395         want = hash
 396         _total_steps += 1  # lookup table is a step
 397         while start < end:
 398             _total_steps += 1
 399             mid = start + (end - start) // 2
 400             v = self._idx_to_hash(mid)
 401             if v < want:
 402                 start = mid+1
 403             elif v > want:
 404                 end = mid
 405             else: # got it!
 406                 return mid
 407         return None
 408
 409
 410 class PackIdxV1(PackIdx):
 411     """Object representation of a Git pack index (version 1) file."""
 412     def __init__(self, filename, f):
 413         self.name = filename
 414         self.idxnames = [self.name]
 415         self.map = mmap_read(f)
 416         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 417         self.fanout = array('L', struct.unpack('!256I', self.map))
 418         self.fanout.append(0)  # entry "-1"
 419         self.nsha = self.fanout[255]
 420         self.sha_ofs = 256 * 4
 421         # Avoid slicing shatable for individual hashes (very high overhead)
 422         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 423
 424     def __enter__(self):
 425         return self
 426
 427     def __exit__(self, type, value, traceback):
 428         self.close()
 429
 430     def __len__(self):
 431         return int(self.nsha)  # int() from long for python 2
 432
 433     def _ofs_from_idx(self, idx):
 434         if idx >= self.nsha or idx < 0:
 435             raise IndexError('invalid pack index index %d' % idx)
 436         ofs = self.sha_ofs + idx * 24
 437         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 438
 439     def _idx_to_hash(self, idx):
 440         if idx >= self.nsha or idx < 0:
 441             raise IndexError('invalid pack index index %d' % idx)
 442         ofs = self.sha_ofs + idx * 24 + 4
 443         return self.map[ofs : ofs + 20]
 444
 445     def __iter__(self):
 446         start = self.sha_ofs + 4
 447         for ofs in range(start, start + 24 * self.nsha, 24):
 448             yield self.map[ofs : ofs + 20]
 449
 450     def close(self):
 451         if self.map is not None:
 452             self.shatable = None
 453             self.map.close()
 454             self.map = None
 455
 456
 457 class PackIdxV2(PackIdx):
 458     """Object representation of a Git pack index (version 2) file."""
 459     def __init__(self, filename, f):
 460         self.name = filename
 461         self.idxnames = [self.name]
 462         self.map = mmap_read(f)
 463         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 464         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 465         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 466         self.fanout.append(0)
 467         self.nsha = self.fanout[255]
 468         self.sha_ofs = 8 + 256*4
 469         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 470         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 471         # Avoid slicing this for individual hashes (very high overhead)
 472         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 473
 474     def __enter__(self):
 475         return self
 476
 477     def __exit__(self, type, value, traceback):
 478         self.close()
 479
 480     def __len__(self):
 481         return int(self.nsha)  # int() from long for python 2
 482
 483     def _ofs_from_idx(self, idx):
 484         if idx >= self.nsha or idx < 0:
 485             raise IndexError('invalid pack index index %d' % idx)
 486         ofs_ofs = self.ofstable_ofs + idx * 4
 487         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 488         if ofs & 0x80000000:
 489             idx64 = ofs & 0x7fffffff
 490             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 491             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 492         return ofs
 493
 494     def _idx_to_hash(self, idx):
 495         if idx >= self.nsha or idx < 0:
 496             raise IndexError('invalid pack index index %d' % idx)
 497         ofs = self.sha_ofs + idx * 20
 498         return self.map[ofs : ofs + 20]
 499
 500     def __iter__(self):
 501         start = self.sha_ofs
 502         for ofs in range(start, start + 20 * self.nsha, 20):
 503             yield self.map[ofs : ofs + 20]
 504
 505     def close(self):
 506         if self.map is not None:
 507             self.shatable = None
 508             self.map.close()
 509             self.map = None
 510
 511
 512 _mpi_count = 0
 513 class PackIdxList:
 514     def __init__(self, dir, ignore_midx=False):
 515         global _mpi_count
 516         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 517         _mpi_count += 1
 518         self.dir = dir
 519         self.also = set()
 520         self.packs = []
 521         self.do_bloom = False
 522         self.bloom = None
 523         self.ignore_midx = ignore_midx
 524         self.refresh()
 525
 526     def __del__(self):
 527         global _mpi_count
 528         _mpi_count -= 1
 529         assert(_mpi_count == 0)
 530
 531     def __iter__(self):
 532         return iter(idxmerge(self.packs))
 533
 534     def __len__(self):
 535         return sum(len(pack) for pack in self.packs)
 536
 537     def exists(self, hash, want_source=False):
 538         """Return nonempty if the object exists in the index files."""
 539         global _total_searches
 540         _total_searches += 1
 541         if hash in self.also:
 542             return True
 543         if self.do_bloom and self.bloom:
 544             if self.bloom.exists(hash):
 545                 self.do_bloom = False
 546             else:
 547                 _total_searches -= 1  # was counted by bloom
 548                 return None
 549         for i in range(len(self.packs)):
 550             p = self.packs[i]
 551             _total_searches -= 1  # will be incremented by sub-pack
 552             ix = p.exists(hash, want_source=want_source)
 553             if ix:
 554                 # reorder so most recently used packs are searched first
 555                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 556                 return ix
 557         self.do_bloom = True
 558         return None
 559
 560     def refresh(self, skip_midx = False):
 561         """Refresh the index list.
 562         This method verifies if .midx files were superseded (e.g. all of its
 563         contents are in another, bigger .midx file) and removes the superseded
 564         files.
 565
 566         If skip_midx is True, all work on .midx files will be skipped and .midx
 567         files will be removed from the list.
 568
 569         The instance variable 'ignore_midx' can force this function to
 570         always act as if skip_midx was True.
 571         """
 572         if self.bloom is not None:
 573             self.bloom.close()
 574         self.bloom = None # Always reopen the bloom as it may have been relaced
 575         self.do_bloom = False
 576         skip_midx = skip_midx or self.ignore_midx
 577         d = dict((p.name, p) for p in self.packs
 578                  if not skip_midx or not isinstance(p, midx.PackMidx))
 579         if os.path.exists(self.dir):
 580             if not skip_midx:
 581                 midxl = []
 582                 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
 583                 # remove any *.midx files from our list that no longer exist
 584                 for ix in list(d.values()):
 585                     if not isinstance(ix, midx.PackMidx):
 586                         continue
 587                     if ix.name in midxes:
 588                         continue
 589                     # remove the midx
 590                     del d[ix.name]
 591                     ix.close()
 592                     self.packs.remove(ix)
 593                 for ix in self.packs:
 594                     if isinstance(ix, midx.PackMidx):
 595                         for name in ix.idxnames:
 596                             d[os.path.join(self.dir, name)] = ix
 597                 for full in midxes:
 598                     if not d.get(full):
 599                         mx = midx.PackMidx(full)
 600                         (mxd, mxf) = os.path.split(mx.name)
 601                         broken = False
 602                         for n in mx.idxnames:
 603                             if not os.path.exists(os.path.join(mxd, n)):
 604                                 log(('warning: index %s missing\n'
 605                                      '  used by %s\n')
 606                                     % (path_msg(n), path_msg(mxf)))
 607                                 broken = True
 608                         if broken:
 609                             mx.close()
 610                             del mx
 611                             unlink(full)
 612                         else:
 613                             midxl.append(mx)
 614                 midxl.sort(key=lambda ix:
 615                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 616                 for ix in midxl:
 617                     any_needed = False
 618                     for sub in ix.idxnames:
 619                         found = d.get(os.path.join(self.dir, sub))
 620                         if not found or isinstance(found, PackIdx):
 621                             # doesn't exist, or exists but not in a midx
 622                             any_needed = True
 623                             break
 624                     if any_needed:
 625                         d[ix.name] = ix
 626                         for name in ix.idxnames:
 627                             d[os.path.join(self.dir, name)] = ix
 628                     elif not ix.force_keep:
 629                         debug1('midx: removing redundant: %s\n'
 630                                % path_msg(os.path.basename(ix.name)))
 631                         ix.close()
 632                         unlink(ix.name)
 633             for full in glob.glob(os.path.join(self.dir, b'*.idx')):
 634                 if not d.get(full):
 635                     try:
 636                         ix = open_idx(full)
 637                     except GitError as e:
 638                         add_error(e)
 639                         continue
 640                     d[full] = ix
 641             bfull = os.path.join(self.dir, b'bup.bloom')
 642             if self.bloom is None and os.path.exists(bfull):
 643                 self.bloom = bloom.ShaBloom(bfull)
 644             self.packs = list(set(d.values()))
 645             self.packs.sort(reverse=True, key=lambda x: len(x))
 646             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 647                 self.do_bloom = True
 648             else:
 649                 self.bloom = None
 650         debug1('PackIdxList: using %d index%s.\n'
 651             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 652
 653     def add(self, hash):
 654         """Insert an additional object in the list."""
 655         self.also.add(hash)
 656
 657
 658 def open_idx(filename):
 659     if filename.endswith(b'.idx'):
 660         f = open(filename, 'rb')
 661         header = f.read(8)
 662         if header[0:4] == b'\377tOc':
 663             version = struct.unpack('!I', header[4:8])[0]
 664             if version == 2:
 665                 return PackIdxV2(filename, f)
 666             else:
 667                 raise GitError('%s: expected idx file version 2, got %d'
 668                                % (path_msg(filename), version))
 669         elif len(header) == 8 and header[0:4] < b'\377tOc':
 670             return PackIdxV1(filename, f)
 671         else:
 672             raise GitError('%s: unrecognized idx file header'
 673                            % path_msg(filename))
 674     elif filename.endswith(b'.midx'):
 675         return midx.PackMidx(filename)
 676     else:
 677         raise GitError('idx filenames must end with .idx or .midx')
 678
 679
 680 def idxmerge(idxlist, final_progress=True):
 681     """Generate a list of all the objects reachable in a PackIdxList."""
 682     def pfunc(count, total):
 683         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 684                   % (count*100.0/total, count, total))
 685     def pfinal(count, total):
 686         if final_progress:
 687             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 688                      % (100, total, total))
 689     return merge_iter(idxlist, 10024, pfunc, pfinal)
 690
 691
 692 def create_commit_blob(tree, parent,
 693                        author, adate_sec, adate_tz,
 694                        committer, cdate_sec, cdate_tz,
 695                        msg):
 696     if adate_tz is not None:
 697         adate_str = _git_date_str(adate_sec, adate_tz)
 698     else:
 699         adate_str = _local_git_date_str(adate_sec)
 700     if cdate_tz is not None:
 701         cdate_str = _git_date_str(cdate_sec, cdate_tz)
 702     else:
 703         cdate_str = _local_git_date_str(cdate_sec)
 704     l = []
 705     if tree: l.append(b'tree %s' % hexlify(tree))
 706     if parent: l.append(b'parent %s' % hexlify(parent))
 707     if author: l.append(b'author %s %s' % (author, adate_str))
 708     if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 709     l.append(b'')
 710     l.append(msg)
 711     return b'\n'.join(l)
 712
 713
 714 def _make_objcache():
 715     return PackIdxList(repo(b'objects/pack'))
 716
 717 # bup-gc assumes that it can disable all PackWriter activities
 718 # (bloom/midx/cache) via the constructor and close() arguments.
 719
 720 class PackWriter:
 721     """Writes Git objects inside a pack file."""
 722     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 723                  run_midx=True, on_pack_finish=None,
 724                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 725         self.repo_dir = repo_dir or repo()
 726         self.file = None
 727         self.parentfd = None
 728         self.count = 0
 729         self.outbytes = 0
 730         self.filename = None
 731         self.idx = None
 732         self.objcache_maker = objcache_maker
 733         self.objcache = None
 734         self.compression_level = compression_level
 735         self.run_midx=run_midx
 736         self.on_pack_finish = on_pack_finish
 737         if not max_pack_size:
 738             max_pack_size = git_config_get(b'pack.packSizeLimit',
 739                                            repo_dir=self.repo_dir,
 740                                            opttype='int')
 741             if not max_pack_size:
 742                 # larger packs slow down pruning
 743                 max_pack_size = 1000 * 1000 * 1000
 744         self.max_pack_size = max_pack_size
 745         # cache memory usage is about 83 bytes per object
 746         self.max_pack_objects = max_pack_objects if max_pack_objects \
 747                                 else max(1, self.max_pack_size // 5000)
 748
    def __del__(self):
        """Close the writer when the object is being destroyed."""
        self.close()
 751
    def __enter__(self):
        """Return the writer itself for use in a with statement."""
        return self
 754
    def __exit__(self, type, value, traceback):
        """Close the writer on leaving the with block.

        Nothing is returned, so an exception raised in the block is not
        suppressed.
        """
        self.close()
 757
 758     def _open(self):
 759         if not self.file:
 760             objdir = dir = os.path.join(self.repo_dir, b'objects')
 761             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 762             try:
 763                 self.file = os.fdopen(fd, 'w+b')
 764             except:
 765                 os.close(fd)
 766                 raise
 767             try:
 768                 self.parentfd = os.open(objdir, os.O_RDONLY)
 769             except:
 770                 f = self.file
 771                 self.file = None
 772                 f.close()
 773                 raise
 774             assert name.endswith(b'.pack')
 775             self.filename = name[:-5]
 776             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 777             self.idx = PackIdxV2Writer()
 778
 779     def _raw_write(self, datalist, sha):
 780         self._open()
 781         f = self.file
 782         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 783         # the file never has a *partial* blob.  So let's make sure it's
 784         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 785         # to our hashsplit algorithm.)  f.write() does its own buffering,
 786         # but that's okay because we'll flush it in _end().
 787         oneblob = b''.join(datalist)
 788         try:
 789             f.write(oneblob)
 790         except IOError as e:
 791             reraise(GitError(e))
 792         nw = len(oneblob)
 793         crc = zlib.crc32(oneblob) & 0xffffffff
 794         self._update_idx(sha, crc, nw)
 795         self.outbytes += nw
 796         self.count += 1
 797         return nw, crc
 798
 799     def _update_idx(self, sha, crc, size):
 800         assert(sha)
 801         if self.idx:
 802             self.idx.add(sha, crc, self.file.tell() - size)
 803
 804     def _write(self, sha, type, content):
 805         if verbose:
 806             log('>')
 807         if not sha:
 808             sha = calc_hash(type, content)
 809         size, crc = self._raw_write(_encode_packobj(type, content,
 810                                                     self.compression_level),
 811                                     sha=sha)
 812         if self.outbytes >= self.max_pack_size \
 813            or self.count >= self.max_pack_objects:
 814             self.breakpoint()
 815         return sha
 816
 817     def breakpoint(self):
 818         """Clear byte and object counts and return the last processed id."""
 819         id = self._end(self.run_midx)
 820         self.outbytes = self.count = 0
 821         return id
 822
 823     def _require_objcache(self):
 824         if self.objcache is None and self.objcache_maker:
 825             self.objcache = self.objcache_maker()
 826         if self.objcache is None:
 827             raise GitError(
 828                     "PackWriter not opened or can't check exists w/o objcache")
 829
 830     def exists(self, id, want_source=False):
 831         """Return non-empty if an object is found in the object cache."""
 832         self._require_objcache()
 833         return self.objcache.exists(id, want_source=want_source)
 834
 835     def just_write(self, sha, type, content):
 836         """Write an object to the pack file without checking for duplication."""
 837         self._write(sha, type, content)
 838         # If nothing else, gc doesn't have/want an objcache
 839         if self.objcache is not None:
 840             self.objcache.add(sha)
 841
 842     def maybe_write(self, type, content):
 843         """Write an object to the pack file if not present and return its id."""
 844         sha = calc_hash(type, content)
 845         if not self.exists(sha):
 846             self._require_objcache()
 847             self.just_write(sha, type, content)
 848         return sha
 849
 850     def new_blob(self, blob):
 851         """Create a blob object in the pack with the supplied content."""
 852         return self.maybe_write(b'blob', blob)
 853
 854     def new_tree(self, shalist):
 855         """Create a tree object in the pack."""
 856         content = tree_encode(shalist)
 857         return self.maybe_write(b'tree', content)
 858
 859     def new_commit(self, tree, parent,
 860                    author, adate_sec, adate_tz,
 861                    committer, cdate_sec, cdate_tz,
 862                    msg):
 863         """Create a commit object in the pack.  The date_sec values must be
 864         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 865         content = create_commit_blob(tree, parent,
 866                                      author, adate_sec, adate_tz,
 867                                      committer, cdate_sec, cdate_tz,
 868                                      msg)
 869         return self.maybe_write(b'commit', content)
 870
 871     def abort(self):
 872         """Remove the pack file from disk."""
 873         f = self.file
 874         if f:
 875             pfd = self.parentfd
 876             self.file = None
 877             self.parentfd = None
 878             self.idx = None
 879             try:
 880                 try:
 881                     os.unlink(self.filename + b'.pack')
 882                 finally:
 883                     f.close()
 884             finally:
 885                 if pfd is not None:
 886                     os.close(pfd)
 887
    def _end(self, run_midx=True):
        """Finish the open pack and move it, with its index, into place.

        Returns None when no pack is open.  Otherwise the object count
        and trailing SHA-1 are written to the pack, the index is written,
        both files are renamed to objects/pack/pack-<sha1>.{pack,idx},
        the parent directory descriptor is fsynced and closed, auto_midx
        is run if run_midx is true, on_pack_finish (if set) is called
        with the new path prefix, and that prefix is returned.
        """
        f = self.file
        if not f: return None
        # Clear self.file first so that a repeated call is a no-op.
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            # (offset 8 is the count placeholder written by _open())
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            # The read loop left the position at EOF, so this appends.
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()
        # NOTE(review): if anything above raises, self.parentfd is never
        # closed here and the temporary .pack stays on disk -- confirm
        # whether callers are expected to clean up.

        idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' +  hexlify(packbin))
        # NOTE(review): nothing in this method creates the .map file;
        # presumably it is a leftover written elsewhere -- confirm.
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix
 933
 934     def close(self, run_midx=True):
 935         """Close the pack file and move it to its definitive path."""
 936         return self._end(run_midx=run_midx)
 937
 938
 939 class PackIdxV2Writer:
 940     def __init__(self):
 941         self.idx = list(list() for i in range(256))
 942         self.count = 0
 943
 944     def add(self, sha, crc, offs):
 945         assert(sha)
 946         self.count += 1
 947         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 948
 949     def write(self, filename, packbin):
 950         ofs64_count = 0
 951         for section in self.idx:
 952             for entry in section:
 953                 if entry[2] >= 2**31:
 954                     ofs64_count += 1
 955
 956         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 957         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 958         idx_map = None
 959         idx_f = open(filename, 'w+b')
 960         try:
 961             idx_f.truncate(index_len)
 962             fdatasync(idx_f.fileno())
 963             idx_map = mmap_readwrite(idx_f, close=False)
 964             try:
 965                 count = _helpers.write_idx(filename, idx_map, self.idx,
 966                                            self.count)
 967                 assert(count == self.count)
 968                 idx_map.flush()
 969             finally:
 970                 idx_map.close()
 971         finally:
 972             idx_f.close()
 973
 974         idx_f = open(filename, 'a+b')
 975         try:
 976             idx_f.write(packbin)
 977             idx_f.seek(0)
 978             idx_sum = Sha1()
 979             b = idx_f.read(8 + 4*256)
 980             idx_sum.update(b)
 981
 982             for b in chunkyreader(idx_f, 20 * self.count):
 983                 idx_sum.update(b)
 984
 985             for b in chunkyreader(idx_f):
 986                 idx_sum.update(b)
 987             idx_f.write(idx_sum.digest())
 988             fdatasync(idx_f.fileno())
 989         finally:
 990             idx_f.close()
 991
 992
 993 def list_refs(patterns=None, repo_dir=None,
 994               limit_to_heads=False, limit_to_tags=False):
 995     """Yield (refname, hash) tuples for all repository refs unless
 996     patterns are specified.  In that case, only include tuples for
 997     refs matching those patterns (cf. git-show-ref(1)).  The limits
 998     restrict the result items to refs/heads or refs/tags.  If both
 999     limits are specified, items from both sources will be included.
1000
1001     """
1002     argv = [b'git', b'show-ref']
1003     if limit_to_heads:
1004         argv.append(b'--heads')
1005     if limit_to_tags:
1006         argv.append(b'--tags')
1007     argv.append(b'--')
1008     if patterns:
1009         argv.extend(patterns)
1010     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
1011                          close_fds=True)
1012     out = p.stdout.read().strip()
1013     rv = p.wait()  # not fatal
1014     if rv:
1015         assert(not out)
1016     if out:
1017         for d in out.split(b'\n'):
1018             sha, name = d.split(b' ', 1)
1019             yield name, unhexlify(sha)
1020
1021
def read_ref(refname, repo_dir = None):
    """Return the binary id that the head matching refname points at.

    Returns None when nothing matches; more than one match trips an
    assertion.
    """
    found = tuple(islice(list_refs(patterns=[refname], repo_dir=repo_dir,
                                   limit_to_heads=True),
                         2))
    if not found:
        return None
    assert(len(found) == 1)
    return found[0][1]
1031
1032
def rev_list_invocation(ref_or_refs, format=None):
    """Return the argv for a git rev-list call over the given ref(s).

    ref_or_refs is a single bytes ref or an iterable of them; none may
    start with '-'.  When format is given it is passed along as
    --pretty=format:FORMAT.  The argv always ends with b'--'.
    """
    refs = (ref_or_refs,) if isinstance(ref_or_refs, bytes) else ref_or_refs
    argv = [b'git', b'rev-list']

    if format:
        argv += [b'--pretty=format:' + format]
    for ref in refs:
        # A leading dash would be parsed by git as an option.
        assert not ref.startswith(b'-')
        argv.append(ref)
    return argv + [b'--']
1047
1048
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".

    Without a format, yield one hex hash at a time.  With a format
    (parse and format must be given together), pass it to rev-list and
    yield (oidx, parse(git_stdout)) for each commit, calling parse with
    the stream positioned just after the rev-list "commit HASH" header
    line.  Raises GitError once the output is exhausted if git exited
    with a non-zero status.

    """
    assert bool(parse) == bool(format)
    argv = rev_list_invocation(ref_or_refs, format=format)
    proc = subprocess.Popen(argv,
                            env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE,
                            close_fds=True)
    out = proc.stdout
    if format:
        # readline() rather than iteration: parse() reads from the same
        # stream in between the header lines.
        for line in iter(out.readline, b''):
            header = line.strip()
            if not header.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(header))
            oidx = header[7:]
            assert len(oidx) == 40
            yield oidx, parse(out)
    else:
        for line in out:
            yield line.strip()

    status = proc.wait()
    if status:
        raise GitError('git rev-list returned error %d' % status)
1081
1082
def rev_parse(committish, repo_dir=None):
    """Resolve 'committish' to a binary object id, if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    'committish' is tried first as a ref name (see read_ref) and then,
    if it is 40 characters long, as a hex object id that must be
    present in the repository's pack indexes.

    Returns the binary (not hex) id if it is found, None if
    'committish' does not correspond to anything, including when it is
    40 characters long but not valid hex.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            oid = unhexlify(committish)
        except (TypeError, ValueError):
            # Non-hex input: Python 2 raises TypeError, Python 3 raises
            # binascii.Error, which is a ValueError.
            return None

        if pL.exists(oid):
            return oid

    return None
1108
1109
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname at newval using git update-ref.

    refname must be under refs/heads/ or refs/tags/.  newval and oldval
    are binary ids; a false oldval is passed to git as an empty string.
    Raises GitError if git exits with a non-zero status.
    """
    old = oldval or b''
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    argv = [b'git', b'update-ref', refname, hexlify(newval), hexlify(old)]
    proc = subprocess.Popen(argv, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', proc)
1121
1122
def delete_ref(refname, oldvalue=None):
    """Delete refname in the default repository (see git update-ref(1)).

    refname must start with refs/.  When oldvalue is given it is passed
    to git as the extra argument after the ref name.  Raises GitError
    if git exits with a non-zero status.
    """
    assert refname.startswith(b'refs/')
    argv = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', proc)
1131
1132
def guess_repo(path=None):
    """Settle the value of the global "repodir".

    An explicit path wins; otherwise an already-set repodir is kept;
    otherwise BUP_DIR from the environment is used, falling back to
    ~/.bup.  Nothing is checked on disk, so this does not fail when no
    repository exists; callers that need one normally use
    check_repo_or_die() instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1147
1148
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The repository location is settled via guess_repo(path).  Raises
    GitError if the location's parent directory does not exist, if the
    location exists but is not a directory, or if any of the git
    commands fails.  Output from git is sent to stderr.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    # (All argv items are bytes, like every other git invocation here.)
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    # Enable the reflog
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
1172
1173
def check_repo_or_die(path=None):
    """Return if a bup repository probably exists; otherwise exit.

    A repository is taken to exist when its objects/pack directory
    does.  Exits with status 15 when the repository path itself is
    missing and with status 14 when it is there but is not a
    repository.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st and stat.S_ISDIR(pack_st.st_mode):
        return
    if not pack_st and not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1189
1190
def is_suitable_git(ver_str):
    """Classify the output of "git --version".

    Returns 'unrecognized' when ver_str lacks the "git version " prefix,
    'insufficient' for versions older than 1.5.6, and 'suitable'
    otherwise.  Exits with status 13 when what follows the prefix does
    not start with a digit.
    """
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    version = ver_str[len(prefix):]
    if version.startswith(b'0.'):
        return 'insufficient'
    if version.startswith(b'1.'):
        # Any 1.x release or release candidate before 1.5.6.
        too_old = (br'1\.[012345]rc',
                   br'1\.[01234]\.',
                   br'1\.5\.[012345]($|\.)',
                   br'1\.5\.6-rc')
        if any(re.match(pattern, version) for pattern in too_old):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', version):
        return 'suitable'
    sys.exit(13)
1210
# Becomes True once the git version has been accepted, so that the
# check runs at most once per process.
_git_great = None

def require_suitable_git(ver_str=None):
    """Insist on a suitable version of git, checking at most once.

    Rely on ver_str when provided, rather than invoking the git in the
    path.  The check is skipped entirely when BUP_GIT_VERSION_IS_FINE
    is set to yes, true or 1 in the environment.

    Raises GitError if the version output is not recognized, and exits
    with status 1 if the version is older than 1.5.6.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1239
1240
class _AbortableIter:
    """Wrap an iterator so that onabort runs if it is not exhausted.

    onabort is called at most once: when the wrapped iterator raises
    anything other than StopIteration, when abort() is called before
    the end was reached, or when the wrapper is collected unfinished.
    """
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            item = next(self.it)
        except StopIteration:
            # Normal exhaustion: nothing left to abort.
            self.done = True
            raise
        except:
            self.abort()
            raise
        return item

    next = __next__

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if self.done:
            return
        self.done = True
        if self.onabort:
            self.onabort()

    def __del__(self):
        self.abort()
1271
1272
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self, repo_dir = None):
        """Remember repo_dir; no cat-file process is started yet."""
        require_suitable_git()
        self.repo_dir = repo_dir
        self.p = self.inprogress = None

    def close(self, wait=False):
        """Close the pipes to the cat-file process, if any, and forget it.

        When wait is true and a process was running, wait for it and
        return its exit status.  Otherwise, including when no process
        was ever started, return None.
        """
        p = self.p
        if p:
            p.stdout.close()
            p.stdin.close()
        self.p = None
        self.inprogress = None
        # A pipe that was never used has no process to wait for.
        if wait and p is not None:
            p.wait()
            return p.returncode
        return None

    def restart(self):
        """Drop any current cat-file process and start a new one."""
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        Only one request may be in progress at a time.  Raises GitError
        if cat-file ends unexpectedly or answers with a header that
        can't be parsed.

        """
        # (Re)start the process if there is none or it has exited.
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # The request is a single line on stdin, so ref must not contain
        # line breaks or look like an option.
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if not hdr:
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        # If the data isn't read to the end, the stream is left
        # mid-object, so the process is closed rather than reused.
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self.close)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object's data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _join(self, it):
        """Yield the blob data reachable from the object behind 'it'.

        'it' is an iterator as returned by get().  A blob yields its own
        data, a tree the data of each of its entries in turn, and a
        commit the data of its tree.  Raises GitError for any other
        object type.
        """
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of a commit is "tree <hex id>".
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1373
1374
# One CatPipe per repository, keyed by the repository's absolute path.
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating it on first use.

    Without repo_dir, the default repository is used.
    """
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    pipe = _cp.get(repo_dir)
    if not pipe:
        pipe = _cp[repo_dir] = CatPipe(repo_dir)
    return pipe
1388
1389
def close_catpipes():
    """Close every cached CatPipe, waiting for each, and empty the cache."""
    # FIXME: chain exceptions
    while _cp:
        _cp.popitem()[1].close(wait=True)
1395
1396
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    The names have their refs/tags/ prefix removed.
    """
    by_oid = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        # More than one tag can point at the same object.
        by_oid.setdefault(oid, []).append(refname[10:])
    return by_oid
1407
1408
class MissingObject(KeyError):
    """KeyError for an object id that could not be found.

    The binary id is kept in the oid attribute; the message shows it
    in hex.
    """
    def __init__(self, oid):
        self.oid = oid
        message = 'object %r is missing' % hexlify(oid)
        super(MissingObject, self).__init__(message)
1413
1414
WalkItem = namedtuple('WalkItem', ('oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'))
# The path is the mangled path.  When an item represents a fragment of
# a chunked file, chunk_path is the chunked subtree path for that
# chunk, i.e. ['', '2d3115e', ...]; the top-level item of a chunked
# file has a chunk_path of [''].  So some chunk subtree of the file
# '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1427
1428
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield a WalkItem for everything reachable from oidx.

    Objects are fetched with get_ref, which must behave like CatPipe
    get.  Anything for which stop_at(oidx) returns true is skipped,
    along with whatever is only reachable through it.  Raises
    MissingObject when a hash that is encountered is not in the
    repository.  Blob content is neither read nor returned in the data
    field unless include_data is set.

    """
    # Keep the work list on the heap rather than recursing, to avoid
    # stack overflow.
    pending = [(oidx, [], [], None)]
    while pending:
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if mode and stat.S_ISREG(mode) and not include_data:
            # A "regular file" is a leaf in the graph, so there is no
            # need to read it when the caller doesn't want the data.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(oid)
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            pending.extend((pid, parent_path, chunk_path, mode)
                           for pid in commit_items.parents)
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for ent_mode, name, ent_id in tree_decode(data):
                _, bup_type = demangle_name(name, ent_mode)
                if chunk_path:
                    # Already inside a chunked file: only the chunk
                    # path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                elif bup_type == BUP_CHUNKED:
                    sub_path = parent_path + [name]
                    sub_chunk_path = [b'']
                else:
                    sub_path = parent_path + [name]
                    sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                ent_mode))