lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12 from numbers import Integral
  13
  14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  15 from bup.compat import (buffer,
  16                         byte_int, bytes_from_byte, bytes_from_uint,
  17                         environ,
  18                         items,
  19                         range,
  20                         reraise)
  21 from bup.io import path_msg
  22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  23                          exo,
  24                          fdatasync,
  25                          hostname, localtime, log,
  26                          merge_dict,
  27                          merge_iter,
  28                          mmap_read, mmap_readwrite,
  29                          parse_num,
  30                          progress, qprogress, stat_if_exists,
  31                          unlink,
  32                          utc_offset_str)
  33 from bup.pwdgrp import username, userfullname
  34
  35
# When truthy, PackWriter._write() logs a '>' marker for each object written.
verbose = 0
repodir = None  # The default repository, once initialized

# Git object type name -> numeric type code used in pack object headers
# (see _encode_packobj()).
_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
# Reverse mapping: numeric type code -> type name (see _decode_packobj()).
_typermap = {v: k for k, v in items(_typemap)}


# Counters updated by the index lookup code (PackIdx._idx_from_hash() and
# PackIdxList.exists()).
_total_searches = 0
_total_steps = 0
  45
  46
  47 class GitError(Exception):
  48     pass
  49
  50
  51 def _gitenv(repo_dir=None):
  52     if not repo_dir:
  53         repo_dir = repo()
  54     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  55
  56 def _git_wait(cmd, p):
  57     rv = p.wait()
  58     if rv != 0:
  59         raise GitError('%r returned %d' % (cmd, rv))
  60
  61 def _git_exo(cmd, **kwargs):
  62     kwargs['check'] = False
  63     result = exo(cmd, **kwargs)
  64     _, _, proc = result
  65     if proc.returncode != 0:
  66         raise GitError('%r returned %d' % (cmd, proc.returncode))
  67     return result
  68
  69 def git_config_get(option, repo_dir=None):
  70     cmd = (b'git', b'config', b'--get', option)
  71     p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
  72                          env=_gitenv(repo_dir=repo_dir),
  73                          close_fds=True)
  74     r = p.stdout.read()
  75     rc = p.wait()
  76     if rc == 0:
  77         return r
  78     if rc != 1:
  79         raise GitError('%r returned %d' % (cmd, rc))
  80     return None
  81
  82
  83 def parse_tz_offset(s):
  84     """UTC offset in seconds."""
  85     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  86     if bytes_from_byte(s[0]) == b'-':
  87         return - tz_off
  88     return tz_off
  89
  90
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
# Byte allowed at the start or end of a name/mail field.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
# Byte allowed in the interior of a name/mail field.
_content_char = br'[^\0\n<>]'
# A name/mail field: either one or two start/end bytes, or a start/end
# byte, any number of interior bytes, and a closing start/end byte.
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone field: sign, two hour digits, two minute digits (00-59).
_tz_rx = br'[-+]\d\d[0-5]\d'
# One "parent <40 hex digits>" header line, including its newline.
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Whole commit object: tree line, zero or more parent lines, author and
# committer lines, an optional mergetag, a blank line, then the message.
# The newlines inside the literal are part of the pattern.
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
# Extracts the hash from each parent line of the 'parents' group above.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
 121
 122 def parse_commit(content):
 123     commit_match = re.match(_commit_rx, content)
 124     if not commit_match:
 125         raise Exception('cannot parse commit %r' % content)
 126     matches = commit_match.groupdict()
 127     return CommitInfo(tree=matches['tree'],
 128                       parents=re.findall(_parent_hash_rx, matches['parents']),
 129                       author_name=matches['author_name'],
 130                       author_mail=matches['author_mail'],
 131                       author_sec=int(matches['asec']),
 132                       author_offset=parse_tz_offset(matches['atz']),
 133                       committer_name=matches['committer_name'],
 134                       committer_mail=matches['committer_mail'],
 135                       committer_sec=int(matches['csec']),
 136                       committer_offset=parse_tz_offset(matches['ctz']),
 137                       message=matches['message'])
 138
 139
 140 def get_cat_data(cat_iterator, expected_type):
 141     _, kind, _ = next(cat_iterator)
 142     if kind != expected_type:
 143         raise Exception('expected %r, saw %r' % (expected_type, kind))
 144     return b''.join(cat_iterator)
 145
 146 def get_commit_items(id, cp):
 147     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 148
 149 def _local_git_date_str(epoch_sec):
 150     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 151
 152
 153 def _git_date_str(epoch_sec, tz_offset_sec):
 154     offs =  tz_offset_sec // 60
 155     return b'%d %s%02d%02d' \
 156         % (epoch_sec,
 157            b'+' if offs >= 0 else b'-',
 158            abs(offs) // 60,
 159            abs(offs) % 60)
 160
 161
 162 def repo(sub = b'', repo_dir=None):
 163     """Get the path to the git repository or one of its subdirectories."""
 164     repo_dir = repo_dir or repodir
 165     if not repo_dir:
 166         raise GitError('You should call check_repo_or_die()')
 167
 168     # If there's a .git subdirectory, then the actual repo is in there.
 169     gd = os.path.join(repo_dir, b'.git')
 170     if os.path.exists(gd):
 171         repo_dir = gd
 172
 173     return os.path.join(repo_dir, sub)
 174
 175
 176 _shorten_hash_rx = \
 177     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 178
 179 def shorten_hash(s):
 180     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 181
 182
 183 def repo_rel(path):
 184     full = os.path.abspath(path)
 185     fullrepo = os.path.abspath(repo(b''))
 186     if not fullrepo.endswith(b'/'):
 187         fullrepo += b'/'
 188     if full.startswith(fullrepo):
 189         path = full[len(fullrepo):]
 190     if path.startswith(b'index-cache/'):
 191         path = path[len(b'index-cache/'):]
 192     return shorten_hash(path)
 193
 194
 195 def all_packdirs():
 196     paths = [repo(b'objects/pack')]
 197     paths += glob.glob(repo(b'index-cache/*/.'))
 198     return paths
 199
 200
 201 def auto_midx(objdir):
 202     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 203     try:
 204         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 205     except OSError as e:
 206         # make sure 'args' gets printed to help with debugging
 207         add_error('%r: exception: %s' % (args, e))
 208         raise
 209     if rv:
 210         add_error('%r: returned %d' % (args, rv))
 211
 212     args = [path.exe(), b'bloom', b'--dir', objdir]
 213     try:
 214         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 215     except OSError as e:
 216         # make sure 'args' gets printed to help with debugging
 217         add_error('%r: exception: %s' % (args, e))
 218         raise
 219     if rv:
 220         add_error('%r: returned %d' % (args, rv))
 221
 222
 223 def mangle_name(name, mode, gitmode):
 224     """Mangle a file name to present an abstract name for segmented files.
 225     Mangled file names will have the ".bup" extension added to them. If a
 226     file's name already ends with ".bup", a ".bupl" extension is added to
 227     disambiguate normal files from segmented ones.
 228     """
 229     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 230         assert(stat.S_ISDIR(gitmode))
 231         return name + b'.bup'
 232     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 233         return name + b'.bupl'
 234     else:
 235         return name
 236
 237
 238 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 239 def demangle_name(name, mode):
 240     """Remove name mangling from a file name, if necessary.
 241
 242     The return value is a tuple (demangled_filename,mode), where mode is one of
 243     the following:
 244
 245     * BUP_NORMAL  : files that should be read as-is from the repository
 246     * BUP_CHUNKED : files that were chunked and need to be reassembled
 247
 248     For more information on the name mangling algorithm, see mangle_name()
 249     """
 250     if name.endswith(b'.bupl'):
 251         return (name[:-5], BUP_NORMAL)
 252     elif name.endswith(b'.bup'):
 253         return (name[:-4], BUP_CHUNKED)
 254     elif name.endswith(b'.bupm'):
 255         return (name[:-5],
 256                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 257     else:
 258         return (name, BUP_NORMAL)
 259
 260
 261 def calc_hash(type, content):
 262     """Calculate some content's hash in the Git fashion."""
 263     header = b'%s %d\0' % (type, len(content))
 264     sum = Sha1(header)
 265     sum.update(content)
 266     return sum.digest()
 267
 268
 269 def shalist_item_sort_key(ent):
 270     (mode, name, id) = ent
 271     assert(mode+0 == mode)
 272     if stat.S_ISDIR(mode):
 273         return name + b'/'
 274     else:
 275         return name
 276
 277
 278 def tree_encode(shalist):
 279     """Generate a git tree object from (mode,name,hash) tuples."""
 280     shalist = sorted(shalist, key = shalist_item_sort_key)
 281     l = []
 282     for (mode,name,bin) in shalist:
 283         assert(mode)
 284         assert(mode+0 == mode)
 285         assert(name)
 286         assert(len(bin) == 20)
 287         s = b'%o %s\0%s' % (mode,name,bin)
 288         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 289         l.append(s)
 290     return b''.join(l)
 291
 292
 293 def tree_decode(buf):
 294     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 295     ofs = 0
 296     while ofs < len(buf):
 297         z = buf.find(b'\0', ofs)
 298         assert(z > ofs)
 299         spl = buf[ofs:z].split(b' ', 1)
 300         assert(len(spl) == 2)
 301         mode,name = spl
 302         sha = buf[z+1:z+1+20]
 303         ofs = z+1+20
 304         yield (int(mode, 8), name, sha)
 305
 306
 307 def _encode_packobj(type, content, compression_level=1):
 308     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 309         raise ValueError('invalid compression level %s' % compression_level)
 310     szout = b''
 311     sz = len(content)
 312     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 313     sz >>= 4
 314     while 1:
 315         if sz: szbits |= 0x80
 316         szout += bytes_from_uint(szbits)
 317         if not sz:
 318             break
 319         szbits = sz & 0x7f
 320         sz >>= 7
 321     z = zlib.compressobj(compression_level)
 322     yield szout
 323     yield z.compress(content)
 324     yield z.flush()
 325
 326
 327 def _encode_looseobj(type, content, compression_level=1):
 328     z = zlib.compressobj(compression_level)
 329     yield z.compress(b'%s %d\0' % (type, len(content)))
 330     yield z.compress(content)
 331     yield z.flush()
 332
 333
 334 def _decode_looseobj(buf):
 335     assert(buf);
 336     s = zlib.decompress(buf)
 337     i = s.find(b'\0')
 338     assert(i > 0)
 339     l = s[:i].split(b' ')
 340     type = l[0]
 341     sz = int(l[1])
 342     content = s[i+1:]
 343     assert(type in _typemap)
 344     assert(sz == len(content))
 345     return (type, content)
 346
 347
 348 def _decode_packobj(buf):
 349     assert(buf)
 350     c = byte_int(buf[0])
 351     type = _typermap[(c & 0x70) >> 4]
 352     sz = c & 0x0f
 353     shift = 4
 354     i = 0
 355     while c & 0x80:
 356         i += 1
 357         c = byte_int(buf[i])
 358         sz |= (c & 0x7f) << shift
 359         shift += 7
 360         if not (c & 0x80):
 361             break
 362     return (type, zlib.decompress(buf[i+1:]))
 363
 364
 365 class PackIdx:
 366     def __init__(self):
 367         assert(0)
 368
 369     def find_offset(self, hash):
 370         """Get the offset of an object inside the index file."""
 371         idx = self._idx_from_hash(hash)
 372         if idx != None:
 373             return self._ofs_from_idx(idx)
 374         return None
 375
 376     def exists(self, hash, want_source=False):
 377         """Return nonempty if the object exists in this index."""
 378         if hash and (self._idx_from_hash(hash) != None):
 379             return want_source and os.path.basename(self.name) or True
 380         return None
 381
 382     def _idx_from_hash(self, hash):
 383         global _total_searches, _total_steps
 384         _total_searches += 1
 385         assert(len(hash) == 20)
 386         b1 = byte_int(hash[0])
 387         start = self.fanout[b1-1] # range -1..254
 388         end = self.fanout[b1] # range 0..255
 389         want = hash
 390         _total_steps += 1  # lookup table is a step
 391         while start < end:
 392             _total_steps += 1
 393             mid = start + (end - start) // 2
 394             v = self._idx_to_hash(mid)
 395             if v < want:
 396                 start = mid+1
 397             elif v > want:
 398                 end = mid
 399             else: # got it!
 400                 return mid
 401         return None
 402
 403
 404 class PackIdxV1(PackIdx):
 405     """Object representation of a Git pack index (version 1) file."""
 406     def __init__(self, filename, f):
 407         self.name = filename
 408         self.idxnames = [self.name]
 409         self.map = mmap_read(f)
 410         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 411         self.fanout = array('L', struct.unpack('!256I', self.map))
 412         self.fanout.append(0)  # entry "-1"
 413         self.nsha = self.fanout[255]
 414         self.sha_ofs = 256 * 4
 415         # Avoid slicing shatable for individual hashes (very high overhead)
 416         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 417
 418     def __enter__(self):
 419         return self
 420
 421     def __exit__(self, type, value, traceback):
 422         self.close()
 423
 424     def __len__(self):
 425         return int(self.nsha)  # int() from long for python 2
 426
 427     def _ofs_from_idx(self, idx):
 428         if idx >= self.nsha or idx < 0:
 429             raise IndexError('invalid pack index index %d' % idx)
 430         ofs = self.sha_ofs + idx * 24
 431         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 432
 433     def _idx_to_hash(self, idx):
 434         if idx >= self.nsha or idx < 0:
 435             raise IndexError('invalid pack index index %d' % idx)
 436         ofs = self.sha_ofs + idx * 24 + 4
 437         return self.map[ofs : ofs + 20]
 438
 439     def __iter__(self):
 440         start = self.sha_ofs + 4
 441         for ofs in range(start, start + 24 * self.nsha, 24):
 442             yield self.map[ofs : ofs + 20]
 443
 444     def close(self):
 445         if self.map is not None:
 446             self.shatable = None
 447             self.map.close()
 448             self.map = None
 449
 450
 451 class PackIdxV2(PackIdx):
 452     """Object representation of a Git pack index (version 2) file."""
 453     def __init__(self, filename, f):
 454         self.name = filename
 455         self.idxnames = [self.name]
 456         self.map = mmap_read(f)
 457         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 458         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 459         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 460         self.fanout.append(0)
 461         self.nsha = self.fanout[255]
 462         self.sha_ofs = 8 + 256*4
 463         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 464         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 465         # Avoid slicing this for individual hashes (very high overhead)
 466         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 467
 468     def __enter__(self):
 469         return self
 470
 471     def __exit__(self, type, value, traceback):
 472         self.close()
 473
 474     def __len__(self):
 475         return int(self.nsha)  # int() from long for python 2
 476
 477     def _ofs_from_idx(self, idx):
 478         if idx >= self.nsha or idx < 0:
 479             raise IndexError('invalid pack index index %d' % idx)
 480         ofs_ofs = self.ofstable_ofs + idx * 4
 481         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 482         if ofs & 0x80000000:
 483             idx64 = ofs & 0x7fffffff
 484             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 485             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 486         return ofs
 487
 488     def _idx_to_hash(self, idx):
 489         if idx >= self.nsha or idx < 0:
 490             raise IndexError('invalid pack index index %d' % idx)
 491         ofs = self.sha_ofs + idx * 20
 492         return self.map[ofs : ofs + 20]
 493
 494     def __iter__(self):
 495         start = self.sha_ofs
 496         for ofs in range(start, start + 20 * self.nsha, 20):
 497             yield self.map[ofs : ofs + 20]
 498
 499     def close(self):
 500         if self.map is not None:
 501             self.shatable = None
 502             self.map.close()
 503             self.map = None
 504
 505
# Number of live PackIdxList instances; the constructor asserts that no
# other instance exists.
_mpi_count = 0
class PackIdxList:
    """Searchable collection of the .idx/.midx indexes in one directory.

    When a valid bup.bloom file is present, it is consulted in exists()
    to reject hashes that are in none of the indexes.
    """
    def __init__(self, dir, ignore_midx=False):
        """Load the indexes found in dir (see refresh()).

        If ignore_midx is true, refresh() always behaves as if it had
        been called with skip_midx=True.
        """
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()  # extra hashes registered via add()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the merged contents of all packs (see idxmerge())."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all loaded indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Bloom says "maybe": fall through to the real search and
                # skip the bloom check until a lookup misses again.
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # Not found in any pack: re-enable the bloom check for next time.
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        # Map of path -> index object for everything currently loaded
        # (dropping midxes when they are being skipped).
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                # Register every idx covered by a still-loaded midx under
                # that midx, so it is not opened separately below.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open midx files that are not loaded yet; a midx that
                # refers to a missing idx is deleted.
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the largest midx first, then the newest.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            # Open every .idx that is neither loaded nor covered by a midx.
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # d maps several paths to the same midx; set() removes duplicates.
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            # Use the bloom only if it is valid and holds at least as many
            # entries as the indexes.
            # NOTE(review): the bloom dropped in the else branch is not
            # closed here - confirm whether that is intended.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 650
 651
 652 def open_idx(filename):
 653     if filename.endswith(b'.idx'):
 654         f = open(filename, 'rb')
 655         header = f.read(8)
 656         if header[0:4] == b'\377tOc':
 657             version = struct.unpack('!I', header[4:8])[0]
 658             if version == 2:
 659                 return PackIdxV2(filename, f)
 660             else:
 661                 raise GitError('%s: expected idx file version 2, got %d'
 662                                % (path_msg(filename), version))
 663         elif len(header) == 8 and header[0:4] < b'\377tOc':
 664             return PackIdxV1(filename, f)
 665         else:
 666             raise GitError('%s: unrecognized idx file header'
 667                            % path_msg(filename))
 668     elif filename.endswith(b'.midx'):
 669         return midx.PackMidx(filename)
 670     else:
 671         raise GitError('idx filenames must end with .idx or .midx')
 672
 673
 674 def idxmerge(idxlist, final_progress=True):
 675     """Generate a list of all the objects reachable in a PackIdxList."""
 676     def pfunc(count, total):
 677         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 678                   % (count*100.0/total, count, total))
 679     def pfinal(count, total):
 680         if final_progress:
 681             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 682                      % (100, total, total))
 683     return merge_iter(idxlist, 10024, pfunc, pfinal)
 684
 685
 686 def _make_objcache():
 687     return PackIdxList(repo(b'objects/pack'))
 688
 689 # bup-gc assumes that it can disable all PackWriter activities
 690 # (bloom/midx/cache) via the constructor and close() arguments.
 691
 692 class PackWriter:
 693     """Writes Git objects inside a pack file."""
 694     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 695                  run_midx=True, on_pack_finish=None,
 696                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 697         self.repo_dir = repo_dir or repo()
 698         self.file = None
 699         self.parentfd = None
 700         self.count = 0
 701         self.outbytes = 0
 702         self.filename = None
 703         self.idx = None
 704         self.objcache_maker = objcache_maker
 705         self.objcache = None
 706         self.compression_level = compression_level
 707         self.run_midx=run_midx
 708         self.on_pack_finish = on_pack_finish
 709         if not max_pack_size:
 710             max_pack_size = git_config_get(b'pack.packSizeLimit',
 711                                            repo_dir=self.repo_dir)
 712             if max_pack_size is not None:
 713                 max_pack_size = parse_num(max_pack_size)
 714             if not max_pack_size:
 715                 # larger packs slow down pruning
 716                 max_pack_size = 1000 * 1000 * 1000
 717         self.max_pack_size = max_pack_size
 718         # cache memory usage is about 83 bytes per object
 719         self.max_pack_objects = max_pack_objects if max_pack_objects \
 720                                 else max(1, self.max_pack_size // 5000)
 721
 722     def __del__(self):
 723         self.close()
 724
 725     def __enter__(self):
 726         return self
 727
 728     def __exit__(self, type, value, traceback):
 729         self.close()
 730
 731     def _open(self):
 732         if not self.file:
 733             objdir = dir = os.path.join(self.repo_dir, b'objects')
 734             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 735             try:
 736                 self.file = os.fdopen(fd, 'w+b')
 737             except:
 738                 os.close(fd)
 739                 raise
 740             try:
 741                 self.parentfd = os.open(objdir, os.O_RDONLY)
 742             except:
 743                 f = self.file
 744                 self.file = None
 745                 f.close()
 746                 raise
 747             assert name.endswith(b'.pack')
 748             self.filename = name[:-5]
 749             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 750             self.idx = PackIdxV2Writer()
 751
 752     def _raw_write(self, datalist, sha):
 753         self._open()
 754         f = self.file
 755         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 756         # the file never has a *partial* blob.  So let's make sure it's
 757         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 758         # to our hashsplit algorithm.)  f.write() does its own buffering,
 759         # but that's okay because we'll flush it in _end().
 760         oneblob = b''.join(datalist)
 761         try:
 762             f.write(oneblob)
 763         except IOError as e:
 764             reraise(GitError(e))
 765         nw = len(oneblob)
 766         crc = zlib.crc32(oneblob) & 0xffffffff
 767         self._update_idx(sha, crc, nw)
 768         self.outbytes += nw
 769         self.count += 1
 770         return nw, crc
 771
 772     def _update_idx(self, sha, crc, size):
 773         assert(sha)
 774         if self.idx:
 775             self.idx.add(sha, crc, self.file.tell() - size)
 776
 777     def _write(self, sha, type, content):
 778         if verbose:
 779             log('>')
 780         if not sha:
 781             sha = calc_hash(type, content)
 782         size, crc = self._raw_write(_encode_packobj(type, content,
 783                                                     self.compression_level),
 784                                     sha=sha)
 785         if self.outbytes >= self.max_pack_size \
 786            or self.count >= self.max_pack_objects:
 787             self.breakpoint()
 788         return sha
 789
 790     def breakpoint(self):
 791         """Clear byte and object counts and return the last processed id."""
 792         id = self._end(self.run_midx)
 793         self.outbytes = self.count = 0
 794         return id
 795
 796     def _require_objcache(self):
 797         if self.objcache is None and self.objcache_maker:
 798             self.objcache = self.objcache_maker()
 799         if self.objcache is None:
 800             raise GitError(
 801                     "PackWriter not opened or can't check exists w/o objcache")
 802
 803     def exists(self, id, want_source=False):
 804         """Return non-empty if an object is found in the object cache."""
 805         self._require_objcache()
 806         return self.objcache.exists(id, want_source=want_source)
 807
 808     def just_write(self, sha, type, content):
 809         """Write an object to the pack file without checking for duplication."""
 810         self._write(sha, type, content)
 811         # If nothing else, gc doesn't have/want an objcache
 812         if self.objcache is not None:
 813             self.objcache.add(sha)
 814
 815     def maybe_write(self, type, content):
 816         """Write an object to the pack file if not present and return its id."""
 817         sha = calc_hash(type, content)
 818         if not self.exists(sha):
 819             self._require_objcache()
 820             self.just_write(sha, type, content)
 821         return sha
 822
 823     def new_blob(self, blob):
 824         """Create a blob object in the pack with the supplied content."""
 825         return self.maybe_write(b'blob', blob)
 826
 827     def new_tree(self, shalist):
 828         """Create a tree object in the pack."""
 829         content = tree_encode(shalist)
 830         return self.maybe_write(b'tree', content)
 831
 832     def new_commit(self, tree, parent,
 833                    author, adate_sec, adate_tz,
 834                    committer, cdate_sec, cdate_tz,
 835                    msg):
 836         """Create a commit object in the pack.  The date_sec values must be
 837         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 838         if adate_tz is not None:
 839             adate_str = _git_date_str(adate_sec, adate_tz)
 840         else:
 841             adate_str = _local_git_date_str(adate_sec)
 842         if cdate_tz is not None:
 843             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 844         else:
 845             cdate_str = _local_git_date_str(cdate_sec)
 846         l = []
 847         if tree: l.append(b'tree %s' % hexlify(tree))
 848         if parent: l.append(b'parent %s' % hexlify(parent))
 849         if author: l.append(b'author %s %s' % (author, adate_str))
 850         if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 851         l.append(b'')
 852         l.append(msg)
 853         return self.maybe_write(b'commit', b'\n'.join(l))
 854
 855     def abort(self):
 856         """Remove the pack file from disk."""
 857         f = self.file
 858         if f:
 859             pfd = self.parentfd
 860             self.file = None
 861             self.parentfd = None
 862             self.idx = None
 863             try:
 864                 try:
 865                     os.unlink(self.filename + b'.pack')
 866                 finally:
 867                     f.close()
 868             finally:
 869                 if pfd is not None:
 870                     os.close(pfd)
 871
    def _end(self, run_midx=True):
        """Finish the pack being written and move it to its final name.

        Returns None when no pack file is open.  Otherwise patches the
        object count into the pack, appends the pack checksum, writes
        the index, renames the .pack and .idx files to
        <repo_dir>/objects/pack/pack-<value returned by idx.write()>,
        and returns that path prefix (without extension).

        When run_midx is true, auto_midx() is run on the objects/pack
        directory after the rename.  self.on_pack_finish, if set, is
        called last with the final path prefix.
        """
        f = self.file
        if not f: return None
        # Cleared up front, so a later call returns at the guard above.
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            f.seek(8)
            # '!i' is a 4-byte network-order (big-endian) signed int
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            # presumably chunkyreader read to EOF, so this appends the
            # digest after the pack data -- confirm against chunkyreader
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        # idx.write() returns the name component used for the final files.
        obj_list_sha = idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' +  obj_list_sha)
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        # fsync the held directory fd after the renames; the fd is
        # closed even if the fsync fails.  NOTE(review): self.parentfd
        # is not reset to None here, unlike in abort().
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix
 917
 918     def close(self, run_midx=True):
 919         """Close the pack file and move it to its definitive path."""
 920         return self._end(run_midx=run_midx)
 921
 922
 923 class PackIdxV2Writer:
 924     def __init__(self):
 925         self.idx = list(list() for i in range(256))
 926         self.count = 0
 927
 928     def add(self, sha, crc, offs):
 929         assert(sha)
 930         self.count += 1
 931         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 932
 933     def write(self, filename, packbin):
 934         ofs64_count = 0
 935         for section in self.idx:
 936             for entry in section:
 937                 if entry[2] >= 2**31:
 938                     ofs64_count += 1
 939
 940         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 941         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 942         idx_map = None
 943         idx_f = open(filename, 'w+b')
 944         try:
 945             idx_f.truncate(index_len)
 946             fdatasync(idx_f.fileno())
 947             idx_map = mmap_readwrite(idx_f, close=False)
 948             try:
 949                 count = _helpers.write_idx(filename, idx_map, self.idx,
 950                                            self.count)
 951                 assert(count == self.count)
 952                 idx_map.flush()
 953             finally:
 954                 idx_map.close()
 955         finally:
 956             idx_f.close()
 957
 958         idx_f = open(filename, 'a+b')
 959         try:
 960             idx_f.write(packbin)
 961             idx_f.seek(0)
 962             idx_sum = Sha1()
 963             b = idx_f.read(8 + 4*256)
 964             idx_sum.update(b)
 965
 966             obj_list_sum = Sha1()
 967             for b in chunkyreader(idx_f, 20 * self.count):
 968                 idx_sum.update(b)
 969                 obj_list_sum.update(b)
 970             namebase = hexlify(obj_list_sum.digest())
 971
 972             for b in chunkyreader(idx_f):
 973                 idx_sum.update(b)
 974             idx_f.write(idx_sum.digest())
 975             fdatasync(idx_f.fileno())
 976             return namebase
 977         finally:
 978             idx_f.close()
 979
 980
 981 def list_refs(patterns=None, repo_dir=None,
 982               limit_to_heads=False, limit_to_tags=False):
 983     """Yield (refname, hash) tuples for all repository refs unless
 984     patterns are specified.  In that case, only include tuples for
 985     refs matching those patterns (cf. git-show-ref(1)).  The limits
 986     restrict the result items to refs/heads or refs/tags.  If both
 987     limits are specified, items from both sources will be included.
 988
 989     """
 990     argv = [b'git', b'show-ref']
 991     if limit_to_heads:
 992         argv.append(b'--heads')
 993     if limit_to_tags:
 994         argv.append(b'--tags')
 995     argv.append(b'--')
 996     if patterns:
 997         argv.extend(patterns)
 998     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
 999                          close_fds=True)
1000     out = p.stdout.read().strip()
1001     rv = p.wait()  # not fatal
1002     if rv:
1003         assert(not out)
1004     if out:
1005         for d in out.split(b'\n'):
1006             sha, name = d.split(b' ', 1)
1007             yield name, unhexlify(sha)
1008
1009
def read_ref(refname, repo_dir = None):
    """Return the binary id that refname points at, or None when
    list_refs() (limited to heads) reports no match.  More than one
    match trips an assertion."""
    found = list_refs(patterns=[refname], repo_dir=repo_dir,
                      limit_to_heads=True)
    # Take at most two so that an ambiguous name can be detected.
    matches = tuple(islice(found, 2))
    if not matches:
        return None
    assert len(matches) == 1
    return matches[0][1]
1019
1020
def rev_list_invocation(ref_or_refs, format=None):
    """Return the "git rev-list" argv for one ref (bytes) or an
    iterable of refs.

    A non-empty format adds --pretty=format:<format>.  Refs starting
    with b'-' are rejected by assertion, and the argv always ends
    with b'--'.
    """
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    pretty = [b'--pretty=format:' + format] if format else []
    checked = []
    for ref in refs:
        assert not ref.startswith(b'-')
        checked.append(ref)
    return [b'git', b'rev-list'] + pretty + checked + [b'--']
1035
1036
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".

    With no format, yield one hex hash at a time.  With a format
    (parse must then be given too), pass it to rev-list and yield
    (oidx, parse(git_stdout)) per commit, calling parse with the
    stream positioned just after the "commit HASH" header line.

    Raises GitError if git rev-list exits non-zero.

    """
    assert bool(parse) == bool(format)
    argv = rev_list_invocation(ref_or_refs, format=format)
    proc = subprocess.Popen(argv,
                            env=_gitenv(repo_dir),
                            stdout=subprocess.PIPE,
                            close_fds=True)
    out = proc.stdout
    if format:
        # readline() returns b'' at EOF, which ends the loop
        for header in iter(out.readline, b''):
            header = header.strip()
            if not header.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(header))
            oidx = header[7:]
            assert len(oidx) == 40
            yield oidx, parse(out)
    else:
        for line in out:
            yield line.strip()

    status = proc.wait()  # not fatal
    if status:
        raise GitError('git rev-list returned error %d' % status)
1069
1070
def get_commit_dates(refs, repo_dir=None):
    """Return a list holding, for each ref in refs and in the same
       order, the author_sec of the commit that get_commit_items()
       returns for it."""
    return [get_commit_items(ref, cp(repo_dir)).author_sec
            for ref in refs]
1080
1081
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    The name is first looked up as a ref via read_ref().  Failing
    that, a 40-character value is treated as a hex object id and
    looked up in the repository's pack indexes.

    Returns the binary value of the hash if it is found, None if
    'committish' does not correspond to anything (including a
    40-character value that is not valid hex).
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) != 40:
        return None
    try:
        oid = unhexlify(committish)
    except (TypeError, ValueError):
        # Non-hex input raises TypeError on Python 2 but binascii.Error
        # (a ValueError subclass) on Python 3; neither names an object.
        return None

    if pL.exists(oid):
        return oid
    return None
1107
1108
def update_ref(refname, newval, oldval, repo_dir=None):
    """Point refname at newval by running "git update-ref".

    newval and oldval are binary ids passed to git in hex; an empty
    oldval is passed as an empty string.  Only refs under
    refs/heads/ or refs/tags/ are accepted (checked by assertion).
    """
    previous = oldval or b''
    assert refname.startswith((b'refs/heads/', b'refs/tags/'))
    argv = [b'git', b'update-ref', refname,
            hexlify(newval), hexlify(previous)]
    proc = subprocess.Popen(argv, env=_gitenv(repo_dir), close_fds=True)
    _git_wait(b'git update-ref', proc)
1120
1121
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference with "git update-ref -d" (see
    git update-ref(1)).  A non-empty oldvalue is appended to the
    command as an extra argument."""
    assert refname.startswith(b'refs/')
    argv = [b'git', b'update-ref', b'-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    proc = subprocess.Popen(argv, env=_gitenv(), close_fds=True)
    _git_wait('git update-ref', proc)
1130
1131
def guess_repo(path=None):
    """Set the global "repodir" without checking that it exists.

    A non-empty path always wins.  Otherwise an already-set repodir is
    kept, and if none is set it falls back to the BUP_DIR environment
    value and then to ~/.bup.  Code that needs an existing repository
    would normally call check_repo_or_die() instead.
    """
    global repodir
    if path:
        repodir = path
    if repodir:
        return
    repodir = environ.get(b'BUP_DIR') or os.path.expanduser(b'~/.bup')
1146
1147
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The repository location is chosen by guess_repo(path).  Raises
    GitError if the parent directory does not exist, if the target
    exists but is not a directory, or (via _git_wait) if one of the
    git commands fails.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    # Every argv element is bytes, matching the other git invocations
    # in this file (the index version used to be passed as a str).
    steps = (
        ('git init', [b'git', b'--bare', b'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        ('git config', [b'git', b'config', b'pack.indexVersion', b'2']),
        # Enable the reflog
        ('git config', [b'git', b'config', b'core.logAllRefUpdates',
                        b'true']),
    )
    for what, argv in steps:
        p = subprocess.Popen(argv, stdout=sys.stderr, env=_gitenv(),
                             close_fds=True)
        _git_wait(what, p)
1171
1172
def check_repo_or_die(path=None):
    """Return quietly if <repo>/objects/pack is a directory.

    Otherwise log an error and exit: status 15 when the repository
    path itself does not exist, status 14 when it exists but does not
    look like a repository.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + b'/objects/pack')
    if pack_st:
        if stat.S_ISDIR(pack_st.st_mode):
            return
    elif not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
1188
1189
def is_suitable_git(ver_str):
    """Classify the output of "git --version".

    ver_str is the bytes output, e.g. b'git version 2.30.1'.  Returns
    one of:

      'unrecognized' - ver_str lacks the b'git version ' prefix, or
                       what follows it does not start with a digit
      'insufficient' - the version is older than 1.5.6 (release
                       candidates of 1.5.6 included)
      'suitable'     - anything else

    This function only classifies; it never exits the process.
    (It used to call sys.exit(13) with no message when the text after
    the prefix was not a version number.)
    """
    prefix = b'git version '
    if not ver_str.startswith(prefix):
        return 'unrecognized'
    ver_str = ver_str[len(prefix):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        insufficient_patterns = (br'1\.[012345]rc',
                                 br'1\.[01234]\.',
                                 br'1\.5\.[012345]($|\.)',
                                 br'1\.5\.6-rc')
        for pattern in insufficient_patterns:
            if re.match(pattern, ver_str):
                return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    return 'unrecognized'
1209
# None until the git version has been accepted, True afterwards.
_git_great = None

def require_suitable_git(ver_str=None):
    """Raise GitError if the "git --version" output isn't recognized,
    and log an error and exit with status 1 if the version is older
    than 1.5.6.

    Rely on ver_str when provided, rather than invoking the git in the
    path.  Once a version has been accepted, or the environment sets
    BUP_GIT_VERSION_IS_FINE to yes/true/1 (case-insensitive), the
    result is remembered and later calls return immediately.

    """
    global _git_great
    if _git_great is not None:
        return
    override = environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower()
    if override in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'suitable':
        _git_great = True
        return
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    assert False
1238
1239
1240 class _AbortableIter:
1241     def __init__(self, it, onabort = None):
1242         self.it = it
1243         self.onabort = onabort
1244         self.done = None
1245
1246     def __iter__(self):
1247         return self
1248
1249     def __next__(self):
1250         try:
1251             return next(self.it)
1252         except StopIteration as e:
1253             self.done = True
1254             raise
1255         except:
1256             self.abort()
1257             raise
1258
1259     next = __next__
1260
1261     def abort(self):
1262         """Abort iteration and call the abortion callback, if needed."""
1263         if not self.done:
1264             self.done = True
1265             if self.onabort:
1266                 self.onabort()
1267
1268     def __del__(self):
1269         self.abort()
1270
1271
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    A single 'git cat-file --batch' subprocess is started lazily and
    reused across get() calls.  Only one get() may be in progress at
    a time; self.inprogress holds the ref being read, if any.
    """
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        # No subprocess is started until the first get().
        self.p = self.inprogress = None

    def _abort(self):
        """Close the subprocess's pipes, if there is one, and forget
        both it and any in-progress ref."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        """Drop any current subprocess and start a fresh
        'git cat-file --batch' for self.repo_dir."""
        self._abort()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        ref must not contain newlines or carriage returns and must not
        start with b'-' (checked by assertion).  Raises GitError if
        the header line from git isn't "<40-char id> <type> <size>".

        """
        # (Re)start the subprocess if it was never started or has exited.
        if not self.p or self.p.poll() != None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result == None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        # One request line in, one header line out.
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        # If reading the data fails or is abandoned, _abort() drops the
        # subprocess so the next get() starts a fresh one.
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self._abort)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            # The object data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception as e:
            it.abort()
            raise

    def _join(self, it):
        """Yield blob data reachable from the object that 'it' (a get()
        iterator) describes: the data itself for a blob, every entry
        recursively for a tree, and the commit's tree for a commit.
        Raises GitError for any other object type."""
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            # The first line of the commit is expected to be "tree <hex id>".
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d
1364
1365
# CatPipe instances, keyed by absolute repository path.
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating and remembering one on
    first use.  With no repo_dir, the global repodir is used, or
    repo() if that is unset."""
    key = os.path.abspath(repo_dir or repodir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1379
1380
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]},
    where each name has its refs/tags/ prefix removed."""
    by_oid = {}
    for refname, oid in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        # more than one tag can point at the same object
        by_oid.setdefault(oid, []).append(refname[10:])
    return by_oid
1391
1392
class MissingObject(KeyError):
    """KeyError for an object id that could not be found.  The binary
    id is kept in .oid and shown in hex in the message."""
    def __init__(self, oid):
        message = 'object %r is missing' % hexlify(oid)
        KeyError.__init__(self, message)
        self.oid = oid
1397
1398
WalkItem = namedtuple('WalkItem', 'oid type mode path chunk_path data')
# path is the mangled path.  When an item is a fragment of a chunked
# file, chunk_path is the path of that fragment within the file's
# chunk subtree, e.g. ['', '2d3115e', ...]; the top-level item for a
# chunked file has a chunk_path of [''].  For example, some chunk
# subtree of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1411
1412
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    The data field of every item, not just blobs, is None unless
    include_data is set.  Raises Exception for any object whose type
    is not blob, commit, or tree.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (hex id, path, chunk_path, mode); the starting
    # object has empty paths and no mode.
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        # The first item is the (oidx, type, size) header; a false oidx
        # there means the object is missing.
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full, since their
            # content is parsed below to find further objects.
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            # Parents inherit this commit's path, chunk_path and mode;
            # the commit's tree is queued with the git tree mode.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the path stays
                    # put and only the chunk_path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))