lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12 from numbers import Integral
  13
  14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  15 from bup.compat import (buffer,
  16                         byte_int, bytes_from_byte, bytes_from_uint,
  17                         environ,
  18                         items,
  19                         range,
  20                         reraise)
  21 from bup.io import path_msg
  22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  23                          exo,
  24                          fdatasync,
  25                          hostname, localtime, log,
  26                          merge_dict,
  27                          merge_iter,
  28                          mmap_read, mmap_readwrite,
  29                          parse_num,
  30                          progress, qprogress, stat_if_exists,
  31                          unlink,
  32                          utc_offset_str)
  33 from bup.pwdgrp import username, userfullname
  34
  35
  36 verbose = 0
  37 repodir = None  # The default repository, once initialized
  38
  39 _typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
  40 _typermap = {v: k for k, v in items(_typemap)}
  41
  42
  43 _total_searches = 0
  44 _total_steps = 0
  45
  46
  47 class GitError(Exception):
  48     pass
  49
  50
  51 def _gitenv(repo_dir=None):
  52     if not repo_dir:
  53         repo_dir = repo()
  54     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  55
  56 def _git_wait(cmd, p):
  57     rv = p.wait()
  58     if rv != 0:
  59         raise GitError('%r returned %d' % (cmd, rv))
  60
  61 def _git_exo(cmd, **kwargs):
  62     kwargs['check'] = False
  63     result = exo(cmd, **kwargs)
  64     _, _, proc = result
  65     if proc.returncode != 0:
  66         raise GitError('%r returned %d' % (cmd, proc.returncode))
  67     return result
  68
  69 def git_config_get(option, repo_dir=None):
  70     cmd = (b'git', b'config', b'--get', option)
  71     p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
  72                          env=_gitenv(repo_dir=repo_dir))
  73     r = p.stdout.read()
  74     rc = p.wait()
  75     if rc == 0:
  76         return r
  77     if rc != 1:
  78         raise GitError('%r returned %d' % (cmd, rc))
  79     return None
  80
  81
  82 def parse_tz_offset(s):
  83     """UTC offset in seconds."""
  84     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  85     if bytes_from_byte(s[0]) == b'-':
  86         return - tz_off
  87     return tz_off
  88
  89
  90 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
  91 # Make sure that's authoritative.
  92 _start_end_char = br'[^ .,:;<>"\'\0\n]'
  93 _content_char = br'[^\0\n<>]'
  94 _safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
  95     % (_start_end_char,
  96        _start_end_char, _content_char, _start_end_char)
  97 _tz_rx = br'[-+]\d\d[0-5]\d'
  98 _parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
  99 # Assumes every following line starting with a space is part of the
 100 # mergetag.  Is there a formal commit blob spec?
 101 _mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
 102 _commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
 103 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
 104 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
 105
 106 (?P<message>(?:.|\n)*)''' % (_parent_rx,
 107                              _safe_str_rx, _safe_str_rx, _tz_rx,
 108                              _safe_str_rx, _safe_str_rx, _tz_rx,
 109                              _mergetag_rx))
 110 _parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 111
 112 # Note that the author_sec and committer_sec values are (UTC) epoch
 113 # seconds, and for now the mergetag is not included.
 114 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 115                                        'author_name', 'author_mail',
 116                                        'author_sec', 'author_offset',
 117                                        'committer_name', 'committer_mail',
 118                                        'committer_sec', 'committer_offset',
 119                                        'message'])
 120
 121 def parse_commit(content):
 122     commit_match = re.match(_commit_rx, content)
 123     if not commit_match:
 124         raise Exception('cannot parse commit %r' % content)
 125     matches = commit_match.groupdict()
 126     return CommitInfo(tree=matches['tree'],
 127                       parents=re.findall(_parent_hash_rx, matches['parents']),
 128                       author_name=matches['author_name'],
 129                       author_mail=matches['author_mail'],
 130                       author_sec=int(matches['asec']),
 131                       author_offset=parse_tz_offset(matches['atz']),
 132                       committer_name=matches['committer_name'],
 133                       committer_mail=matches['committer_mail'],
 134                       committer_sec=int(matches['csec']),
 135                       committer_offset=parse_tz_offset(matches['ctz']),
 136                       message=matches['message'])
 137
 138
 139 def get_cat_data(cat_iterator, expected_type):
 140     _, kind, _ = next(cat_iterator)
 141     if kind != expected_type:
 142         raise Exception('expected %r, saw %r' % (expected_type, kind))
 143     return b''.join(cat_iterator)
 144
 145 def get_commit_items(id, cp):
 146     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 147
 148 def _local_git_date_str(epoch_sec):
 149     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 150
 151
 152 def _git_date_str(epoch_sec, tz_offset_sec):
 153     offs =  tz_offset_sec // 60
 154     return b'%d %s%02d%02d' \
 155         % (epoch_sec,
 156            b'+' if offs >= 0 else b'-',
 157            abs(offs) // 60,
 158            abs(offs) % 60)
 159
 160
 161 def repo(sub = b'', repo_dir=None):
 162     """Get the path to the git repository or one of its subdirectories."""
 163     repo_dir = repo_dir or repodir
 164     if not repo_dir:
 165         raise GitError('You should call check_repo_or_die()')
 166
 167     # If there's a .git subdirectory, then the actual repo is in there.
 168     gd = os.path.join(repo_dir, b'.git')
 169     if os.path.exists(gd):
 170         repo_dir = gd
 171
 172     return os.path.join(repo_dir, sub)
 173
 174
 175 _shorten_hash_rx = \
 176     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 177
 178 def shorten_hash(s):
 179     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 180
 181
 182 def repo_rel(path):
 183     full = os.path.abspath(path)
 184     fullrepo = os.path.abspath(repo(b''))
 185     if not fullrepo.endswith(b'/'):
 186         fullrepo += b'/'
 187     if full.startswith(fullrepo):
 188         path = full[len(fullrepo):]
 189     if path.startswith(b'index-cache/'):
 190         path = path[len(b'index-cache/'):]
 191     return shorten_hash(path)
 192
 193
 194 def all_packdirs():
 195     paths = [repo(b'objects/pack')]
 196     paths += glob.glob(repo(b'index-cache/*/.'))
 197     return paths
 198
 199
 200 def auto_midx(objdir):
 201     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 202     try:
 203         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 204     except OSError as e:
 205         # make sure 'args' gets printed to help with debugging
 206         add_error('%r: exception: %s' % (args, e))
 207         raise
 208     if rv:
 209         add_error('%r: returned %d' % (args, rv))
 210
 211     args = [path.exe(), b'bloom', b'--dir', objdir]
 212     try:
 213         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 214     except OSError as e:
 215         # make sure 'args' gets printed to help with debugging
 216         add_error('%r: exception: %s' % (args, e))
 217         raise
 218     if rv:
 219         add_error('%r: returned %d' % (args, rv))
 220
 221
 222 def mangle_name(name, mode, gitmode):
 223     """Mangle a file name to present an abstract name for segmented files.
 224     Mangled file names will have the ".bup" extension added to them. If a
 225     file's name already ends with ".bup", a ".bupl" extension is added to
 226     disambiguate normal files from segmented ones.
 227     """
 228     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 229         assert(stat.S_ISDIR(gitmode))
 230         return name + b'.bup'
 231     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 232         return name + b'.bupl'
 233     else:
 234         return name
 235
 236
 237 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 238 def demangle_name(name, mode):
 239     """Remove name mangling from a file name, if necessary.
 240
 241     The return value is a tuple (demangled_filename,mode), where mode is one of
 242     the following:
 243
 244     * BUP_NORMAL  : files that should be read as-is from the repository
 245     * BUP_CHUNKED : files that were chunked and need to be reassembled
 246
 247     For more information on the name mangling algorithm, see mangle_name()
 248     """
 249     if name.endswith(b'.bupl'):
 250         return (name[:-5], BUP_NORMAL)
 251     elif name.endswith(b'.bup'):
 252         return (name[:-4], BUP_CHUNKED)
 253     elif name.endswith(b'.bupm'):
 254         return (name[:-5],
 255                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 256     else:
 257         return (name, BUP_NORMAL)
 258
 259
 260 def calc_hash(type, content):
 261     """Calculate some content's hash in the Git fashion."""
 262     header = b'%s %d\0' % (type, len(content))
 263     sum = Sha1(header)
 264     sum.update(content)
 265     return sum.digest()
 266
 267
 268 def shalist_item_sort_key(ent):
 269     (mode, name, id) = ent
 270     assert(mode+0 == mode)
 271     if stat.S_ISDIR(mode):
 272         return name + b'/'
 273     else:
 274         return name
 275
 276
 277 def tree_encode(shalist):
 278     """Generate a git tree object from (mode,name,hash) tuples."""
 279     shalist = sorted(shalist, key = shalist_item_sort_key)
 280     l = []
 281     for (mode,name,bin) in shalist:
 282         assert(mode)
 283         assert(mode+0 == mode)
 284         assert(name)
 285         assert(len(bin) == 20)
 286         s = b'%o %s\0%s' % (mode,name,bin)
 287         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 288         l.append(s)
 289     return b''.join(l)
 290
 291
 292 def tree_decode(buf):
 293     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 294     ofs = 0
 295     while ofs < len(buf):
 296         z = buf.find(b'\0', ofs)
 297         assert(z > ofs)
 298         spl = buf[ofs:z].split(b' ', 1)
 299         assert(len(spl) == 2)
 300         mode,name = spl
 301         sha = buf[z+1:z+1+20]
 302         ofs = z+1+20
 303         yield (int(mode, 8), name, sha)
 304
 305
 306 def _encode_packobj(type, content, compression_level=1):
 307     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 308         raise ValueError('invalid compression level %s' % compression_level)
 309     szout = b''
 310     sz = len(content)
 311     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 312     sz >>= 4
 313     while 1:
 314         if sz: szbits |= 0x80
 315         szout += bytes_from_uint(szbits)
 316         if not sz:
 317             break
 318         szbits = sz & 0x7f
 319         sz >>= 7
 320     z = zlib.compressobj(compression_level)
 321     yield szout
 322     yield z.compress(content)
 323     yield z.flush()
 324
 325
 326 def _encode_looseobj(type, content, compression_level=1):
 327     z = zlib.compressobj(compression_level)
 328     yield z.compress(b'%s %d\0' % (type, len(content)))
 329     yield z.compress(content)
 330     yield z.flush()
 331
 332
 333 def _decode_looseobj(buf):
 334     assert(buf);
 335     s = zlib.decompress(buf)
 336     i = s.find(b'\0')
 337     assert(i > 0)
 338     l = s[:i].split(b' ')
 339     type = l[0]
 340     sz = int(l[1])
 341     content = s[i+1:]
 342     assert(type in _typemap)
 343     assert(sz == len(content))
 344     return (type, content)
 345
 346
 347 def _decode_packobj(buf):
 348     assert(buf)
 349     c = byte_int(buf[0])
 350     type = _typermap[(c & 0x70) >> 4]
 351     sz = c & 0x0f
 352     shift = 4
 353     i = 0
 354     while c & 0x80:
 355         i += 1
 356         c = byte_int(buf[i])
 357         sz |= (c & 0x7f) << shift
 358         shift += 7
 359         if not (c & 0x80):
 360             break
 361     return (type, zlib.decompress(buf[i+1:]))
 362
 363
 364 class PackIdx:
 365     def __init__(self):
 366         assert(0)
 367
 368     def find_offset(self, hash):
 369         """Get the offset of an object inside the index file."""
 370         idx = self._idx_from_hash(hash)
 371         if idx != None:
 372             return self._ofs_from_idx(idx)
 373         return None
 374
 375     def exists(self, hash, want_source=False):
 376         """Return nonempty if the object exists in this index."""
 377         if hash and (self._idx_from_hash(hash) != None):
 378             return want_source and os.path.basename(self.name) or True
 379         return None
 380
 381     def _idx_from_hash(self, hash):
 382         global _total_searches, _total_steps
 383         _total_searches += 1
 384         assert(len(hash) == 20)
 385         b1 = byte_int(hash[0])
 386         start = self.fanout[b1-1] # range -1..254
 387         end = self.fanout[b1] # range 0..255
 388         want = hash
 389         _total_steps += 1  # lookup table is a step
 390         while start < end:
 391             _total_steps += 1
 392             mid = start + (end - start) // 2
 393             v = self._idx_to_hash(mid)
 394             if v < want:
 395                 start = mid+1
 396             elif v > want:
 397                 end = mid
 398             else: # got it!
 399                 return mid
 400         return None
 401
 402
 403 class PackIdxV1(PackIdx):
 404     """Object representation of a Git pack index (version 1) file."""
 405     def __init__(self, filename, f):
 406         self.name = filename
 407         self.idxnames = [self.name]
 408         self.map = mmap_read(f)
 409         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 410         self.fanout = array('L', struct.unpack('!256I', self.map))
 411         self.fanout.append(0)  # entry "-1"
 412         self.nsha = self.fanout[255]
 413         self.sha_ofs = 256 * 4
 414         # Avoid slicing shatable for individual hashes (very high overhead)
 415         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 416
 417     def __len__(self):
 418         return int(self.nsha)  # int() from long for python 2
 419
 420     def _ofs_from_idx(self, idx):
 421         if idx >= self.nsha or idx < 0:
 422             raise IndexError('invalid pack index index %d' % idx)
 423         ofs = self.sha_ofs + idx * 24
 424         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 425
 426     def _idx_to_hash(self, idx):
 427         if idx >= self.nsha or idx < 0:
 428             raise IndexError('invalid pack index index %d' % idx)
 429         ofs = self.sha_ofs + idx * 24 + 4
 430         return self.map[ofs : ofs + 20]
 431
 432     def __iter__(self):
 433         start = self.sha_ofs + 4
 434         for ofs in range(start, start + 24 * self.nsha, 24):
 435             yield self.map[ofs : ofs + 20]
 436
 437
 438 class PackIdxV2(PackIdx):
 439     """Object representation of a Git pack index (version 2) file."""
 440     def __init__(self, filename, f):
 441         self.name = filename
 442         self.idxnames = [self.name]
 443         self.map = mmap_read(f)
 444         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 445         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 446         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 447         self.fanout.append(0)
 448         self.nsha = self.fanout[255]
 449         self.sha_ofs = 8 + 256*4
 450         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 451         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 452         # Avoid slicing this for individual hashes (very high overhead)
 453         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 454
 455     def __len__(self):
 456         return int(self.nsha)  # int() from long for python 2
 457
 458     def _ofs_from_idx(self, idx):
 459         if idx >= self.nsha or idx < 0:
 460             raise IndexError('invalid pack index index %d' % idx)
 461         ofs_ofs = self.ofstable_ofs + idx * 4
 462         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 463         if ofs & 0x80000000:
 464             idx64 = ofs & 0x7fffffff
 465             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 466             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 467         return ofs
 468
 469     def _idx_to_hash(self, idx):
 470         if idx >= self.nsha or idx < 0:
 471             raise IndexError('invalid pack index index %d' % idx)
 472         ofs = self.sha_ofs + idx * 20
 473         return self.map[ofs : ofs + 20]
 474
 475     def __iter__(self):
 476         start = self.sha_ofs
 477         for ofs in range(start, start + 20 * self.nsha, 20):
 478             yield self.map[ofs : ofs + 20]
 479
 480
 481 _mpi_count = 0
 482 class PackIdxList:
 483     def __init__(self, dir, ignore_midx=False):
 484         global _mpi_count
 485         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 486         _mpi_count += 1
 487         self.dir = dir
 488         self.also = set()
 489         self.packs = []
 490         self.do_bloom = False
 491         self.bloom = None
 492         self.ignore_midx = ignore_midx
 493         self.refresh()
 494
 495     def __del__(self):
 496         global _mpi_count
 497         _mpi_count -= 1
 498         assert(_mpi_count == 0)
 499
 500     def __iter__(self):
 501         return iter(idxmerge(self.packs))
 502
 503     def __len__(self):
 504         return sum(len(pack) for pack in self.packs)
 505
 506     def exists(self, hash, want_source=False):
 507         """Return nonempty if the object exists in the index files."""
 508         global _total_searches
 509         _total_searches += 1
 510         if hash in self.also:
 511             return True
 512         if self.do_bloom and self.bloom:
 513             if self.bloom.exists(hash):
 514                 self.do_bloom = False
 515             else:
 516                 _total_searches -= 1  # was counted by bloom
 517                 return None
 518         for i in range(len(self.packs)):
 519             p = self.packs[i]
 520             _total_searches -= 1  # will be incremented by sub-pack
 521             ix = p.exists(hash, want_source=want_source)
 522             if ix:
 523                 # reorder so most recently used packs are searched first
 524                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 525                 return ix
 526         self.do_bloom = True
 527         return None
 528
 529     def refresh(self, skip_midx = False):
 530         """Refresh the index list.
 531         This method verifies if .midx files were superseded (e.g. all of its
 532         contents are in another, bigger .midx file) and removes the superseded
 533         files.
 534
 535         If skip_midx is True, all work on .midx files will be skipped and .midx
 536         files will be removed from the list.
 537
 538         The instance variable 'ignore_midx' can force this function to
 539         always act as if skip_midx was True.
 540         """
 541         if self.bloom is not None:
 542             self.bloom.close()
 543         self.bloom = None # Always reopen the bloom as it may have been relaced
 544         self.do_bloom = False
 545         skip_midx = skip_midx or self.ignore_midx
 546         d = dict((p.name, p) for p in self.packs
 547                  if not skip_midx or not isinstance(p, midx.PackMidx))
 548         if os.path.exists(self.dir):
 549             if not skip_midx:
 550                 midxl = []
 551                 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
 552                 # remove any *.midx files from our list that no longer exist
 553                 for ix in list(d.values()):
 554                     if not isinstance(ix, midx.PackMidx):
 555                         continue
 556                     if ix.name in midxes:
 557                         continue
 558                     # remove the midx
 559                     del d[ix.name]
 560                     ix.close()
 561                     self.packs.remove(ix)
 562                 for ix in self.packs:
 563                     if isinstance(ix, midx.PackMidx):
 564                         for name in ix.idxnames:
 565                             d[os.path.join(self.dir, name)] = ix
 566                 for full in midxes:
 567                     if not d.get(full):
 568                         mx = midx.PackMidx(full)
 569                         (mxd, mxf) = os.path.split(mx.name)
 570                         broken = False
 571                         for n in mx.idxnames:
 572                             if not os.path.exists(os.path.join(mxd, n)):
 573                                 log(('warning: index %s missing\n'
 574                                      '  used by %s\n')
 575                                     % (path_msg(n), path_msg(mxf)))
 576                                 broken = True
 577                         if broken:
 578                             mx.close()
 579                             del mx
 580                             unlink(full)
 581                         else:
 582                             midxl.append(mx)
 583                 midxl.sort(key=lambda ix:
 584                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 585                 for ix in midxl:
 586                     any_needed = False
 587                     for sub in ix.idxnames:
 588                         found = d.get(os.path.join(self.dir, sub))
 589                         if not found or isinstance(found, PackIdx):
 590                             # doesn't exist, or exists but not in a midx
 591                             any_needed = True
 592                             break
 593                     if any_needed:
 594                         d[ix.name] = ix
 595                         for name in ix.idxnames:
 596                             d[os.path.join(self.dir, name)] = ix
 597                     elif not ix.force_keep:
 598                         debug1('midx: removing redundant: %s\n'
 599                                % path_msg(os.path.basename(ix.name)))
 600                         ix.close()
 601                         unlink(ix.name)
 602             for full in glob.glob(os.path.join(self.dir, b'*.idx')):
 603                 if not d.get(full):
 604                     try:
 605                         ix = open_idx(full)
 606                     except GitError as e:
 607                         add_error(e)
 608                         continue
 609                     d[full] = ix
 610             bfull = os.path.join(self.dir, b'bup.bloom')
 611             if self.bloom is None and os.path.exists(bfull):
 612                 self.bloom = bloom.ShaBloom(bfull)
 613             self.packs = list(set(d.values()))
 614             self.packs.sort(reverse=True, key=lambda x: len(x))
 615             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 616                 self.do_bloom = True
 617             else:
 618                 self.bloom = None
 619         debug1('PackIdxList: using %d index%s.\n'
 620             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 621
 622     def add(self, hash):
 623         """Insert an additional object in the list."""
 624         self.also.add(hash)
 625
 626
 627 def open_idx(filename):
 628     if filename.endswith(b'.idx'):
 629         f = open(filename, 'rb')
 630         header = f.read(8)
 631         if header[0:4] == b'\377tOc':
 632             version = struct.unpack('!I', header[4:8])[0]
 633             if version == 2:
 634                 return PackIdxV2(filename, f)
 635             else:
 636                 raise GitError('%s: expected idx file version 2, got %d'
 637                                % (path_msg(filename), version))
 638         elif len(header) == 8 and header[0:4] < b'\377tOc':
 639             return PackIdxV1(filename, f)
 640         else:
 641             raise GitError('%s: unrecognized idx file header'
 642                            % path_msg(filename))
 643     elif filename.endswith(b'.midx'):
 644         return midx.PackMidx(filename)
 645     else:
 646         raise GitError('idx filenames must end with .idx or .midx')
 647
 648
 649 def idxmerge(idxlist, final_progress=True):
 650     """Generate a list of all the objects reachable in a PackIdxList."""
 651     def pfunc(count, total):
 652         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 653                   % (count*100.0/total, count, total))
 654     def pfinal(count, total):
 655         if final_progress:
 656             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 657                      % (100, total, total))
 658     return merge_iter(idxlist, 10024, pfunc, pfinal)
 659
 660
 661 def _make_objcache():
 662     return PackIdxList(repo(b'objects/pack'))
 663
 664 # bup-gc assumes that it can disable all PackWriter activities
 665 # (bloom/midx/cache) via the constructor and close() arguments.
 666
 667 class PackWriter:
 668     """Writes Git objects inside a pack file."""
 669     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 670                  run_midx=True, on_pack_finish=None,
 671                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 672         self.repo_dir = repo_dir or repo()
 673         self.file = None
 674         self.parentfd = None
 675         self.count = 0
 676         self.outbytes = 0
 677         self.filename = None
 678         self.idx = None
 679         self.objcache_maker = objcache_maker
 680         self.objcache = None
 681         self.compression_level = compression_level
 682         self.run_midx=run_midx
 683         self.on_pack_finish = on_pack_finish
 684         if not max_pack_size:
 685             max_pack_size = git_config_get(b'pack.packSizeLimit',
 686                                            repo_dir=self.repo_dir)
 687             if max_pack_size is not None:
 688                 max_pack_size = parse_num(max_pack_size)
 689             if not max_pack_size:
 690                 # larger packs slow down pruning
 691                 max_pack_size = 1000 * 1000 * 1000
 692         self.max_pack_size = max_pack_size
 693         # cache memory usage is about 83 bytes per object
 694         self.max_pack_objects = max_pack_objects if max_pack_objects \
 695                                 else max(1, self.max_pack_size // 5000)
 696
 697     def __del__(self):
 698         self.close()
 699
 700     def __enter__(self):
 701         return self
 702
 703     def __exit__(self, type, value, traceback):
 704         self.close()
 705
 706     def _open(self):
 707         if not self.file:
 708             objdir = dir = os.path.join(self.repo_dir, b'objects')
 709             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 710             try:
 711                 self.file = os.fdopen(fd, 'w+b')
 712             except:
 713                 os.close(fd)
 714                 raise
 715             try:
 716                 self.parentfd = os.open(objdir, os.O_RDONLY)
 717             except:
 718                 f = self.file
 719                 self.file = None
 720                 f.close()
 721                 raise
 722             assert name.endswith(b'.pack')
 723             self.filename = name[:-5]
 724             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 725             self.idx = list(list() for i in range(256))
 726
 727     def _raw_write(self, datalist, sha):
 728         self._open()
 729         f = self.file
 730         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 731         # the file never has a *partial* blob.  So let's make sure it's
 732         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 733         # to our hashsplit algorithm.)  f.write() does its own buffering,
 734         # but that's okay because we'll flush it in _end().
 735         oneblob = b''.join(datalist)
 736         try:
 737             f.write(oneblob)
 738         except IOError as e:
 739             reraise(GitError(e))
 740         nw = len(oneblob)
 741         crc = zlib.crc32(oneblob) & 0xffffffff
 742         self._update_idx(sha, crc, nw)
 743         self.outbytes += nw
 744         self.count += 1
 745         return nw, crc
 746
 747     def _update_idx(self, sha, crc, size):
 748         assert(sha)
 749         if self.idx:
 750             self.idx[byte_int(sha[0])].append((sha, crc,
 751                                                self.file.tell() - size))
 752
 753     def _write(self, sha, type, content):
 754         if verbose:
 755             log('>')
 756         if not sha:
 757             sha = calc_hash(type, content)
 758         size, crc = self._raw_write(_encode_packobj(type, content,
 759                                                     self.compression_level),
 760                                     sha=sha)
 761         if self.outbytes >= self.max_pack_size \
 762            or self.count >= self.max_pack_objects:
 763             self.breakpoint()
 764         return sha
 765
 766     def breakpoint(self):
 767         """Clear byte and object counts and return the last processed id."""
 768         id = self._end(self.run_midx)
 769         self.outbytes = self.count = 0
 770         return id
 771
 772     def _require_objcache(self):
 773         if self.objcache is None and self.objcache_maker:
 774             self.objcache = self.objcache_maker()
 775         if self.objcache is None:
 776             raise GitError(
 777                     "PackWriter not opened or can't check exists w/o objcache")
 778
 779     def exists(self, id, want_source=False):
 780         """Return non-empty if an object is found in the object cache."""
 781         self._require_objcache()
 782         return self.objcache.exists(id, want_source=want_source)
 783
 784     def just_write(self, sha, type, content):
 785         """Write an object to the pack file without checking for duplication."""
 786         self._write(sha, type, content)
 787         # If nothing else, gc doesn't have/want an objcache
 788         if self.objcache is not None:
 789             self.objcache.add(sha)
 790
 791     def maybe_write(self, type, content):
 792         """Write an object to the pack file if not present and return its id."""
 793         sha = calc_hash(type, content)
 794         if not self.exists(sha):
 795             self._require_objcache()
 796             self.just_write(sha, type, content)
 797         return sha
 798
 799     def new_blob(self, blob):
 800         """Create a blob object in the pack with the supplied content."""
 801         return self.maybe_write(b'blob', blob)
 802
 803     def new_tree(self, shalist):
 804         """Create a tree object in the pack."""
 805         content = tree_encode(shalist)
 806         return self.maybe_write(b'tree', content)
 807
 808     def new_commit(self, tree, parent,
 809                    author, adate_sec, adate_tz,
 810                    committer, cdate_sec, cdate_tz,
 811                    msg):
 812         """Create a commit object in the pack.  The date_sec values must be
 813         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 814         if adate_tz:
 815             adate_str = _git_date_str(adate_sec, adate_tz)
 816         else:
 817             adate_str = _local_git_date_str(adate_sec)
 818         if cdate_tz:
 819             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 820         else:
 821             cdate_str = _local_git_date_str(cdate_sec)
 822         l = []
 823         if tree: l.append(b'tree %s' % hexlify(tree))
 824         if parent: l.append(b'parent %s' % hexlify(parent))
 825         if author: l.append(b'author %s %s' % (author, adate_str))
 826         if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 827         l.append(b'')
 828         l.append(msg)
 829         return self.maybe_write(b'commit', b'\n'.join(l))
 830
 831     def abort(self):
 832         """Remove the pack file from disk."""
 833         f = self.file
 834         if f:
 835             pfd = self.parentfd
 836             self.file = None
 837             self.parentfd = None
 838             self.idx = None
 839             try:
 840                 try:
 841                     os.unlink(self.filename + b'.pack')
 842                 finally:
 843                     f.close()
 844             finally:
 845                 if pfd is not None:
 846                     os.close(pfd)
 847
 848     def _end(self, run_midx=True):
 849         f = self.file
 850         if not f: return None
 851         self.file = None
 852         try:
 853             self.objcache = None
 854             idx = self.idx
 855             self.idx = None
 856
 857             # update object count
 858             f.seek(8)
 859             cp = struct.pack('!i', self.count)
 860             assert(len(cp) == 4)
 861             f.write(cp)
 862
 863             # calculate the pack sha1sum
 864             f.seek(0)
 865             sum = Sha1()
 866             for b in chunkyreader(f):
 867                 sum.update(b)
 868             packbin = sum.digest()
 869             f.write(packbin)
 870             fdatasync(f.fileno())
 871         finally:
 872             f.close()
 873
 874         obj_list_sha = self._write_pack_idx_v2(self.filename + b'.idx', idx,
 875                                                packbin)
 876         nameprefix = os.path.join(self.repo_dir,
 877                                   b'objects/pack/pack-' +  obj_list_sha)
 878         if os.path.exists(self.filename + b'.map'):
 879             os.unlink(self.filename + b'.map')
 880         os.rename(self.filename + b'.pack', nameprefix + b'.pack')
 881         os.rename(self.filename + b'.idx', nameprefix + b'.idx')
 882         try:
 883             os.fsync(self.parentfd)
 884         finally:
 885             os.close(self.parentfd)
 886
 887         if run_midx:
 888             auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
 889
 890         if self.on_pack_finish:
 891             self.on_pack_finish(nameprefix)
 892
 893         return nameprefix
 894
 895     def close(self, run_midx=True):
 896         """Close the pack file and move it to its definitive path."""
 897         return self._end(run_midx=run_midx)
 898
 899     def _write_pack_idx_v2(self, filename, idx, packbin):
 900         ofs64_count = 0
 901         for section in idx:
 902             for entry in section:
 903                 if entry[2] >= 2**31:
 904                     ofs64_count += 1
 905
 906         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 907         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 908         idx_map = None
 909         idx_f = open(filename, 'w+b')
 910         try:
 911             idx_f.truncate(index_len)
 912             fdatasync(idx_f.fileno())
 913             idx_map = mmap_readwrite(idx_f, close=False)
 914             try:
 915                 count = _helpers.write_idx(filename, idx_map, idx, self.count)
 916                 assert(count == self.count)
 917                 idx_map.flush()
 918             finally:
 919                 idx_map.close()
 920         finally:
 921             idx_f.close()
 922
 923         idx_f = open(filename, 'a+b')
 924         try:
 925             idx_f.write(packbin)
 926             idx_f.seek(0)
 927             idx_sum = Sha1()
 928             b = idx_f.read(8 + 4*256)
 929             idx_sum.update(b)
 930
 931             obj_list_sum = Sha1()
 932             for b in chunkyreader(idx_f, 20*self.count):
 933                 idx_sum.update(b)
 934                 obj_list_sum.update(b)
 935             namebase = hexlify(obj_list_sum.digest())
 936
 937             for b in chunkyreader(idx_f):
 938                 idx_sum.update(b)
 939             idx_f.write(idx_sum.digest())
 940             fdatasync(idx_f.fileno())
 941             return namebase
 942         finally:
 943             idx_f.close()
 944
 945
 946 def list_refs(patterns=None, repo_dir=None,
 947               limit_to_heads=False, limit_to_tags=False):
 948     """Yield (refname, hash) tuples for all repository refs unless
 949     patterns are specified.  In that case, only include tuples for
 950     refs matching those patterns (cf. git-show-ref(1)).  The limits
 951     restrict the result items to refs/heads or refs/tags.  If both
 952     limits are specified, items from both sources will be included.
 953
 954     """
 955     argv = [b'git', b'show-ref']
 956     if limit_to_heads:
 957         argv.append(b'--heads')
 958     if limit_to_tags:
 959         argv.append(b'--tags')
 960     argv.append(b'--')
 961     if patterns:
 962         argv.extend(patterns)
 963     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
 964     out = p.stdout.read().strip()
 965     rv = p.wait()  # not fatal
 966     if rv:
 967         assert(not out)
 968     if out:
 969         for d in out.split(b'\n'):
 970             sha, name = d.split(b' ', 1)
 971             yield name, unhexlify(sha)
 972
 973
 974 def read_ref(refname, repo_dir = None):
 975     """Get the commit id of the most recent commit made on a given ref."""
 976     refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
 977     l = tuple(islice(refs, 2))
 978     if l:
 979         assert(len(l) == 1)
 980         return l[0][1]
 981     else:
 982         return None
 983
 984
 985 def rev_list_invocation(ref_or_refs, format=None):
 986     if isinstance(ref_or_refs, bytes):
 987         refs = (ref_or_refs,)
 988     else:
 989         refs = ref_or_refs
 990     argv = [b'git', b'rev-list']
 991
 992     if format:
 993         argv.append(b'--pretty=format:' + format)
 994     for ref in refs:
 995         assert not ref.startswith(b'-')
 996         argv.append(ref)
 997     argv.append(b'--')
 998     return argv
 999
1000
1001 def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
1002     """Yield information about commits as per "git rev-list".  If a format
1003     is not provided, yield one hex hash at a time.  If a format is
1004     provided, pass it to rev-list and call parse(git_stdout) for each
1005     commit with the stream positioned just after the rev-list "commit
1006     HASH" header line.  When a format is provided yield (oidx,
1007     parse(git_stdout)) for each commit.
1008
1009     """
1010     assert bool(parse) == bool(format)
1011     p = subprocess.Popen(rev_list_invocation(ref_or_refs,
1012                                              format=format),
1013                          env=_gitenv(repo_dir),
1014                          stdout = subprocess.PIPE)
1015     if not format:
1016         for line in p.stdout:
1017             yield line.strip()
1018     else:
1019         line = p.stdout.readline()
1020         while line:
1021             s = line.strip()
1022             if not s.startswith(b'commit '):
1023                 raise Exception('unexpected line ' + repr(s))
1024             s = s[7:]
1025             assert len(s) == 40
1026             yield s, parse(p.stdout)
1027             line = p.stdout.readline()
1028
1029     rv = p.wait()  # not fatal
1030     if rv:
1031         raise GitError('git rev-list returned error %d' % rv)
1032
1033
1034 def get_commit_dates(refs, repo_dir=None):
1035     """Get the dates for the specified commit refs.  For now, every unique
1036        string in refs must resolve to a different commit or this
1037        function will fail."""
1038     result = []
1039     for ref in refs:
1040         commit = get_commit_items(ref, cp(repo_dir))
1041         result.append(commit.author_sec)
1042     return result
1043
1044
1045 def rev_parse(committish, repo_dir=None):
1046     """Resolve the full hash for 'committish', if it exists.
1047
1048     Should be roughly equivalent to 'git rev-parse'.
1049
1050     Returns the hex value of the hash if it is found, None if 'committish' does
1051     not correspond to anything.
1052     """
1053     head = read_ref(committish, repo_dir=repo_dir)
1054     if head:
1055         debug2("resolved from ref: commit = %s\n" % hexlify(head))
1056         return head
1057
1058     pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))
1059
1060     if len(committish) == 40:
1061         try:
1062             hash = unhexlify(committish)
1063         except TypeError:
1064             return None
1065
1066         if pL.exists(hash):
1067             return hash
1068
1069     return None
1070
1071
1072 def update_ref(refname, newval, oldval, repo_dir=None):
1073     """Update a repository reference."""
1074     if not oldval:
1075         oldval = b''
1076     assert refname.startswith(b'refs/heads/') \
1077         or refname.startswith(b'refs/tags/')
1078     p = subprocess.Popen([b'git', b'update-ref', refname,
1079                           hexlify(newval), hexlify(oldval)],
1080                          env=_gitenv(repo_dir))
1081     _git_wait(b'git update-ref', p)
1082
1083
1084 def delete_ref(refname, oldvalue=None):
1085     """Delete a repository reference (see git update-ref(1))."""
1086     assert refname.startswith(b'refs/')
1087     oldvalue = [] if not oldvalue else [oldvalue]
1088     p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
1089                          env=_gitenv())
1090     _git_wait('git update-ref', p)
1091
1092
1093 def guess_repo(path=None):
1094     """Set the path value in the global variable "repodir".
1095     This makes bup look for an existing bup repository, but not fail if a
1096     repository doesn't exist. Usually, if you are interacting with a bup
1097     repository, you would not be calling this function but using
1098     check_repo_or_die().
1099     """
1100     global repodir
1101     if path:
1102         repodir = path
1103     if not repodir:
1104         repodir = environ.get(b'BUP_DIR')
1105         if not repodir:
1106             repodir = os.path.expanduser(b'~/.bup')
1107
1108
1109 def init_repo(path=None):
1110     """Create the Git bare repository for bup in a given path."""
1111     guess_repo(path)
1112     d = repo()  # appends a / to the path
1113     parent = os.path.dirname(os.path.dirname(d))
1114     if parent and not os.path.exists(parent):
1115         raise GitError('parent directory "%s" does not exist\n'
1116                        % path_msg(parent))
1117     if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
1118         raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
1119     p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
1120                          env=_gitenv())
1121     _git_wait('git init', p)
1122     # Force the index version configuration in order to ensure bup works
1123     # regardless of the version of the installed Git binary.
1124     p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
1125                          stdout=sys.stderr, env=_gitenv())
1126     _git_wait('git config', p)
1127     # Enable the reflog
1128     p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
1129                          stdout=sys.stderr, env=_gitenv())
1130     _git_wait('git config', p)
1131
1132
1133 def check_repo_or_die(path=None):
1134     """Check to see if a bup repository probably exists, and abort if not."""
1135     guess_repo(path)
1136     top = repo()
1137     pst = stat_if_exists(top + b'/objects/pack')
1138     if pst and stat.S_ISDIR(pst.st_mode):
1139         return
1140     if not pst:
1141         top_st = stat_if_exists(top)
1142         if not top_st:
1143             log('error: repository %r does not exist (see "bup help init")\n'
1144                 % top)
1145             sys.exit(15)
1146     log('error: %s is not a repository\n' % path_msg(top))
1147     sys.exit(14)
1148
1149
1150 def is_suitable_git(ver_str):
1151     if not ver_str.startswith(b'git version '):
1152         return 'unrecognized'
1153     ver_str = ver_str[len(b'git version '):]
1154     if ver_str.startswith(b'0.'):
1155         return 'insufficient'
1156     if ver_str.startswith(b'1.'):
1157         if re.match(br'1\.[012345]rc', ver_str):
1158             return 'insufficient'
1159         if re.match(br'1\.[01234]\.', ver_str):
1160             return 'insufficient'
1161         if re.match(br'1\.5\.[012345]($|\.)', ver_str):
1162             return 'insufficient'
1163         if re.match(br'1\.5\.6-rc', ver_str):
1164             return 'insufficient'
1165         return 'suitable'
1166     if re.match(br'[0-9]+(\.|$)?', ver_str):
1167         return 'suitable'
1168     sys.exit(13)
1169
1170 _git_great = None
1171
1172 def require_suitable_git(ver_str=None):
1173     """Raise GitError if the version of git isn't suitable.
1174
1175     Rely on ver_str when provided, rather than invoking the git in the
1176     path.
1177
1178     """
1179     global _git_great
1180     if _git_great is not None:
1181         return
1182     if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
1183        in (b'yes', b'true', b'1'):
1184         _git_great = True
1185         return
1186     if not ver_str:
1187         ver_str, _, _ = _git_exo([b'git', b'--version'])
1188     status = is_suitable_git(ver_str)
1189     if status == 'unrecognized':
1190         raise GitError('Unexpected git --version output: %r' % ver_str)
1191     if status == 'insufficient':
1192         log('error: git version must be at least 1.5.6\n')
1193         sys.exit(1)
1194     if status == 'suitable':
1195         _git_great = True
1196         return
1197     assert False
1198
1199
1200 class _AbortableIter:
1201     def __init__(self, it, onabort = None):
1202         self.it = it
1203         self.onabort = onabort
1204         self.done = None
1205
1206     def __iter__(self):
1207         return self
1208
1209     def __next__(self):
1210         try:
1211             return next(self.it)
1212         except StopIteration as e:
1213             self.done = True
1214             raise
1215         except:
1216             self.abort()
1217             raise
1218
1219     next = __next__
1220
1221     def abort(self):
1222         """Abort iteration and call the abortion callback, if needed."""
1223         if not self.done:
1224             self.done = True
1225             if self.onabort:
1226                 self.onabort()
1227
1228     def __del__(self):
1229         self.abort()
1230
1231
1232 class CatPipe:
1233     """Link to 'git cat-file' that is used to retrieve blob data."""
1234     def __init__(self, repo_dir = None):
1235         require_suitable_git()
1236         self.repo_dir = repo_dir
1237         self.p = self.inprogress = None
1238
1239     def _abort(self):
1240         if self.p:
1241             self.p.stdout.close()
1242             self.p.stdin.close()
1243         self.p = None
1244         self.inprogress = None
1245
1246     def restart(self):
1247         self._abort()
1248         self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1249                                   stdin=subprocess.PIPE,
1250                                   stdout=subprocess.PIPE,
1251                                   close_fds = True,
1252                                   bufsize = 4096,
1253                                   env=_gitenv(self.repo_dir))
1254
1255     def get(self, ref):
1256         """Yield (oidx, type, size), followed by the data referred to by ref.
1257         If ref does not exist, only yield (None, None, None).
1258
1259         """
1260         if not self.p or self.p.poll() != None:
1261             self.restart()
1262         assert(self.p)
1263         poll_result = self.p.poll()
1264         assert(poll_result == None)
1265         if self.inprogress:
1266             log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1267         assert(not self.inprogress)
1268         assert ref.find(b'\n') < 0
1269         assert ref.find(b'\r') < 0
1270         assert not ref.startswith(b'-')
1271         self.inprogress = ref
1272         self.p.stdin.write(ref + b'\n')
1273         self.p.stdin.flush()
1274         hdr = self.p.stdout.readline()
1275         if hdr.endswith(b' missing\n'):
1276             self.inprogress = None
1277             yield None, None, None
1278             return
1279         info = hdr.split(b' ')
1280         if len(info) != 3 or len(info[0]) != 40:
1281             raise GitError('expected object (id, type, size), got %r' % info)
1282         oidx, typ, size = info
1283         size = int(size)
1284         it = _AbortableIter(chunkyreader(self.p.stdout, size),
1285                             onabort=self._abort)
1286         try:
1287             yield oidx, typ, size
1288             for blob in it:
1289                 yield blob
1290             readline_result = self.p.stdout.readline()
1291             assert readline_result == b'\n'
1292             self.inprogress = None
1293         except Exception as e:
1294             it.abort()
1295             raise
1296
1297     def _join(self, it):
1298         _, typ, _ = next(it)
1299         if typ == b'blob':
1300             for blob in it:
1301                 yield blob
1302         elif typ == b'tree':
1303             treefile = b''.join(it)
1304             for (mode, name, sha) in tree_decode(treefile):
1305                 for blob in self.join(hexlify(sha)):
1306                     yield blob
1307         elif typ == b'commit':
1308             treeline = b''.join(it).split(b'\n')[0]
1309             assert treeline.startswith(b'tree ')
1310             for blob in self.join(treeline[5:]):
1311                 yield blob
1312         else:
1313             raise GitError('invalid object type %r: expected blob/tree/commit'
1314                            % typ)
1315
1316     def join(self, id):
1317         """Generate a list of the content of all blobs that can be reached
1318         from an object.  The hash given in 'id' must point to a blob, a tree
1319         or a commit. The content of all blobs that can be seen from trees or
1320         commits will be added to the list.
1321         """
1322         for d in self._join(self.get(id)):
1323             yield d
1324
1325
1326 _cp = {}
1327
1328 def cp(repo_dir=None):
1329     """Create a CatPipe object or reuse the already existing one."""
1330     global _cp, repodir
1331     if not repo_dir:
1332         repo_dir = repodir or repo()
1333     repo_dir = os.path.abspath(repo_dir)
1334     cp = _cp.get(repo_dir)
1335     if not cp:
1336         cp = CatPipe(repo_dir)
1337         _cp[repo_dir] = cp
1338     return cp
1339
1340
1341 def tags(repo_dir = None):
1342     """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1343     tags = {}
1344     for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
1345         assert n.startswith(b'refs/tags/')
1346         name = n[10:]
1347         if not c in tags:
1348             tags[c] = []
1349         tags[c].append(name)  # more than one tag can point at 'c'
1350     return tags
1351
1352
1353 class MissingObject(KeyError):
1354     def __init__(self, oid):
1355         self.oid = oid
1356         KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
1357
1358
1359 WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
1360                                    'path', 'chunk_path', 'data'])
1361 # The path is the mangled path, and if an item represents a fragment
1362 # of a chunked file, the chunk_path will be the chunked subtree path
1363 # for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
1364 # chunked file will have a chunk_path of [''].  So some chunk subtree
1365 # of the file '/foo/bar/baz' might look like this:
1366 #
1367 #   item.path = ['foo', 'bar', 'baz.bup']
1368 #   item.chunk_path = ['', '2d3115e', '016b097']
1369 #   item.type = 'tree'
1370 #   ...
1371
1372
1373 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1374     """Yield everything reachable from oidx via get_ref (which must behave
1375     like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1376     returns true.  Throw MissingObject if a hash encountered is
1377     missing from the repository, and don't read or return blob content
1378     in the data field unless include_data is set.
1379
1380     """
1381     # Maintain the pending stack on the heap to avoid stack overflow
1382     pending = [(oidx, [], [], None)]
1383     while len(pending):
1384         oidx, parent_path, chunk_path, mode = pending.pop()
1385         oid = unhexlify(oidx)
1386         if stop_at and stop_at(oidx):
1387             continue
1388
1389         if (not include_data) and mode and stat.S_ISREG(mode):
1390             # If the object is a "regular file", then it's a leaf in
1391             # the graph, so we can skip reading the data if the caller
1392             # hasn't requested it.
1393             yield WalkItem(oid=oid, type=b'blob',
1394                            chunk_path=chunk_path, path=parent_path,
1395                            mode=mode,
1396                            data=None)
1397             continue
1398
1399         item_it = get_ref(oidx)
1400         get_oidx, typ, _ = next(item_it)
1401         if not get_oidx:
1402             raise MissingObject(unhexlify(oidx))
1403         if typ not in (b'blob', b'commit', b'tree'):
1404             raise Exception('unexpected repository object type %r' % typ)
1405
1406         # FIXME: set the mode based on the type when the mode is None
1407         if typ == b'blob' and not include_data:
1408             # Dump data until we can ask cat_pipe not to fetch it
1409             for ignored in item_it:
1410                 pass
1411             data = None
1412         else:
1413             data = b''.join(item_it)
1414
1415         yield WalkItem(oid=oid, type=typ,
1416                        chunk_path=chunk_path, path=parent_path,
1417                        mode=mode,
1418                        data=(data if include_data else None))
1419
1420         if typ == b'commit':
1421             commit_items = parse_commit(data)
1422             for pid in commit_items.parents:
1423                 pending.append((pid, parent_path, chunk_path, mode))
1424             pending.append((commit_items.tree, parent_path, chunk_path,
1425                             hashsplit.GIT_MODE_TREE))
1426         elif typ == b'tree':
1427             for mode, name, ent_id in tree_decode(data):
1428                 demangled, bup_type = demangle_name(name, mode)
1429                 if chunk_path:
1430                     sub_path = parent_path
1431                     sub_chunk_path = chunk_path + [name]
1432                 else:
1433                     sub_path = parent_path + [name]
1434                     if bup_type == BUP_CHUNKED:
1435                         sub_chunk_path = [b'']
1436                     else:
1437                         sub_chunk_path = chunk_path
1438                 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
1439                                 mode))