lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 from __future__ import absolute_import, print_function
   7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   8 from array import array
   9 from binascii import hexlify, unhexlify
  10 from collections import namedtuple
  11 from itertools import islice
  12 from numbers import Integral
  13
  14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
  15 from bup.compat import (buffer,
  16                         byte_int, bytes_from_byte, bytes_from_uint,
  17                         environ,
  18                         items,
  19                         range,
  20                         reraise)
  21 from bup.io import path_msg
  22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  23                          exo,
  24                          fdatasync,
  25                          hostname, localtime, log,
  26                          merge_dict,
  27                          merge_iter,
  28                          mmap_read, mmap_readwrite,
  29                          parse_num,
  30                          progress, qprogress, stat_if_exists,
  31                          unlink,
  32                          utc_offset_str)
  33 from bup.pwdgrp import username, userfullname
  34
  35
  36 verbose = 0
  37 repodir = None  # The default repository, once initialized
  38
  39 _typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
  40 _typermap = {v: k for k, v in items(_typemap)}
  41
  42
  43 _total_searches = 0
  44 _total_steps = 0
  45
  46
  47 class GitError(Exception):
  48     pass
  49
  50
  51 def _gitenv(repo_dir=None):
  52     if not repo_dir:
  53         repo_dir = repo()
  54     return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
  55
  56 def _git_wait(cmd, p):
  57     rv = p.wait()
  58     if rv != 0:
  59         raise GitError('%r returned %d' % (cmd, rv))
  60
  61 def _git_exo(cmd, **kwargs):
  62     kwargs['check'] = False
  63     result = exo(cmd, **kwargs)
  64     _, _, proc = result
  65     if proc.returncode != 0:
  66         raise GitError('%r returned %d' % (cmd, proc.returncode))
  67     return result
  68
  69 def git_config_get(option, repo_dir=None):
  70     cmd = (b'git', b'config', b'--get', option)
  71     p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
  72                          env=_gitenv(repo_dir=repo_dir))
  73     r = p.stdout.read()
  74     rc = p.wait()
  75     if rc == 0:
  76         return r
  77     if rc != 1:
  78         raise GitError('%r returned %d' % (cmd, rc))
  79     return None
  80
  81
  82 def parse_tz_offset(s):
  83     """UTC offset in seconds."""
  84     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  85     if bytes_from_byte(s[0]) == b'-':
  86         return - tz_off
  87     return tz_off
  88
  89
  90 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
  91 # Make sure that's authoritative.
  92 _start_end_char = br'[^ .,:;<>"\'\0\n]'
  93 _content_char = br'[^\0\n<>]'
  94 _safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
  95     % (_start_end_char,
  96        _start_end_char, _content_char, _start_end_char)
  97 _tz_rx = br'[-+]\d\d[0-5]\d'
  98 _parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
  99 # Assumes every following line starting with a space is part of the
 100 # mergetag.  Is there a formal commit blob spec?
 101 _mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
 102 _commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
 103 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
 104 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
 105
 106 (?P<message>(?:.|\n)*)''' % (_parent_rx,
 107                              _safe_str_rx, _safe_str_rx, _tz_rx,
 108                              _safe_str_rx, _safe_str_rx, _tz_rx,
 109                              _mergetag_rx))
 110 _parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
 111
 112 # Note that the author_sec and committer_sec values are (UTC) epoch
 113 # seconds, and for now the mergetag is not included.
 114 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
 115                                        'author_name', 'author_mail',
 116                                        'author_sec', 'author_offset',
 117                                        'committer_name', 'committer_mail',
 118                                        'committer_sec', 'committer_offset',
 119                                        'message'])
 120
 121 def parse_commit(content):
 122     commit_match = re.match(_commit_rx, content)
 123     if not commit_match:
 124         raise Exception('cannot parse commit %r' % content)
 125     matches = commit_match.groupdict()
 126     return CommitInfo(tree=matches['tree'],
 127                       parents=re.findall(_parent_hash_rx, matches['parents']),
 128                       author_name=matches['author_name'],
 129                       author_mail=matches['author_mail'],
 130                       author_sec=int(matches['asec']),
 131                       author_offset=parse_tz_offset(matches['atz']),
 132                       committer_name=matches['committer_name'],
 133                       committer_mail=matches['committer_mail'],
 134                       committer_sec=int(matches['csec']),
 135                       committer_offset=parse_tz_offset(matches['ctz']),
 136                       message=matches['message'])
 137
 138
 139 def get_cat_data(cat_iterator, expected_type):
 140     _, kind, _ = next(cat_iterator)
 141     if kind != expected_type:
 142         raise Exception('expected %r, saw %r' % (expected_type, kind))
 143     return b''.join(cat_iterator)
 144
 145 def get_commit_items(id, cp):
 146     return parse_commit(get_cat_data(cp.get(id), b'commit'))
 147
 148 def _local_git_date_str(epoch_sec):
 149     return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 150
 151
 152 def _git_date_str(epoch_sec, tz_offset_sec):
 153     offs =  tz_offset_sec // 60
 154     return b'%d %s%02d%02d' \
 155         % (epoch_sec,
 156            b'+' if offs >= 0 else b'-',
 157            abs(offs) // 60,
 158            abs(offs) % 60)
 159
 160
 161 def repo(sub = b'', repo_dir=None):
 162     """Get the path to the git repository or one of its subdirectories."""
 163     repo_dir = repo_dir or repodir
 164     if not repo_dir:
 165         raise GitError('You should call check_repo_or_die()')
 166
 167     # If there's a .git subdirectory, then the actual repo is in there.
 168     gd = os.path.join(repo_dir, b'.git')
 169     if os.path.exists(gd):
 170         repo_dir = gd
 171
 172     return os.path.join(repo_dir, sub)
 173
 174
 175 _shorten_hash_rx = \
 176     re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
 177
 178 def shorten_hash(s):
 179     return _shorten_hash_rx.sub(br'\1\2*\3', s)
 180
 181
 182 def repo_rel(path):
 183     full = os.path.abspath(path)
 184     fullrepo = os.path.abspath(repo(b''))
 185     if not fullrepo.endswith(b'/'):
 186         fullrepo += b'/'
 187     if full.startswith(fullrepo):
 188         path = full[len(fullrepo):]
 189     if path.startswith(b'index-cache/'):
 190         path = path[len(b'index-cache/'):]
 191     return shorten_hash(path)
 192
 193
 194 def all_packdirs():
 195     paths = [repo(b'objects/pack')]
 196     paths += glob.glob(repo(b'index-cache/*/.'))
 197     return paths
 198
 199
 200 def auto_midx(objdir):
 201     args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
 202     try:
 203         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 204     except OSError as e:
 205         # make sure 'args' gets printed to help with debugging
 206         add_error('%r: exception: %s' % (args, e))
 207         raise
 208     if rv:
 209         add_error('%r: returned %d' % (args, rv))
 210
 211     args = [path.exe(), b'bloom', b'--dir', objdir]
 212     try:
 213         rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
 214     except OSError as e:
 215         # make sure 'args' gets printed to help with debugging
 216         add_error('%r: exception: %s' % (args, e))
 217         raise
 218     if rv:
 219         add_error('%r: returned %d' % (args, rv))
 220
 221
 222 def mangle_name(name, mode, gitmode):
 223     """Mangle a file name to present an abstract name for segmented files.
 224     Mangled file names will have the ".bup" extension added to them. If a
 225     file's name already ends with ".bup", a ".bupl" extension is added to
 226     disambiguate normal files from segmented ones.
 227     """
 228     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 229         assert(stat.S_ISDIR(gitmode))
 230         return name + b'.bup'
 231     elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
 232         return name + b'.bupl'
 233     else:
 234         return name
 235
 236
 237 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 238 def demangle_name(name, mode):
 239     """Remove name mangling from a file name, if necessary.
 240
 241     The return value is a tuple (demangled_filename,mode), where mode is one of
 242     the following:
 243
 244     * BUP_NORMAL  : files that should be read as-is from the repository
 245     * BUP_CHUNKED : files that were chunked and need to be reassembled
 246
 247     For more information on the name mangling algorithm, see mangle_name()
 248     """
 249     if name.endswith(b'.bupl'):
 250         return (name[:-5], BUP_NORMAL)
 251     elif name.endswith(b'.bup'):
 252         return (name[:-4], BUP_CHUNKED)
 253     elif name.endswith(b'.bupm'):
 254         return (name[:-5],
 255                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 256     else:
 257         return (name, BUP_NORMAL)
 258
 259
 260 def calc_hash(type, content):
 261     """Calculate some content's hash in the Git fashion."""
 262     header = b'%s %d\0' % (type, len(content))
 263     sum = Sha1(header)
 264     sum.update(content)
 265     return sum.digest()
 266
 267
 268 def shalist_item_sort_key(ent):
 269     (mode, name, id) = ent
 270     assert(mode+0 == mode)
 271     if stat.S_ISDIR(mode):
 272         return name + b'/'
 273     else:
 274         return name
 275
 276
 277 def tree_encode(shalist):
 278     """Generate a git tree object from (mode,name,hash) tuples."""
 279     shalist = sorted(shalist, key = shalist_item_sort_key)
 280     l = []
 281     for (mode,name,bin) in shalist:
 282         assert(mode)
 283         assert(mode+0 == mode)
 284         assert(name)
 285         assert(len(bin) == 20)
 286         s = b'%o %s\0%s' % (mode,name,bin)
 287         assert s[0] != b'0'  # 0-padded octal is not acceptable in a git tree
 288         l.append(s)
 289     return b''.join(l)
 290
 291
 292 def tree_decode(buf):
 293     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 294     ofs = 0
 295     while ofs < len(buf):
 296         z = buf.find(b'\0', ofs)
 297         assert(z > ofs)
 298         spl = buf[ofs:z].split(b' ', 1)
 299         assert(len(spl) == 2)
 300         mode,name = spl
 301         sha = buf[z+1:z+1+20]
 302         ofs = z+1+20
 303         yield (int(mode, 8), name, sha)
 304
 305
 306 def _encode_packobj(type, content, compression_level=1):
 307     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 308         raise ValueError('invalid compression level %s' % compression_level)
 309     szout = b''
 310     sz = len(content)
 311     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 312     sz >>= 4
 313     while 1:
 314         if sz: szbits |= 0x80
 315         szout += bytes_from_uint(szbits)
 316         if not sz:
 317             break
 318         szbits = sz & 0x7f
 319         sz >>= 7
 320     z = zlib.compressobj(compression_level)
 321     yield szout
 322     yield z.compress(content)
 323     yield z.flush()
 324
 325
 326 def _encode_looseobj(type, content, compression_level=1):
 327     z = zlib.compressobj(compression_level)
 328     yield z.compress(b'%s %d\0' % (type, len(content)))
 329     yield z.compress(content)
 330     yield z.flush()
 331
 332
 333 def _decode_looseobj(buf):
 334     assert(buf);
 335     s = zlib.decompress(buf)
 336     i = s.find(b'\0')
 337     assert(i > 0)
 338     l = s[:i].split(b' ')
 339     type = l[0]
 340     sz = int(l[1])
 341     content = s[i+1:]
 342     assert(type in _typemap)
 343     assert(sz == len(content))
 344     return (type, content)
 345
 346
 347 def _decode_packobj(buf):
 348     assert(buf)
 349     c = byte_int(buf[0])
 350     type = _typermap[(c & 0x70) >> 4]
 351     sz = c & 0x0f
 352     shift = 4
 353     i = 0
 354     while c & 0x80:
 355         i += 1
 356         c = byte_int(buf[i])
 357         sz |= (c & 0x7f) << shift
 358         shift += 7
 359         if not (c & 0x80):
 360             break
 361     return (type, zlib.decompress(buf[i+1:]))
 362
 363
 364 class PackIdx:
 365     def __init__(self):
 366         assert(0)
 367
 368     def find_offset(self, hash):
 369         """Get the offset of an object inside the index file."""
 370         idx = self._idx_from_hash(hash)
 371         if idx != None:
 372             return self._ofs_from_idx(idx)
 373         return None
 374
 375     def exists(self, hash, want_source=False):
 376         """Return nonempty if the object exists in this index."""
 377         if hash and (self._idx_from_hash(hash) != None):
 378             return want_source and os.path.basename(self.name) or True
 379         return None
 380
 381     def _idx_from_hash(self, hash):
 382         global _total_searches, _total_steps
 383         _total_searches += 1
 384         assert(len(hash) == 20)
 385         b1 = byte_int(hash[0])
 386         start = self.fanout[b1-1] # range -1..254
 387         end = self.fanout[b1] # range 0..255
 388         want = hash
 389         _total_steps += 1  # lookup table is a step
 390         while start < end:
 391             _total_steps += 1
 392             mid = start + (end - start) // 2
 393             v = self._idx_to_hash(mid)
 394             if v < want:
 395                 start = mid+1
 396             elif v > want:
 397                 end = mid
 398             else: # got it!
 399                 return mid
 400         return None
 401
 402
 403 class PackIdxV1(PackIdx):
 404     """Object representation of a Git pack index (version 1) file."""
 405     def __init__(self, filename, f):
 406         self.name = filename
 407         self.idxnames = [self.name]
 408         self.map = mmap_read(f)
 409         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 410         self.fanout = array('L', struct.unpack('!256I', self.map))
 411         self.fanout.append(0)  # entry "-1"
 412         self.nsha = self.fanout[255]
 413         self.sha_ofs = 256 * 4
 414         # Avoid slicing shatable for individual hashes (very high overhead)
 415         self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
 416
 417     def __enter__(self):
 418         return self
 419
 420     def __exit__(self, type, value, traceback):
 421         self.close()
 422
 423     def __len__(self):
 424         return int(self.nsha)  # int() from long for python 2
 425
 426     def _ofs_from_idx(self, idx):
 427         if idx >= self.nsha or idx < 0:
 428             raise IndexError('invalid pack index index %d' % idx)
 429         ofs = self.sha_ofs + idx * 24
 430         return struct.unpack_from('!I', self.map, offset=ofs)[0]
 431
 432     def _idx_to_hash(self, idx):
 433         if idx >= self.nsha or idx < 0:
 434             raise IndexError('invalid pack index index %d' % idx)
 435         ofs = self.sha_ofs + idx * 24 + 4
 436         return self.map[ofs : ofs + 20]
 437
 438     def __iter__(self):
 439         start = self.sha_ofs + 4
 440         for ofs in range(start, start + 24 * self.nsha, 24):
 441             yield self.map[ofs : ofs + 20]
 442
 443     def close(self):
 444         if self.map is not None:
 445             self.shatable = None
 446             self.map.close()
 447             self.map = None
 448
 449
 450 class PackIdxV2(PackIdx):
 451     """Object representation of a Git pack index (version 2) file."""
 452     def __init__(self, filename, f):
 453         self.name = filename
 454         self.idxnames = [self.name]
 455         self.map = mmap_read(f)
 456         assert self.map[0:8] == b'\377tOc\0\0\0\2'
 457         # Min size for 'L' is 4, which is sufficient for struct's '!I'
 458         self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
 459         self.fanout.append(0)
 460         self.nsha = self.fanout[255]
 461         self.sha_ofs = 8 + 256*4
 462         self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
 463         self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
 464         # Avoid slicing this for individual hashes (very high overhead)
 465         self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
 466
 467     def __enter__(self):
 468         return self
 469
 470     def __exit__(self, type, value, traceback):
 471         self.close()
 472
 473     def __len__(self):
 474         return int(self.nsha)  # int() from long for python 2
 475
 476     def _ofs_from_idx(self, idx):
 477         if idx >= self.nsha or idx < 0:
 478             raise IndexError('invalid pack index index %d' % idx)
 479         ofs_ofs = self.ofstable_ofs + idx * 4
 480         ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
 481         if ofs & 0x80000000:
 482             idx64 = ofs & 0x7fffffff
 483             ofs64_ofs = self.ofs64table_ofs + idx64 * 8
 484             ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
 485         return ofs
 486
 487     def _idx_to_hash(self, idx):
 488         if idx >= self.nsha or idx < 0:
 489             raise IndexError('invalid pack index index %d' % idx)
 490         ofs = self.sha_ofs + idx * 20
 491         return self.map[ofs : ofs + 20]
 492
 493     def __iter__(self):
 494         start = self.sha_ofs
 495         for ofs in range(start, start + 20 * self.nsha, 20):
 496             yield self.map[ofs : ofs + 20]
 497
 498     def close(self):
 499         if self.map is not None:
 500             self.shatable = None
 501             self.map.close()
 502             self.map = None
 503
 504
 505 _mpi_count = 0
 506 class PackIdxList:
 507     def __init__(self, dir, ignore_midx=False):
 508         global _mpi_count
 509         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 510         _mpi_count += 1
 511         self.dir = dir
 512         self.also = set()
 513         self.packs = []
 514         self.do_bloom = False
 515         self.bloom = None
 516         self.ignore_midx = ignore_midx
 517         self.refresh()
 518
 519     def __del__(self):
 520         global _mpi_count
 521         _mpi_count -= 1
 522         assert(_mpi_count == 0)
 523
 524     def __iter__(self):
 525         return iter(idxmerge(self.packs))
 526
 527     def __len__(self):
 528         return sum(len(pack) for pack in self.packs)
 529
 530     def exists(self, hash, want_source=False):
 531         """Return nonempty if the object exists in the index files."""
 532         global _total_searches
 533         _total_searches += 1
 534         if hash in self.also:
 535             return True
 536         if self.do_bloom and self.bloom:
 537             if self.bloom.exists(hash):
 538                 self.do_bloom = False
 539             else:
 540                 _total_searches -= 1  # was counted by bloom
 541                 return None
 542         for i in range(len(self.packs)):
 543             p = self.packs[i]
 544             _total_searches -= 1  # will be incremented by sub-pack
 545             ix = p.exists(hash, want_source=want_source)
 546             if ix:
 547                 # reorder so most recently used packs are searched first
 548                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 549                 return ix
 550         self.do_bloom = True
 551         return None
 552
 553     def refresh(self, skip_midx = False):
 554         """Refresh the index list.
 555         This method verifies if .midx files were superseded (e.g. all of its
 556         contents are in another, bigger .midx file) and removes the superseded
 557         files.
 558
 559         If skip_midx is True, all work on .midx files will be skipped and .midx
 560         files will be removed from the list.
 561
 562         The instance variable 'ignore_midx' can force this function to
 563         always act as if skip_midx was True.
 564         """
 565         if self.bloom is not None:
 566             self.bloom.close()
 567         self.bloom = None # Always reopen the bloom as it may have been relaced
 568         self.do_bloom = False
 569         skip_midx = skip_midx or self.ignore_midx
 570         d = dict((p.name, p) for p in self.packs
 571                  if not skip_midx or not isinstance(p, midx.PackMidx))
 572         if os.path.exists(self.dir):
 573             if not skip_midx:
 574                 midxl = []
 575                 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
 576                 # remove any *.midx files from our list that no longer exist
 577                 for ix in list(d.values()):
 578                     if not isinstance(ix, midx.PackMidx):
 579                         continue
 580                     if ix.name in midxes:
 581                         continue
 582                     # remove the midx
 583                     del d[ix.name]
 584                     ix.close()
 585                     self.packs.remove(ix)
 586                 for ix in self.packs:
 587                     if isinstance(ix, midx.PackMidx):
 588                         for name in ix.idxnames:
 589                             d[os.path.join(self.dir, name)] = ix
 590                 for full in midxes:
 591                     if not d.get(full):
 592                         mx = midx.PackMidx(full)
 593                         (mxd, mxf) = os.path.split(mx.name)
 594                         broken = False
 595                         for n in mx.idxnames:
 596                             if not os.path.exists(os.path.join(mxd, n)):
 597                                 log(('warning: index %s missing\n'
 598                                      '  used by %s\n')
 599                                     % (path_msg(n), path_msg(mxf)))
 600                                 broken = True
 601                         if broken:
 602                             mx.close()
 603                             del mx
 604                             unlink(full)
 605                         else:
 606                             midxl.append(mx)
 607                 midxl.sort(key=lambda ix:
 608                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 609                 for ix in midxl:
 610                     any_needed = False
 611                     for sub in ix.idxnames:
 612                         found = d.get(os.path.join(self.dir, sub))
 613                         if not found or isinstance(found, PackIdx):
 614                             # doesn't exist, or exists but not in a midx
 615                             any_needed = True
 616                             break
 617                     if any_needed:
 618                         d[ix.name] = ix
 619                         for name in ix.idxnames:
 620                             d[os.path.join(self.dir, name)] = ix
 621                     elif not ix.force_keep:
 622                         debug1('midx: removing redundant: %s\n'
 623                                % path_msg(os.path.basename(ix.name)))
 624                         ix.close()
 625                         unlink(ix.name)
 626             for full in glob.glob(os.path.join(self.dir, b'*.idx')):
 627                 if not d.get(full):
 628                     try:
 629                         ix = open_idx(full)
 630                     except GitError as e:
 631                         add_error(e)
 632                         continue
 633                     d[full] = ix
 634             bfull = os.path.join(self.dir, b'bup.bloom')
 635             if self.bloom is None and os.path.exists(bfull):
 636                 self.bloom = bloom.ShaBloom(bfull)
 637             self.packs = list(set(d.values()))
 638             self.packs.sort(reverse=True, key=lambda x: len(x))
 639             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 640                 self.do_bloom = True
 641             else:
 642                 self.bloom = None
 643         debug1('PackIdxList: using %d index%s.\n'
 644             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 645
 646     def add(self, hash):
 647         """Insert an additional object in the list."""
 648         self.also.add(hash)
 649
 650
 651 def open_idx(filename):
 652     if filename.endswith(b'.idx'):
 653         f = open(filename, 'rb')
 654         header = f.read(8)
 655         if header[0:4] == b'\377tOc':
 656             version = struct.unpack('!I', header[4:8])[0]
 657             if version == 2:
 658                 return PackIdxV2(filename, f)
 659             else:
 660                 raise GitError('%s: expected idx file version 2, got %d'
 661                                % (path_msg(filename), version))
 662         elif len(header) == 8 and header[0:4] < b'\377tOc':
 663             return PackIdxV1(filename, f)
 664         else:
 665             raise GitError('%s: unrecognized idx file header'
 666                            % path_msg(filename))
 667     elif filename.endswith(b'.midx'):
 668         return midx.PackMidx(filename)
 669     else:
 670         raise GitError('idx filenames must end with .idx or .midx')
 671
 672
 673 def idxmerge(idxlist, final_progress=True):
 674     """Generate a list of all the objects reachable in a PackIdxList."""
 675     def pfunc(count, total):
 676         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 677                   % (count*100.0/total, count, total))
 678     def pfinal(count, total):
 679         if final_progress:
 680             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 681                      % (100, total, total))
 682     return merge_iter(idxlist, 10024, pfunc, pfinal)
 683
 684
 685 def _make_objcache():
 686     return PackIdxList(repo(b'objects/pack'))
 687
 688 # bup-gc assumes that it can disable all PackWriter activities
 689 # (bloom/midx/cache) via the constructor and close() arguments.
 690
 691 class PackWriter:
 692     """Writes Git objects inside a pack file."""
 693     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 694                  run_midx=True, on_pack_finish=None,
 695                  max_pack_size=None, max_pack_objects=None, repo_dir=None):
 696         self.repo_dir = repo_dir or repo()
 697         self.file = None
 698         self.parentfd = None
 699         self.count = 0
 700         self.outbytes = 0
 701         self.filename = None
 702         self.idx = None
 703         self.objcache_maker = objcache_maker
 704         self.objcache = None
 705         self.compression_level = compression_level
 706         self.run_midx=run_midx
 707         self.on_pack_finish = on_pack_finish
 708         if not max_pack_size:
 709             max_pack_size = git_config_get(b'pack.packSizeLimit',
 710                                            repo_dir=self.repo_dir)
 711             if max_pack_size is not None:
 712                 max_pack_size = parse_num(max_pack_size)
 713             if not max_pack_size:
 714                 # larger packs slow down pruning
 715                 max_pack_size = 1000 * 1000 * 1000
 716         self.max_pack_size = max_pack_size
 717         # cache memory usage is about 83 bytes per object
 718         self.max_pack_objects = max_pack_objects if max_pack_objects \
 719                                 else max(1, self.max_pack_size // 5000)
 720
 721     def __del__(self):
 722         self.close()
 723
 724     def __enter__(self):
 725         return self
 726
 727     def __exit__(self, type, value, traceback):
 728         self.close()
 729
 730     def _open(self):
 731         if not self.file:
 732             objdir = dir = os.path.join(self.repo_dir, b'objects')
 733             fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
 734             try:
 735                 self.file = os.fdopen(fd, 'w+b')
 736             except:
 737                 os.close(fd)
 738                 raise
 739             try:
 740                 self.parentfd = os.open(objdir, os.O_RDONLY)
 741             except:
 742                 f = self.file
 743                 self.file = None
 744                 f.close()
 745                 raise
 746             assert name.endswith(b'.pack')
 747             self.filename = name[:-5]
 748             self.file.write(b'PACK\0\0\0\2\0\0\0\0')
 749             self.idx = PackIdxV2Writer()
 750
 751     def _raw_write(self, datalist, sha):
 752         self._open()
 753         f = self.file
 754         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 755         # the file never has a *partial* blob.  So let's make sure it's
 756         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 757         # to our hashsplit algorithm.)  f.write() does its own buffering,
 758         # but that's okay because we'll flush it in _end().
 759         oneblob = b''.join(datalist)
 760         try:
 761             f.write(oneblob)
 762         except IOError as e:
 763             reraise(GitError(e))
 764         nw = len(oneblob)
 765         crc = zlib.crc32(oneblob) & 0xffffffff
 766         self._update_idx(sha, crc, nw)
 767         self.outbytes += nw
 768         self.count += 1
 769         return nw, crc
 770
 771     def _update_idx(self, sha, crc, size):
 772         assert(sha)
 773         if self.idx:
 774             self.idx.add(sha, crc, self.file.tell() - size)
 775
 776     def _write(self, sha, type, content):
 777         if verbose:
 778             log('>')
 779         if not sha:
 780             sha = calc_hash(type, content)
 781         size, crc = self._raw_write(_encode_packobj(type, content,
 782                                                     self.compression_level),
 783                                     sha=sha)
 784         if self.outbytes >= self.max_pack_size \
 785            or self.count >= self.max_pack_objects:
 786             self.breakpoint()
 787         return sha
 788
 789     def breakpoint(self):
 790         """Clear byte and object counts and return the last processed id."""
 791         id = self._end(self.run_midx)
 792         self.outbytes = self.count = 0
 793         return id
 794
 795     def _require_objcache(self):
 796         if self.objcache is None and self.objcache_maker:
 797             self.objcache = self.objcache_maker()
 798         if self.objcache is None:
 799             raise GitError(
 800                     "PackWriter not opened or can't check exists w/o objcache")
 801
 802     def exists(self, id, want_source=False):
 803         """Return non-empty if an object is found in the object cache."""
 804         self._require_objcache()
 805         return self.objcache.exists(id, want_source=want_source)
 806
 807     def just_write(self, sha, type, content):
 808         """Write an object to the pack file without checking for duplication."""
 809         self._write(sha, type, content)
 810         # If nothing else, gc doesn't have/want an objcache
 811         if self.objcache is not None:
 812             self.objcache.add(sha)
 813
 814     def maybe_write(self, type, content):
 815         """Write an object to the pack file if not present and return its id."""
 816         sha = calc_hash(type, content)
 817         if not self.exists(sha):
 818             self._require_objcache()
 819             self.just_write(sha, type, content)
 820         return sha
 821
 822     def new_blob(self, blob):
 823         """Create a blob object in the pack with the supplied content."""
 824         return self.maybe_write(b'blob', blob)
 825
 826     def new_tree(self, shalist):
 827         """Create a tree object in the pack."""
 828         content = tree_encode(shalist)
 829         return self.maybe_write(b'tree', content)
 830
 831     def new_commit(self, tree, parent,
 832                    author, adate_sec, adate_tz,
 833                    committer, cdate_sec, cdate_tz,
 834                    msg):
 835         """Create a commit object in the pack.  The date_sec values must be
 836         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 837         if adate_tz is not None:
 838             adate_str = _git_date_str(adate_sec, adate_tz)
 839         else:
 840             adate_str = _local_git_date_str(adate_sec)
 841         if cdate_tz is not None:
 842             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 843         else:
 844             cdate_str = _local_git_date_str(cdate_sec)
 845         l = []
 846         if tree: l.append(b'tree %s' % hexlify(tree))
 847         if parent: l.append(b'parent %s' % hexlify(parent))
 848         if author: l.append(b'author %s %s' % (author, adate_str))
 849         if committer: l.append(b'committer %s %s' % (committer, cdate_str))
 850         l.append(b'')
 851         l.append(msg)
 852         return self.maybe_write(b'commit', b'\n'.join(l))
 853
 854     def abort(self):
 855         """Remove the pack file from disk."""
 856         f = self.file
 857         if f:
 858             pfd = self.parentfd
 859             self.file = None
 860             self.parentfd = None
 861             self.idx = None
 862             try:
 863                 try:
 864                     os.unlink(self.filename + b'.pack')
 865                 finally:
 866                     f.close()
 867             finally:
 868                 if pfd is not None:
 869                     os.close(pfd)
 870
 871     def _end(self, run_midx=True):
 872         f = self.file
 873         if not f: return None
 874         self.file = None
 875         try:
 876             self.objcache = None
 877             idx = self.idx
 878             self.idx = None
 879
 880             # update object count
 881             f.seek(8)
 882             cp = struct.pack('!i', self.count)
 883             assert(len(cp) == 4)
 884             f.write(cp)
 885
 886             # calculate the pack sha1sum
 887             f.seek(0)
 888             sum = Sha1()
 889             for b in chunkyreader(f):
 890                 sum.update(b)
 891             packbin = sum.digest()
 892             f.write(packbin)
 893             fdatasync(f.fileno())
 894         finally:
 895             f.close()
 896
 897         obj_list_sha = idx.write(self.filename + b'.idx', packbin)
 898         nameprefix = os.path.join(self.repo_dir,
 899                                   b'objects/pack/pack-' +  obj_list_sha)
 900         if os.path.exists(self.filename + b'.map'):
 901             os.unlink(self.filename + b'.map')
 902         os.rename(self.filename + b'.pack', nameprefix + b'.pack')
 903         os.rename(self.filename + b'.idx', nameprefix + b'.idx')
 904         try:
 905             os.fsync(self.parentfd)
 906         finally:
 907             os.close(self.parentfd)
 908
 909         if run_midx:
 910             auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
 911
 912         if self.on_pack_finish:
 913             self.on_pack_finish(nameprefix)
 914
 915         return nameprefix
 916
 917     def close(self, run_midx=True):
 918         """Close the pack file and move it to its definitive path."""
 919         return self._end(run_midx=run_midx)
 920
 921
 922 class PackIdxV2Writer:
 923     def __init__(self):
 924         self.idx = list(list() for i in range(256))
 925         self.count = 0
 926
 927     def add(self, sha, crc, offs):
 928         assert(sha)
 929         self.count += 1
 930         self.idx[byte_int(sha[0])].append((sha, crc, offs))
 931
 932     def write(self, filename, packbin):
 933         ofs64_count = 0
 934         for section in self.idx:
 935             for entry in section:
 936                 if entry[2] >= 2**31:
 937                     ofs64_count += 1
 938
 939         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 940         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 941         idx_map = None
 942         idx_f = open(filename, 'w+b')
 943         try:
 944             idx_f.truncate(index_len)
 945             fdatasync(idx_f.fileno())
 946             idx_map = mmap_readwrite(idx_f, close=False)
 947             try:
 948                 count = _helpers.write_idx(filename, idx_map, self.idx,
 949                                            self.count)
 950                 assert(count == self.count)
 951                 idx_map.flush()
 952             finally:
 953                 idx_map.close()
 954         finally:
 955             idx_f.close()
 956
 957         idx_f = open(filename, 'a+b')
 958         try:
 959             idx_f.write(packbin)
 960             idx_f.seek(0)
 961             idx_sum = Sha1()
 962             b = idx_f.read(8 + 4*256)
 963             idx_sum.update(b)
 964
 965             obj_list_sum = Sha1()
 966             for b in chunkyreader(idx_f, 20 * self.count):
 967                 idx_sum.update(b)
 968                 obj_list_sum.update(b)
 969             namebase = hexlify(obj_list_sum.digest())
 970
 971             for b in chunkyreader(idx_f):
 972                 idx_sum.update(b)
 973             idx_f.write(idx_sum.digest())
 974             fdatasync(idx_f.fileno())
 975             return namebase
 976         finally:
 977             idx_f.close()
 978
 979
 980 def list_refs(patterns=None, repo_dir=None,
 981               limit_to_heads=False, limit_to_tags=False):
 982     """Yield (refname, hash) tuples for all repository refs unless
 983     patterns are specified.  In that case, only include tuples for
 984     refs matching those patterns (cf. git-show-ref(1)).  The limits
 985     restrict the result items to refs/heads or refs/tags.  If both
 986     limits are specified, items from both sources will be included.
 987
 988     """
 989     argv = [b'git', b'show-ref']
 990     if limit_to_heads:
 991         argv.append(b'--heads')
 992     if limit_to_tags:
 993         argv.append(b'--tags')
 994     argv.append(b'--')
 995     if patterns:
 996         argv.extend(patterns)
 997     p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
 998     out = p.stdout.read().strip()
 999     rv = p.wait()  # not fatal
1000     if rv:
1001         assert(not out)
1002     if out:
1003         for d in out.split(b'\n'):
1004             sha, name = d.split(b' ', 1)
1005             yield name, unhexlify(sha)
1006
1007
1008 def read_ref(refname, repo_dir = None):
1009     """Get the commit id of the most recent commit made on a given ref."""
1010     refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
1011     l = tuple(islice(refs, 2))
1012     if l:
1013         assert(len(l) == 1)
1014         return l[0][1]
1015     else:
1016         return None
1017
1018
1019 def rev_list_invocation(ref_or_refs, format=None):
1020     if isinstance(ref_or_refs, bytes):
1021         refs = (ref_or_refs,)
1022     else:
1023         refs = ref_or_refs
1024     argv = [b'git', b'rev-list']
1025
1026     if format:
1027         argv.append(b'--pretty=format:' + format)
1028     for ref in refs:
1029         assert not ref.startswith(b'-')
1030         argv.append(ref)
1031     argv.append(b'--')
1032     return argv
1033
1034
1035 def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
1036     """Yield information about commits as per "git rev-list".  If a format
1037     is not provided, yield one hex hash at a time.  If a format is
1038     provided, pass it to rev-list and call parse(git_stdout) for each
1039     commit with the stream positioned just after the rev-list "commit
1040     HASH" header line.  When a format is provided yield (oidx,
1041     parse(git_stdout)) for each commit.
1042
1043     """
1044     assert bool(parse) == bool(format)
1045     p = subprocess.Popen(rev_list_invocation(ref_or_refs,
1046                                              format=format),
1047                          env=_gitenv(repo_dir),
1048                          stdout = subprocess.PIPE)
1049     if not format:
1050         for line in p.stdout:
1051             yield line.strip()
1052     else:
1053         line = p.stdout.readline()
1054         while line:
1055             s = line.strip()
1056             if not s.startswith(b'commit '):
1057                 raise Exception('unexpected line ' + repr(s))
1058             s = s[7:]
1059             assert len(s) == 40
1060             yield s, parse(p.stdout)
1061             line = p.stdout.readline()
1062
1063     rv = p.wait()  # not fatal
1064     if rv:
1065         raise GitError('git rev-list returned error %d' % rv)
1066
1067
1068 def get_commit_dates(refs, repo_dir=None):
1069     """Get the dates for the specified commit refs.  For now, every unique
1070        string in refs must resolve to a different commit or this
1071        function will fail."""
1072     result = []
1073     for ref in refs:
1074         commit = get_commit_items(ref, cp(repo_dir))
1075         result.append(commit.author_sec)
1076     return result
1077
1078
1079 def rev_parse(committish, repo_dir=None):
1080     """Resolve the full hash for 'committish', if it exists.
1081
1082     Should be roughly equivalent to 'git rev-parse'.
1083
1084     Returns the hex value of the hash if it is found, None if 'committish' does
1085     not correspond to anything.
1086     """
1087     head = read_ref(committish, repo_dir=repo_dir)
1088     if head:
1089         debug2("resolved from ref: commit = %s\n" % hexlify(head))
1090         return head
1091
1092     pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))
1093
1094     if len(committish) == 40:
1095         try:
1096             hash = unhexlify(committish)
1097         except TypeError:
1098             return None
1099
1100         if pL.exists(hash):
1101             return hash
1102
1103     return None
1104
1105
1106 def update_ref(refname, newval, oldval, repo_dir=None):
1107     """Update a repository reference."""
1108     if not oldval:
1109         oldval = b''
1110     assert refname.startswith(b'refs/heads/') \
1111         or refname.startswith(b'refs/tags/')
1112     p = subprocess.Popen([b'git', b'update-ref', refname,
1113                           hexlify(newval), hexlify(oldval)],
1114                          env=_gitenv(repo_dir))
1115     _git_wait(b'git update-ref', p)
1116
1117
1118 def delete_ref(refname, oldvalue=None):
1119     """Delete a repository reference (see git update-ref(1))."""
1120     assert refname.startswith(b'refs/')
1121     oldvalue = [] if not oldvalue else [oldvalue]
1122     p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
1123                          env=_gitenv())
1124     _git_wait('git update-ref', p)
1125
1126
1127 def guess_repo(path=None):
1128     """Set the path value in the global variable "repodir".
1129     This makes bup look for an existing bup repository, but not fail if a
1130     repository doesn't exist. Usually, if you are interacting with a bup
1131     repository, you would not be calling this function but using
1132     check_repo_or_die().
1133     """
1134     global repodir
1135     if path:
1136         repodir = path
1137     if not repodir:
1138         repodir = environ.get(b'BUP_DIR')
1139         if not repodir:
1140             repodir = os.path.expanduser(b'~/.bup')
1141
1142
1143 def init_repo(path=None):
1144     """Create the Git bare repository for bup in a given path."""
1145     guess_repo(path)
1146     d = repo()  # appends a / to the path
1147     parent = os.path.dirname(os.path.dirname(d))
1148     if parent and not os.path.exists(parent):
1149         raise GitError('parent directory "%s" does not exist\n'
1150                        % path_msg(parent))
1151     if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
1152         raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
1153     p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
1154                          env=_gitenv())
1155     _git_wait('git init', p)
1156     # Force the index version configuration in order to ensure bup works
1157     # regardless of the version of the installed Git binary.
1158     p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
1159                          stdout=sys.stderr, env=_gitenv())
1160     _git_wait('git config', p)
1161     # Enable the reflog
1162     p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
1163                          stdout=sys.stderr, env=_gitenv())
1164     _git_wait('git config', p)
1165
1166
1167 def check_repo_or_die(path=None):
1168     """Check to see if a bup repository probably exists, and abort if not."""
1169     guess_repo(path)
1170     top = repo()
1171     pst = stat_if_exists(top + b'/objects/pack')
1172     if pst and stat.S_ISDIR(pst.st_mode):
1173         return
1174     if not pst:
1175         top_st = stat_if_exists(top)
1176         if not top_st:
1177             log('error: repository %r does not exist (see "bup help init")\n'
1178                 % top)
1179             sys.exit(15)
1180     log('error: %s is not a repository\n' % path_msg(top))
1181     sys.exit(14)
1182
1183
1184 def is_suitable_git(ver_str):
1185     if not ver_str.startswith(b'git version '):
1186         return 'unrecognized'
1187     ver_str = ver_str[len(b'git version '):]
1188     if ver_str.startswith(b'0.'):
1189         return 'insufficient'
1190     if ver_str.startswith(b'1.'):
1191         if re.match(br'1\.[012345]rc', ver_str):
1192             return 'insufficient'
1193         if re.match(br'1\.[01234]\.', ver_str):
1194             return 'insufficient'
1195         if re.match(br'1\.5\.[012345]($|\.)', ver_str):
1196             return 'insufficient'
1197         if re.match(br'1\.5\.6-rc', ver_str):
1198             return 'insufficient'
1199         return 'suitable'
1200     if re.match(br'[0-9]+(\.|$)?', ver_str):
1201         return 'suitable'
1202     sys.exit(13)
1203
1204 _git_great = None
1205
1206 def require_suitable_git(ver_str=None):
1207     """Raise GitError if the version of git isn't suitable.
1208
1209     Rely on ver_str when provided, rather than invoking the git in the
1210     path.
1211
1212     """
1213     global _git_great
1214     if _git_great is not None:
1215         return
1216     if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
1217        in (b'yes', b'true', b'1'):
1218         _git_great = True
1219         return
1220     if not ver_str:
1221         ver_str, _, _ = _git_exo([b'git', b'--version'])
1222     status = is_suitable_git(ver_str)
1223     if status == 'unrecognized':
1224         raise GitError('Unexpected git --version output: %r' % ver_str)
1225     if status == 'insufficient':
1226         log('error: git version must be at least 1.5.6\n')
1227         sys.exit(1)
1228     if status == 'suitable':
1229         _git_great = True
1230         return
1231     assert False
1232
1233
1234 class _AbortableIter:
1235     def __init__(self, it, onabort = None):
1236         self.it = it
1237         self.onabort = onabort
1238         self.done = None
1239
1240     def __iter__(self):
1241         return self
1242
1243     def __next__(self):
1244         try:
1245             return next(self.it)
1246         except StopIteration as e:
1247             self.done = True
1248             raise
1249         except:
1250             self.abort()
1251             raise
1252
1253     next = __next__
1254
1255     def abort(self):
1256         """Abort iteration and call the abortion callback, if needed."""
1257         if not self.done:
1258             self.done = True
1259             if self.onabort:
1260                 self.onabort()
1261
1262     def __del__(self):
1263         self.abort()
1264
1265
1266 class CatPipe:
1267     """Link to 'git cat-file' that is used to retrieve blob data."""
1268     def __init__(self, repo_dir = None):
1269         require_suitable_git()
1270         self.repo_dir = repo_dir
1271         self.p = self.inprogress = None
1272
1273     def _abort(self):
1274         if self.p:
1275             self.p.stdout.close()
1276             self.p.stdin.close()
1277         self.p = None
1278         self.inprogress = None
1279
1280     def restart(self):
1281         self._abort()
1282         self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1283                                   stdin=subprocess.PIPE,
1284                                   stdout=subprocess.PIPE,
1285                                   close_fds = True,
1286                                   bufsize = 4096,
1287                                   env=_gitenv(self.repo_dir))
1288
1289     def get(self, ref):
1290         """Yield (oidx, type, size), followed by the data referred to by ref.
1291         If ref does not exist, only yield (None, None, None).
1292
1293         """
1294         if not self.p or self.p.poll() != None:
1295             self.restart()
1296         assert(self.p)
1297         poll_result = self.p.poll()
1298         assert(poll_result == None)
1299         if self.inprogress:
1300             log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1301         assert(not self.inprogress)
1302         assert ref.find(b'\n') < 0
1303         assert ref.find(b'\r') < 0
1304         assert not ref.startswith(b'-')
1305         self.inprogress = ref
1306         self.p.stdin.write(ref + b'\n')
1307         self.p.stdin.flush()
1308         hdr = self.p.stdout.readline()
1309         if hdr.endswith(b' missing\n'):
1310             self.inprogress = None
1311             yield None, None, None
1312             return
1313         info = hdr.split(b' ')
1314         if len(info) != 3 or len(info[0]) != 40:
1315             raise GitError('expected object (id, type, size), got %r' % info)
1316         oidx, typ, size = info
1317         size = int(size)
1318         it = _AbortableIter(chunkyreader(self.p.stdout, size),
1319                             onabort=self._abort)
1320         try:
1321             yield oidx, typ, size
1322             for blob in it:
1323                 yield blob
1324             readline_result = self.p.stdout.readline()
1325             assert readline_result == b'\n'
1326             self.inprogress = None
1327         except Exception as e:
1328             it.abort()
1329             raise
1330
1331     def _join(self, it):
1332         _, typ, _ = next(it)
1333         if typ == b'blob':
1334             for blob in it:
1335                 yield blob
1336         elif typ == b'tree':
1337             treefile = b''.join(it)
1338             for (mode, name, sha) in tree_decode(treefile):
1339                 for blob in self.join(hexlify(sha)):
1340                     yield blob
1341         elif typ == b'commit':
1342             treeline = b''.join(it).split(b'\n')[0]
1343             assert treeline.startswith(b'tree ')
1344             for blob in self.join(treeline[5:]):
1345                 yield blob
1346         else:
1347             raise GitError('invalid object type %r: expected blob/tree/commit'
1348                            % typ)
1349
1350     def join(self, id):
1351         """Generate a list of the content of all blobs that can be reached
1352         from an object.  The hash given in 'id' must point to a blob, a tree
1353         or a commit. The content of all blobs that can be seen from trees or
1354         commits will be added to the list.
1355         """
1356         for d in self._join(self.get(id)):
1357             yield d
1358
1359
1360 _cp = {}
1361
1362 def cp(repo_dir=None):
1363     """Create a CatPipe object or reuse the already existing one."""
1364     global _cp, repodir
1365     if not repo_dir:
1366         repo_dir = repodir or repo()
1367     repo_dir = os.path.abspath(repo_dir)
1368     cp = _cp.get(repo_dir)
1369     if not cp:
1370         cp = CatPipe(repo_dir)
1371         _cp[repo_dir] = cp
1372     return cp
1373
1374
1375 def tags(repo_dir = None):
1376     """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1377     tags = {}
1378     for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
1379         assert n.startswith(b'refs/tags/')
1380         name = n[10:]
1381         if not c in tags:
1382             tags[c] = []
1383         tags[c].append(name)  # more than one tag can point at 'c'
1384     return tags
1385
1386
1387 class MissingObject(KeyError):
1388     def __init__(self, oid):
1389         self.oid = oid
1390         KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
1391
1392
1393 WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
1394                                    'path', 'chunk_path', 'data'])
1395 # The path is the mangled path, and if an item represents a fragment
1396 # of a chunked file, the chunk_path will be the chunked subtree path
1397 # for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
1398 # chunked file will have a chunk_path of [''].  So some chunk subtree
1399 # of the file '/foo/bar/baz' might look like this:
1400 #
1401 #   item.path = ['foo', 'bar', 'baz.bup']
1402 #   item.chunk_path = ['', '2d3115e', '016b097']
1403 #   item.type = 'tree'
1404 #   ...
1405
1406
1407 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1408     """Yield everything reachable from oidx via get_ref (which must behave
1409     like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1410     returns true.  Throw MissingObject if a hash encountered is
1411     missing from the repository, and don't read or return blob content
1412     in the data field unless include_data is set.
1413
1414     """
1415     # Maintain the pending stack on the heap to avoid stack overflow
1416     pending = [(oidx, [], [], None)]
1417     while len(pending):
1418         oidx, parent_path, chunk_path, mode = pending.pop()
1419         oid = unhexlify(oidx)
1420         if stop_at and stop_at(oidx):
1421             continue
1422
1423         if (not include_data) and mode and stat.S_ISREG(mode):
1424             # If the object is a "regular file", then it's a leaf in
1425             # the graph, so we can skip reading the data if the caller
1426             # hasn't requested it.
1427             yield WalkItem(oid=oid, type=b'blob',
1428                            chunk_path=chunk_path, path=parent_path,
1429                            mode=mode,
1430                            data=None)
1431             continue
1432
1433         item_it = get_ref(oidx)
1434         get_oidx, typ, _ = next(item_it)
1435         if not get_oidx:
1436             raise MissingObject(unhexlify(oidx))
1437         if typ not in (b'blob', b'commit', b'tree'):
1438             raise Exception('unexpected repository object type %r' % typ)
1439
1440         # FIXME: set the mode based on the type when the mode is None
1441         if typ == b'blob' and not include_data:
1442             # Dump data until we can ask cat_pipe not to fetch it
1443             for ignored in item_it:
1444                 pass
1445             data = None
1446         else:
1447             data = b''.join(item_it)
1448
1449         yield WalkItem(oid=oid, type=typ,
1450                        chunk_path=chunk_path, path=parent_path,
1451                        mode=mode,
1452                        data=(data if include_data else None))
1453
1454         if typ == b'commit':
1455             commit_items = parse_commit(data)
1456             for pid in commit_items.parents:
1457                 pending.append((pid, parent_path, chunk_path, mode))
1458             pending.append((commit_items.tree, parent_path, chunk_path,
1459                             hashsplit.GIT_MODE_TREE))
1460         elif typ == b'tree':
1461             for mode, name, ent_id in tree_decode(data):
1462                 demangled, bup_type = demangle_name(name, mode)
1463                 if chunk_path:
1464                     sub_path = parent_path
1465                     sub_chunk_path = chunk_path + [name]
1466                 else:
1467                     sub_path = parent_path + [name]
1468                     if bup_type == BUP_CHUNKED:
1469                         sub_chunk_path = [b'']
1470                     else:
1471                         sub_chunk_path = chunk_path
1472                 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
1473                                 mode))