lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   7 from collections import namedtuple
   8 from itertools import islice
   9
  10 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  11 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  12                          fdatasync,
  13                          hostname, localtime, log, merge_iter,
  14                          mmap_read, mmap_readwrite,
  15                          progress, qprogress, unlink, username, userfullname,
  16                          utc_offset_str)
  17
  18
# Thresholds checked by PackWriter._write(): once either is reached the
# current pack is finished and a new one is started on the next write.
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object

# When true, PackWriter._write() logs a '>' for every object written.
verbose = 0
# When true, PackIdxList.refresh() always behaves as if skip_midx were True.
ignore_midx = 0
# Default repository path used by repo() when no repo_dir argument is given;
# repo() raises GitError while this is still unset.
repodir = None

# Object type name <-> numeric type code stored in a pack object header.
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Statistics counters updated by the index lookup code
# (PackIdx._idx_from_hash and PackIdxList.exists).
_total_searches = 0
_total_steps = 0
  31
  32
  33 class GitError(Exception):
  34     pass
  35
  36
  37 def parse_tz_offset(s):
  38     """UTC offset in seconds."""
  39     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  40     if s[0] == '-':
  41         return - tz_off
  42     return tz_off
  43
  44
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Character classes for the name and mail fields of author/committer lines:
# the first and last character are more restricted than the interior ones.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
_content_char = r'[^\0\n<>]'
# Either one or two "edge" characters, or an edge character followed by any
# number of interior characters and a closing edge character.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone: sign, two hour digits, two minute digits (first minute digit 0-5).
_tz_rx = r'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Whole commit object: tree line, zero or more parent lines, author line,
# committer line, an empty line, then the message (everything that remains).
# The named groups are consumed by parse_commit().
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Extracts the individual hashes from the 'parents' group matched above.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  62
  63
# Parsed representation of a commit object, as produced by parse_commit().
# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
# The *_offset fields are the timezone offsets in seconds, as returned by
# parse_tz_offset(); 'tree' and the entries of 'parents' are the 40-character
# hex strings captured from the commit text.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
  71
  72 def parse_commit(content):
  73     commit_match = re.match(_commit_rx, content)
  74     if not commit_match:
  75         raise Exception('cannot parse commit %r' % content)
  76     matches = commit_match.groupdict()
  77     return CommitInfo(tree=matches['tree'],
  78                       parents=re.findall(_parent_hash_rx, matches['parents']),
  79                       author_name=matches['author_name'],
  80                       author_mail=matches['author_mail'],
  81                       author_sec=int(matches['asec']),
  82                       author_offset=parse_tz_offset(matches['atz']),
  83                       committer_name=matches['committer_name'],
  84                       committer_mail=matches['committer_mail'],
  85                       committer_sec=int(matches['csec']),
  86                       committer_offset=parse_tz_offset(matches['ctz']),
  87                       message=matches['message'])
  88
  89
  90 def get_commit_items(id, cp):
  91     commit_it = cp.get(id)
  92     assert(commit_it.next() == 'commit')
  93     commit_content = ''.join(commit_it)
  94     return parse_commit(commit_content)
  95
  96
  97 def _local_git_date_str(epoch_sec):
  98     return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
  99
 100
 101 def _git_date_str(epoch_sec, tz_offset_sec):
 102     offs =  tz_offset_sec // 60
 103     return '%d %s%02d%02d' \
 104         % (epoch_sec,
 105            '+' if offs >= 0 else '-',
 106            abs(offs) // 60,
 107            abs(offs) % 60)
 108
 109
 110 def repo(sub = '', repo_dir=None):
 111     """Get the path to the git repository or one of its subdirectories."""
 112     global repodir
 113     repo_dir = repo_dir or repodir
 114     if not repo_dir:
 115         raise GitError('You should call check_repo_or_die()')
 116
 117     # If there's a .git subdirectory, then the actual repo is in there.
 118     gd = os.path.join(repo_dir, '.git')
 119     if os.path.exists(gd):
 120         repodir = gd
 121
 122     return os.path.join(repo_dir, sub)
 123
 124
 125 def shorten_hash(s):
 126     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 127                   r'\1\2*\3', s)
 128
 129
 130 def repo_rel(path):
 131     full = os.path.abspath(path)
 132     fullrepo = os.path.abspath(repo(''))
 133     if not fullrepo.endswith('/'):
 134         fullrepo += '/'
 135     if full.startswith(fullrepo):
 136         path = full[len(fullrepo):]
 137     if path.startswith('index-cache/'):
 138         path = path[len('index-cache/'):]
 139     return shorten_hash(path)
 140
 141
 142 def all_packdirs():
 143     paths = [repo('objects/pack')]
 144     paths += glob.glob(repo('index-cache/*/.'))
 145     return paths
 146
 147
 148 def auto_midx(objdir):
 149     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 150     try:
 151         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 152     except OSError as e:
 153         # make sure 'args' gets printed to help with debugging
 154         add_error('%r: exception: %s' % (args, e))
 155         raise
 156     if rv:
 157         add_error('%r: returned %d' % (args, rv))
 158
 159     args = [path.exe(), 'bloom', '--dir', objdir]
 160     try:
 161         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 162     except OSError as e:
 163         # make sure 'args' gets printed to help with debugging
 164         add_error('%r: exception: %s' % (args, e))
 165         raise
 166     if rv:
 167         add_error('%r: returned %d' % (args, rv))
 168
 169
 170 def mangle_name(name, mode, gitmode):
 171     """Mangle a file name to present an abstract name for segmented files.
 172     Mangled file names will have the ".bup" extension added to them. If a
 173     file's name already ends with ".bup", a ".bupl" extension is added to
 174     disambiguate normal files from segmented ones.
 175     """
 176     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 177         assert(stat.S_ISDIR(gitmode))
 178         return name + '.bup'
 179     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 180         return name + '.bupl'
 181     else:
 182         return name
 183
 184
 185 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 186 def demangle_name(name, mode):
 187     """Remove name mangling from a file name, if necessary.
 188
 189     The return value is a tuple (demangled_filename,mode), where mode is one of
 190     the following:
 191
 192     * BUP_NORMAL  : files that should be read as-is from the repository
 193     * BUP_CHUNKED : files that were chunked and need to be reassembled
 194
 195     For more information on the name mangling algorithm, see mangle_name()
 196     """
 197     if name.endswith('.bupl'):
 198         return (name[:-5], BUP_NORMAL)
 199     elif name.endswith('.bup'):
 200         return (name[:-4], BUP_CHUNKED)
 201     elif name.endswith('.bupm'):
 202         return (name[:-5],
 203                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 204     else:
 205         return (name, BUP_NORMAL)
 206
 207
 208 def calc_hash(type, content):
 209     """Calculate some content's hash in the Git fashion."""
 210     header = '%s %d\0' % (type, len(content))
 211     sum = Sha1(header)
 212     sum.update(content)
 213     return sum.digest()
 214
 215
 216 def shalist_item_sort_key(ent):
 217     (mode, name, id) = ent
 218     assert(mode+0 == mode)
 219     if stat.S_ISDIR(mode):
 220         return name + '/'
 221     else:
 222         return name
 223
 224
 225 def tree_encode(shalist):
 226     """Generate a git tree object from (mode,name,hash) tuples."""
 227     shalist = sorted(shalist, key = shalist_item_sort_key)
 228     l = []
 229     for (mode,name,bin) in shalist:
 230         assert(mode)
 231         assert(mode+0 == mode)
 232         assert(name)
 233         assert(len(bin) == 20)
 234         s = '%o %s\0%s' % (mode,name,bin)
 235         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 236         l.append(s)
 237     return ''.join(l)
 238
 239
 240 def tree_decode(buf):
 241     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 242     ofs = 0
 243     while ofs < len(buf):
 244         z = buf.find('\0', ofs)
 245         assert(z > ofs)
 246         spl = buf[ofs:z].split(' ', 1)
 247         assert(len(spl) == 2)
 248         mode,name = spl
 249         sha = buf[z+1:z+1+20]
 250         ofs = z+1+20
 251         yield (int(mode, 8), name, sha)
 252
 253
 254 def _encode_packobj(type, content, compression_level=1):
 255     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 256         raise ValueError('invalid compression level %s' % compression_level)
 257     szout = ''
 258     sz = len(content)
 259     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 260     sz >>= 4
 261     while 1:
 262         if sz: szbits |= 0x80
 263         szout += chr(szbits)
 264         if not sz:
 265             break
 266         szbits = sz & 0x7f
 267         sz >>= 7
 268     z = zlib.compressobj(compression_level)
 269     yield szout
 270     yield z.compress(content)
 271     yield z.flush()
 272
 273
 274 def _encode_looseobj(type, content, compression_level=1):
 275     z = zlib.compressobj(compression_level)
 276     yield z.compress('%s %d\0' % (type, len(content)))
 277     yield z.compress(content)
 278     yield z.flush()
 279
 280
 281 def _decode_looseobj(buf):
 282     assert(buf);
 283     s = zlib.decompress(buf)
 284     i = s.find('\0')
 285     assert(i > 0)
 286     l = s[:i].split(' ')
 287     type = l[0]
 288     sz = int(l[1])
 289     content = s[i+1:]
 290     assert(type in _typemap)
 291     assert(sz == len(content))
 292     return (type, content)
 293
 294
 295 def _decode_packobj(buf):
 296     assert(buf)
 297     c = ord(buf[0])
 298     type = _typermap[(c & 0x70) >> 4]
 299     sz = c & 0x0f
 300     shift = 4
 301     i = 0
 302     while c & 0x80:
 303         i += 1
 304         c = ord(buf[i])
 305         sz |= (c & 0x7f) << shift
 306         shift += 7
 307         if not (c & 0x80):
 308             break
 309     return (type, zlib.decompress(buf[i+1:]))
 310
 311
 312 class PackIdx:
 313     def __init__(self):
 314         assert(0)
 315
 316     def find_offset(self, hash):
 317         """Get the offset of an object inside the index file."""
 318         idx = self._idx_from_hash(hash)
 319         if idx != None:
 320             return self._ofs_from_idx(idx)
 321         return None
 322
 323     def exists(self, hash, want_source=False):
 324         """Return nonempty if the object exists in this index."""
 325         if hash and (self._idx_from_hash(hash) != None):
 326             return want_source and os.path.basename(self.name) or True
 327         return None
 328
 329     def __len__(self):
 330         return int(self.fanout[255])
 331
 332     def _idx_from_hash(self, hash):
 333         global _total_searches, _total_steps
 334         _total_searches += 1
 335         assert(len(hash) == 20)
 336         b1 = ord(hash[0])
 337         start = self.fanout[b1-1] # range -1..254
 338         end = self.fanout[b1] # range 0..255
 339         want = str(hash)
 340         _total_steps += 1  # lookup table is a step
 341         while start < end:
 342             _total_steps += 1
 343             mid = start + (end-start)/2
 344             v = self._idx_to_hash(mid)
 345             if v < want:
 346                 start = mid+1
 347             elif v > want:
 348                 end = mid
 349             else: # got it!
 350                 return mid
 351         return None
 352
 353
 354 class PackIdxV1(PackIdx):
 355     """Object representation of a Git pack index (version 1) file."""
 356     def __init__(self, filename, f):
 357         self.name = filename
 358         self.idxnames = [self.name]
 359         self.map = mmap_read(f)
 360         self.fanout = list(struct.unpack('!256I',
 361                                          str(buffer(self.map, 0, 256*4))))
 362         self.fanout.append(0)  # entry "-1"
 363         nsha = self.fanout[255]
 364         self.sha_ofs = 256*4
 365         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 366
 367     def _ofs_from_idx(self, idx):
 368         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 369
 370     def _idx_to_hash(self, idx):
 371         return str(self.shatable[idx*24+4 : idx*24+24])
 372
 373     def __iter__(self):
 374         for i in xrange(self.fanout[255]):
 375             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 376
 377
 378 class PackIdxV2(PackIdx):
 379     """Object representation of a Git pack index (version 2) file."""
 380     def __init__(self, filename, f):
 381         self.name = filename
 382         self.idxnames = [self.name]
 383         self.map = mmap_read(f)
 384         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 385         self.fanout = list(struct.unpack('!256I',
 386                                          str(buffer(self.map, 8, 256*4))))
 387         self.fanout.append(0)  # entry "-1"
 388         nsha = self.fanout[255]
 389         self.sha_ofs = 8 + 256*4
 390         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 391         self.ofstable = buffer(self.map,
 392                                self.sha_ofs + nsha*20 + nsha*4,
 393                                nsha*4)
 394         self.ofs64table = buffer(self.map,
 395                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 396
 397     def _ofs_from_idx(self, idx):
 398         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 399         if ofs & 0x80000000:
 400             idx64 = ofs & 0x7fffffff
 401             ofs = struct.unpack('!Q',
 402                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 403         return ofs
 404
 405     def _idx_to_hash(self, idx):
 406         return str(self.shatable[idx*20:(idx+1)*20])
 407
 408     def __iter__(self):
 409         for i in xrange(self.fanout[255]):
 410             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 411
 412
# Number of live PackIdxList instances; the constructor asserts that no
# other instance exists.
_mpi_count = 0
class PackIdxList:
    """The set of pack indexes (.idx and .midx) found in one directory,
    searchable as a unit, with an optional bloom filter in front."""
    def __init__(self, dir):
        """Load the indexes found in dir (see refresh())."""
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        # Hashes registered through add(); exists() reports them as present.
        self.also = set()
        # Currently open index objects, searched in this order by exists().
        self.packs = []
        # Whether exists() should consult the bloom filter first.
        self.do_bloom = False
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the merged contents of all indexes (see idxmerge)."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Sum of the lengths of all open indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Possible hit: search the packs, and skip the bloom check
                # until a search of the packs comes up empty again.
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # Not found in any pack: consult the bloom filter first next time.
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file path to the index object that covers it.  It starts
        # from the currently open packs, minus the midx objects when
        # skip_midx is in effect.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # Every index named by an already open midx is covered by it.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open the .midx files that are not open yet; a midx naming
                # an index file that no longer exists is deleted.
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the largest midx first, newest first among equals.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        ix.close()
                        unlink(ix.name)
            # Open every .idx file that is not already covered; files that
            # open_idx() rejects are reported and skipped.
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # Several paths may map to the same index object; keep each
            # object once, largest first.
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # The bloom filter is only used when it is valid and holds at
            # least as many entries as the indexes do.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 542
 543
 544 def open_idx(filename):
 545     if filename.endswith('.idx'):
 546         f = open(filename, 'rb')
 547         header = f.read(8)
 548         if header[0:4] == '\377tOc':
 549             version = struct.unpack('!I', header[4:8])[0]
 550             if version == 2:
 551                 return PackIdxV2(filename, f)
 552             else:
 553                 raise GitError('%s: expected idx file version 2, got %d'
 554                                % (filename, version))
 555         elif len(header) == 8 and header[0:4] < '\377tOc':
 556             return PackIdxV1(filename, f)
 557         else:
 558             raise GitError('%s: unrecognized idx file header' % filename)
 559     elif filename.endswith('.midx'):
 560         return midx.PackMidx(filename)
 561     else:
 562         raise GitError('idx filenames must end with .idx or .midx')
 563
 564
 565 def idxmerge(idxlist, final_progress=True):
 566     """Generate a list of all the objects reachable in a PackIdxList."""
 567     def pfunc(count, total):
 568         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 569                   % (count*100.0/total, count, total))
 570     def pfinal(count, total):
 571         if final_progress:
 572             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 573                      % (100, total, total))
 574     return merge_iter(idxlist, 10024, pfunc, pfinal)
 575
 576
 577 def _make_objcache():
 578     return PackIdxList(repo('objects/pack'))
 579
 580 # bup-gc assumes that it can disable all PackWriter activities
 581 # (bloom/midx/cache) via the constructor and close() arguments.
 582
 583 class PackWriter:
 584     """Writes Git objects inside a pack file."""
 585     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 586                  run_midx=True, on_pack_finish=None):
 587         self.file = None
 588         self.parentfd = None
 589         self.count = 0
 590         self.outbytes = 0
 591         self.filename = None
 592         self.idx = None
 593         self.objcache_maker = objcache_maker
 594         self.objcache = None
 595         self.compression_level = compression_level
 596         self.run_midx=run_midx
 597         self.on_pack_finish = on_pack_finish
 598
 599     def __del__(self):
 600         self.close()
 601
 602     def _open(self):
 603         if not self.file:
 604             objdir = dir=repo('objects')
 605             fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
 606             try:
 607                 self.file = os.fdopen(fd, 'w+b')
 608             except:
 609                 os.close(fd)
 610                 raise
 611             try:
 612                 self.parentfd = os.open(objdir, os.O_RDONLY)
 613             except:
 614                 f = self.file
 615                 self.file = None
 616                 f.close()
 617                 raise
 618             assert(name.endswith('.pack'))
 619             self.filename = name[:-5]
 620             self.file.write('PACK\0\0\0\2\0\0\0\0')
 621             self.idx = list(list() for i in xrange(256))
 622
 623     def _raw_write(self, datalist, sha):
 624         self._open()
 625         f = self.file
 626         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 627         # the file never has a *partial* blob.  So let's make sure it's
 628         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 629         # to our hashsplit algorithm.)  f.write() does its own buffering,
 630         # but that's okay because we'll flush it in _end().
 631         oneblob = ''.join(datalist)
 632         try:
 633             f.write(oneblob)
 634         except IOError as e:
 635             raise GitError, e, sys.exc_info()[2]
 636         nw = len(oneblob)
 637         crc = zlib.crc32(oneblob) & 0xffffffff
 638         self._update_idx(sha, crc, nw)
 639         self.outbytes += nw
 640         self.count += 1
 641         return nw, crc
 642
 643     def _update_idx(self, sha, crc, size):
 644         assert(sha)
 645         if self.idx:
 646             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 647
 648     def _write(self, sha, type, content):
 649         if verbose:
 650             log('>')
 651         if not sha:
 652             sha = calc_hash(type, content)
 653         size, crc = self._raw_write(_encode_packobj(type, content,
 654                                                     self.compression_level),
 655                                     sha=sha)
 656         if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
 657             self.breakpoint()
 658         return sha
 659
 660     def breakpoint(self):
 661         """Clear byte and object counts and return the last processed id."""
 662         id = self._end(self.run_midx)
 663         self.outbytes = self.count = 0
 664         return id
 665
 666     def _require_objcache(self):
 667         if self.objcache is None and self.objcache_maker:
 668             self.objcache = self.objcache_maker()
 669         if self.objcache is None:
 670             raise GitError(
 671                     "PackWriter not opened or can't check exists w/o objcache")
 672
 673     def exists(self, id, want_source=False):
 674         """Return non-empty if an object is found in the object cache."""
 675         self._require_objcache()
 676         return self.objcache.exists(id, want_source=want_source)
 677
 678     def just_write(self, sha, type, content):
 679         """Write an object to the pack file, bypassing the objcache.  Fails if
 680         sha exists()."""
 681         self._write(sha, type, content)
 682
 683     def maybe_write(self, type, content):
 684         """Write an object to the pack file if not present and return its id."""
 685         sha = calc_hash(type, content)
 686         if not self.exists(sha):
 687             self.just_write(sha, type, content)
 688             self._require_objcache()
 689             self.objcache.add(sha)
 690         return sha
 691
 692     def new_blob(self, blob):
 693         """Create a blob object in the pack with the supplied content."""
 694         return self.maybe_write('blob', blob)
 695
 696     def new_tree(self, shalist):
 697         """Create a tree object in the pack."""
 698         content = tree_encode(shalist)
 699         return self.maybe_write('tree', content)
 700
 701     def new_commit(self, tree, parent,
 702                    author, adate_sec, adate_tz,
 703                    committer, cdate_sec, cdate_tz,
 704                    msg):
 705         """Create a commit object in the pack.  The date_sec values must be
 706         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 707         if adate_tz:
 708             adate_str = _git_date_str(adate_sec, adate_tz)
 709         else:
 710             adate_str = _local_git_date_str(adate_sec)
 711         if cdate_tz:
 712             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 713         else:
 714             cdate_str = _local_git_date_str(cdate_sec)
 715         l = []
 716         if tree: l.append('tree %s' % tree.encode('hex'))
 717         if parent: l.append('parent %s' % parent.encode('hex'))
 718         if author: l.append('author %s %s' % (author, adate_str))
 719         if committer: l.append('committer %s %s' % (committer, cdate_str))
 720         l.append('')
 721         l.append(msg)
 722         return self.maybe_write('commit', '\n'.join(l))
 723
 724     def abort(self):
 725         """Remove the pack file from disk."""
 726         f = self.file
 727         if f:
 728             pfd = self.parentfd
 729             self.file = None
 730             self.parentfd = None
 731             self.idx = None
 732             try:
 733                 try:
 734                     os.unlink(self.filename + '.pack')
 735                 finally:
 736                     f.close()
 737             finally:
 738                 if pfd is not None:
 739                     os.close(pfd)
 740
 741     def _end(self, run_midx=True):
 742         f = self.file
 743         if not f: return None
 744         self.file = None
 745         try:
 746             self.objcache = None
 747             idx = self.idx
 748             self.idx = None
 749
 750             # update object count
 751             f.seek(8)
 752             cp = struct.pack('!i', self.count)
 753             assert(len(cp) == 4)
 754             f.write(cp)
 755
 756             # calculate the pack sha1sum
 757             f.seek(0)
 758             sum = Sha1()
 759             for b in chunkyreader(f):
 760                 sum.update(b)
 761             packbin = sum.digest()
 762             f.write(packbin)
 763             fdatasync(f.fileno())
 764         finally:
 765             f.close()
 766
 767         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 768
 769         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 770         if os.path.exists(self.filename + '.map'):
 771             os.unlink(self.filename + '.map')
 772         os.rename(self.filename + '.pack', nameprefix + '.pack')
 773         os.rename(self.filename + '.idx', nameprefix + '.idx')
 774         try:
 775             os.fsync(self.parentfd)
 776         finally:
 777             os.close(self.parentfd)
 778
 779         if run_midx:
 780             auto_midx(repo('objects/pack'))
 781
 782         if self.on_pack_finish:
 783             self.on_pack_finish(nameprefix)
 784
 785         return nameprefix
 786
 787     def close(self, run_midx=True):
 788         """Close the pack file and move it to its definitive path."""
 789         return self._end(run_midx=run_midx)
 790
 791     def _write_pack_idx_v2(self, filename, idx, packbin):
 792         ofs64_count = 0
 793         for section in idx:
 794             for entry in section:
 795                 if entry[2] >= 2**31:
 796                     ofs64_count += 1
 797
 798         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 799         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 800         idx_map = None
 801         idx_f = open(filename, 'w+b')
 802         try:
 803             idx_f.truncate(index_len)
 804             fdatasync(idx_f.fileno())
 805             idx_map = mmap_readwrite(idx_f, close=False)
 806             try:
 807                 count = _helpers.write_idx(filename, idx_map, idx, self.count)
 808                 assert(count == self.count)
 809                 idx_map.flush()
 810             finally:
 811                 idx_map.close()
 812         finally:
 813             idx_f.close()
 814
 815         idx_f = open(filename, 'a+b')
 816         try:
 817             idx_f.write(packbin)
 818             idx_f.seek(0)
 819             idx_sum = Sha1()
 820             b = idx_f.read(8 + 4*256)
 821             idx_sum.update(b)
 822
 823             obj_list_sum = Sha1()
 824             for b in chunkyreader(idx_f, 20*self.count):
 825                 idx_sum.update(b)
 826                 obj_list_sum.update(b)
 827             namebase = obj_list_sum.hexdigest()
 828
 829             for b in chunkyreader(idx_f):
 830                 idx_sum.update(b)
 831             idx_f.write(idx_sum.digest())
 832             fdatasync(idx_f.fileno())
 833             return namebase
 834         finally:
 835             idx_f.close()
 836
 837
 838 def _gitenv(repo_dir = None):
 839     if not repo_dir:
 840         repo_dir = repo()
 841     def env():
 842         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 843     return env
 844
 845
 846 def list_refs(refnames=None, repo_dir=None,
 847               limit_to_heads=False, limit_to_tags=False):
 848     """Yield (refname, hash) tuples for all repository refs unless
 849     refnames are specified.  In that case, only include tuples for
 850     those refs.  The limits restrict the result items to refs/heads or
 851     refs/tags.  If both limits are specified, items from both sources
 852     will be included.
 853
 854     """
 855     argv = ['git', 'show-ref']
 856     if limit_to_heads:
 857         argv.append('--heads')
 858     if limit_to_tags:
 859         argv.append('--tags')
 860     argv.append('--')
 861     if refnames:
 862         argv += refnames
 863     p = subprocess.Popen(argv,
 864                          preexec_fn = _gitenv(repo_dir),
 865                          stdout = subprocess.PIPE)
 866     out = p.stdout.read().strip()
 867     rv = p.wait()  # not fatal
 868     if rv:
 869         assert(not out)
 870     if out:
 871         for d in out.split('\n'):
 872             (sha, name) = d.split(' ', 1)
 873             yield (name, sha.decode('hex'))
 874
 875
 876 def read_ref(refname, repo_dir = None):
 877     """Get the commit id of the most recent commit made on a given ref."""
 878     refs = list_refs(refnames=[refname], repo_dir=repo_dir, limit_to_heads=True)
 879     l = tuple(islice(refs, 2))
 880     if l:
 881         assert(len(l) == 1)
 882         return l[0][1]
 883     else:
 884         return None
 885
 886
 887 def rev_list(ref, count=None, repo_dir=None):
 888     """Generate a list of reachable commits in reverse chronological order.
 889
 890     This generator walks through commits, from child to parent, that are
 891     reachable via the specified ref and yields a series of tuples of the form
 892     (date,hash).
 893
 894     If count is a non-zero integer, limit the number of commits to "count"
 895     objects.
 896     """
 897     assert(not ref.startswith('-'))
 898     opts = []
 899     if count:
 900         opts += ['-n', str(atoi(count))]
 901     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 902     p = subprocess.Popen(argv,
 903                          preexec_fn = _gitenv(repo_dir),
 904                          stdout = subprocess.PIPE)
 905     commit = None
 906     for row in p.stdout:
 907         s = row.strip()
 908         if s.startswith('commit '):
 909             commit = s[7:].decode('hex')
 910         else:
 911             date = int(s)
 912             yield (date, commit)
 913     rv = p.wait()  # not fatal
 914     if rv:
 915         raise GitError, 'git rev-list returned error %d' % rv
 916
 917
 918 def get_commit_dates(refs, repo_dir=None):
 919     """Get the dates for the specified commit refs.  For now, every unique
 920        string in refs must resolve to a different commit or this
 921        function will fail."""
 922     result = []
 923     for ref in refs:
 924         commit = get_commit_items(ref, cp(repo_dir))
 925         result.append(commit.author_sec)
 926     return result
 927
 928
 929 def rev_parse(committish, repo_dir=None):
 930     """Resolve the full hash for 'committish', if it exists.
 931
 932     Should be roughly equivalent to 'git rev-parse'.
 933
 934     Returns the hex value of the hash if it is found, None if 'committish' does
 935     not correspond to anything.
 936     """
 937     head = read_ref(committish, repo_dir=repo_dir)
 938     if head:
 939         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 940         return head
 941
 942     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
 943
 944     if len(committish) == 40:
 945         try:
 946             hash = committish.decode('hex')
 947         except TypeError:
 948             return None
 949
 950         if pL.exists(hash):
 951             return hash
 952
 953     return None
 954
 955
 956 def update_ref(refname, newval, oldval, repo_dir=None):
 957     """Update a repository reference."""
 958     if not oldval:
 959         oldval = ''
 960     assert(refname.startswith('refs/heads/') \
 961            or refname.startswith('refs/tags/'))
 962     p = subprocess.Popen(['git', 'update-ref', refname,
 963                           newval.encode('hex'), oldval.encode('hex')],
 964                          preexec_fn = _gitenv(repo_dir))
 965     _git_wait('git update-ref', p)
 966
 967
 968 def delete_ref(refname, oldvalue=None):
 969     """Delete a repository reference (see git update-ref(1))."""
 970     assert(refname.startswith('refs/'))
 971     oldvalue = [] if not oldvalue else [oldvalue]
 972     p = subprocess.Popen(['git', 'update-ref', '-d', refname] + oldvalue,
 973                          preexec_fn = _gitenv())
 974     _git_wait('git update-ref', p)
 975
 976
 977 def guess_repo(path=None):
 978     """Set the path value in the global variable "repodir".
 979     This makes bup look for an existing bup repository, but not fail if a
 980     repository doesn't exist. Usually, if you are interacting with a bup
 981     repository, you would not be calling this function but using
 982     check_repo_or_die().
 983     """
 984     global repodir
 985     if path:
 986         repodir = path
 987     if not repodir:
 988         repodir = os.environ.get('BUP_DIR')
 989         if not repodir:
 990             repodir = os.path.expanduser('~/.bup')
 991
 992
 993 def init_repo(path=None):
 994     """Create the Git bare repository for bup in a given path."""
 995     guess_repo(path)
 996     d = repo()  # appends a / to the path
 997     parent = os.path.dirname(os.path.dirname(d))
 998     if parent and not os.path.exists(parent):
 999         raise GitError('parent directory "%s" does not exist\n' % parent)
1000     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1001         raise GitError('"%s" exists but is not a directory\n' % d)
1002     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1003                          preexec_fn = _gitenv())
1004     _git_wait('git init', p)
1005     # Force the index version configuration in order to ensure bup works
1006     # regardless of the version of the installed Git binary.
1007     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1008                          stdout=sys.stderr, preexec_fn = _gitenv())
1009     _git_wait('git config', p)
1010     # Enable the reflog
1011     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
1012                          stdout=sys.stderr, preexec_fn = _gitenv())
1013     _git_wait('git config', p)
1014
1015
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.

    The repository is chosen by guess_repo(path).  If its objects/pack
    directory cannot be stat'ed, an error is logged and the process
    exits: status 15 when the directory does not exist, status 14 for
    any other OSError.  This function does not create a repository.
    """
    guess_repo(path)
    try:
        os.stat(repo('objects/pack/.'))
    except OSError as e:
        if e.errno != errno.ENOENT:
            log('error: %s\n' % e)
            sys.exit(14)
        log('error: %r is not a bup repository; run "bup init"\n'
            % repo())
        sys.exit(15)
1032
1033
_ver = None
def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')

    The output of 'git --version' is parsed once and cached in the
    module-level _ver; the minimum-version check runs on every call.

    Raises GitError if git exits non-zero, if its output cannot be parsed,
    or if the installed version is older than 1.5.3.1.
    """
    def numeric(components):
        # Compare by integer value so that, e.g., '10' sorts after '9'.
        # Stop at the first component without leading digits (suffixes
        # such as 'windows' or 'rc1' carry no ordering information here).
        key = []
        for component in components:
            digits = re.match(r'\d+', component)
            if not digits:
                break
            key.append(int(digits.group(0)))
        return tuple(key)

    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        # Capture only the whitespace-free version token; the previous
        # pattern's unescaped '.' could match a space and pull trailing
        # text such as ' (Apple' into the last component.
        m = re.match(r'git version (\S+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    if numeric(_ver) < numeric(needed):
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver
1059
1060
1061 def _git_wait(cmd, p):
1062     rv = p.wait()
1063     if rv != 0:
1064         raise GitError('%s returned %d' % (cmd, rv))
1065
1066
def _git_capture(argv):
    """Run argv with GIT_DIR set for the default repo; return its stdout.

    Raises GitError (via _git_wait) if the command exits non-zero.
    """
    proc = subprocess.Popen(argv, stdout=subprocess.PIPE,
                            preexec_fn = _gitenv())
    output = proc.stdout.read()
    _git_wait(repr(argv), proc)
    return output
1072
1073
1074 class _AbortableIter:
1075     def __init__(self, it, onabort = None):
1076         self.it = it
1077         self.onabort = onabort
1078         self.done = None
1079
1080     def __iter__(self):
1081         return self
1082
1083     def next(self):
1084         try:
1085             return self.it.next()
1086         except StopIteration as e:
1087             self.done = True
1088             raise
1089         except:
1090             self.abort()
1091             raise
1092
1093     def abort(self):
1094         """Abort iteration and call the abortion callback, if needed."""
1095         if not self.done:
1096             self.done = True
1097             if self.onabort:
1098                 self.onabort()
1099
1100     def __del__(self):
1101         self.abort()
1102
1103
class MissingObject(KeyError):
    """KeyError raised for an object id that git reports as missing.

    The offending id is kept on the instance as .id; the message shows
    its hex encoding.
    """
    def __init__(self, id):
        self.id = id
        hex_id = id.encode('hex')
        message = 'object %r is missing' % hex_id
        KeyError.__init__(self, message)
1108
1109
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    On construction self.get is bound to one of two generators:
    _fast_get, which talks to a single long-running
    'git cat-file --batch' child, or _slow_get, which spawns separate
    'git cat-file' children per object.  The choice depends on ver().
    """
    def __init__(self, repo_dir = None):
        """Pick the retrieval strategy for repo_dir.

        When the slow path is selected a warning is logged, at most once
        per process (tracked by the module-level _ver_warned flag).
        """
        global _ver_warned
        self.repo_dir = repo_dir
        wanted = ('1','5','6')
        if ver() < wanted:
            if not _ver_warned:
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            self.p = self.inprogress = None
            self.get = self._fast_get

    def _abort(self):
        """Close the pipes to the batch child, if any, and reset state."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        """Drop any existing batch child and start a fresh one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        """Yield the object's type, then its content in chunks.

        id is written to the batch child as one line, so it must not
        contain newlines or start with '-'.  Only one retrieval may be in
        progress at a time.  Raises MissingObject if git reports the id as
        missing and GitError if the reply header is malformed.
        """
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise MissingObject(id.decode('hex'))
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        # If reading the content fails part-way, the child's output is no
        # longer in a known position, so the pipe is torn down via _abort.
        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                           onabort = self._abort)
        try:
            yield type
            for blob in it:
                yield blob
            # The content is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the object's type, then its content in chunks.

        Uses one 'git cat-file -t' child for the type and a second child
        for the content, both run against self.repo_dir.  Raises GitError
        (via _git_wait) if either child fails.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # Look the type up in the same repository the content is read
        # from; going through _git_capture() would use the default
        # repository's environment and ignore self.repo_dir.
        type_argv = ['git', 'cat-file', '-t', id]
        p = subprocess.Popen(type_argv,
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        type = p.stdout.read().strip()
        _git_wait(repr(type_argv), p)
        yield type

        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield blob content reachable from the object streamed by it.

        it is an iterator as produced by self.get(): the first item is the
        object type, the rest is its content.  Trees and commits are
        followed recursively through self.join().  Raises GitError for any
        other object type.
        """
        type = it.next()
        if type == 'blob':
            for blob in it:
                yield blob
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif type == 'commit':
            # Only the first line of the commit is needed.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % type)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1224             log('booger!\n')
1225
1226
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating and caching it on first use.

    Instances are cached in the module-level _cp dict, keyed by the
    absolute repository path; repo() supplies the path when repo_dir is
    empty.
    """
    key = os.path.abspath(repo_dir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1240
1241
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Tag names have the leading 'refs/tags/' removed.
    """
    by_hash = {}
    for refname, sha in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(refname.startswith('refs/tags/'))
        # Several tags may point at the same hash, hence a list per hash.
        by_hash.setdefault(sha, []).append(refname[10:])
    return by_hash
1252
1253
WalkItem = namedtuple('WalkItem', 'id type mode path chunk_path data')
# path holds the mangled path components.  When an item is a fragment
# of a chunked file, chunk_path holds the path of that fragment within
# the file's chunk subtree, e.g. ['', '2d3115e', ...]; the top-level
# entry of a chunked file has a chunk_path of [''].  For example, some
# chunk subtree of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1266
1267
def walk_object(cat_pipe, id,
                stop_at=None,
                include_data=None):
    """Yield a WalkItem for everything reachable from id via cat_pipe.

    stop_at, if given, is called with each id before it is visited; a
    true result skips that object and everything below it.  Blob content
    is only placed in the data field when include_data is set.  Objects
    are fetched with cat_pipe.get(); an object type other than blob,
    commit or tree raises Exception, and cat_pipe may raise MissingObject
    for ids that are not in the repository.
    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(id, [], [], None)]
    while pending:
        id, parent_path, chunk_path, mode = pending.pop()
        if stop_at and stop_at(id):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # A "regular file" entry is a leaf of the graph, so when the
            # caller has not asked for data there is no need to read it.
            yield WalkItem(id=id, type='blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        obj_it = cat_pipe.get(id)
        obj_type = obj_it.next()
        if obj_type not in ('blob', 'commit', 'tree'):
            raise Exception('unexpected repository object type %r' % obj_type)

        # FIXME: set the mode based on the type when the mode is None
        if obj_type == 'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for _unused in obj_it:
                pass
            data = None
        else:
            data = ''.join(obj_it)

        yield WalkItem(id=id, type=obj_type,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if obj_type == 'commit':
            commit_items = parse_commit(data)
            # Parents are pushed first and the tree last, so the tree is
            # the next thing popped.
            for parent_id in commit_items.parents:
                pending.append((parent_id, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif obj_type == 'tree':
            for ent_mode, name, ent_id in tree_decode(data):
                _demangled, bup_type = demangle_name(name, ent_mode)
                if chunk_path:
                    # Already inside a chunked file: the file path stays
                    # fixed and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                elif bup_type == BUP_CHUNKED:
                    sub_path = parent_path + [name]
                    sub_chunk_path = ['']
                else:
                    sub_path = parent_path + [name]
                    sub_chunk_path = chunk_path
                pending.append((ent_id.encode('hex'), sub_path,
                                sub_chunk_path, ent_mode))