1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
7 from collections import namedtuple
8 from itertools import islice
10 from bup import _helpers, hashsplit, path, midx, bloom, xstat
11 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
13 hostname, localtime, log, merge_iter,
14 mmap_read, mmap_readwrite,
15 progress, qprogress, unlink, username, userfullname,
19 max_pack_size = 1000*1000*1000 # larger packs will slow down pruning
20 max_pack_objects = 200*1000 # cache memory usage is about 83 bytes per object
26 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
27 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
33 class GitError(Exception):
37 def parse_tz_offset(s):
38 """UTC offset in seconds."""
# Converts a git-style "[+-]HHMM" timezone string to seconds.
# NOTE(review): the sign handling and return statement are on elided lines
# of this dump — confirm against the full file.
39 tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
45 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
46 # Make sure that's authoritative.
47 _start_end_char = r'[^ .,:;<>"\'\0\n]'
48 _content_char = r'[^\0\n<>]'
49 _safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
51 _start_end_char, _content_char, _start_end_char)
52 _tz_rx = r'[-+]\d\d[0-5]\d'
53 _parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
54 _commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
55 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
56 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)
58 (?P<message>(?:.|\n)*)''' % (_parent_rx,
59 _safe_str_rx, _safe_str_rx, _tz_rx,
60 _safe_str_rx, _safe_str_rx, _tz_rx))
61 _parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
64 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
65 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
66 'author_name', 'author_mail',
67 'author_sec', 'author_offset',
68 'committer_name', 'committer_mail',
69 'committer_sec', 'committer_offset',
72 def parse_commit(content):
# Parse a raw git commit object body into a CommitInfo namedtuple using the
# module-level _commit_rx regex; raises if the content doesn't match.
73 commit_match = re.match(_commit_rx, content)
75 raise Exception('cannot parse commit %r' % content)
76 matches = commit_match.groupdict()
77 return CommitInfo(tree=matches['tree'],
# 'parents' is the concatenated "parent <sha>\n" lines; extract the hashes.
78 parents=re.findall(_parent_hash_rx, matches['parents']),
79 author_name=matches['author_name'],
80 author_mail=matches['author_mail'],
# asec/csec are (UTC) epoch seconds; atz/ctz are "[+-]HHMM" offsets.
81 author_sec=int(matches['asec']),
82 author_offset=parse_tz_offset(matches['atz']),
83 committer_name=matches['committer_name'],
84 committer_mail=matches['committer_mail'],
85 committer_sec=int(matches['csec']),
86 committer_offset=parse_tz_offset(matches['ctz']),
87 message=matches['message'])
90 def get_commit_items(id, cp):
# Fetch commit `id` through the CatPipe `cp` and return its parsed CommitInfo.
91 commit_it = cp.get(id)
# The first item yielded by cp.get() is the object's type string.
92 assert(commit_it.next() == 'commit')
93 commit_content = ''.join(commit_it)
94 return parse_commit(commit_content)
97 def _local_git_date_str(epoch_sec):
# Git-style date string using the local timezone's offset for epoch_sec.
98 return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
101 def _git_date_str(epoch_sec, tz_offset_sec):
# Format "SECONDS [+-]HHMM" from epoch seconds and a tz offset in seconds.
102 offs = tz_offset_sec // 60
103 return '%d %s%02d%02d' \
# (format arguments continue on an elided line; sign is chosen here)
105 '+' if offs >= 0 else '-',
110 def repo(sub = '', repo_dir=None):
111 """Get the path to the git repository or one of its subdirectories."""
# Fall back to the module-global repodir (set by check_repo_or_die/guess_repo).
113 repo_dir = repo_dir or repodir
115 raise GitError('You should call check_repo_or_die()')
117 # If there's a .git subdirectory, then the actual repo is in there.
118 gd = os.path.join(repo_dir, '.git')
119 if os.path.exists(gd):
122 return os.path.join(repo_dir, sub)
126 return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
131 full = os.path.abspath(path)
132 fullrepo = os.path.abspath(repo(''))
133 if not fullrepo.endswith('/'):
135 if full.startswith(fullrepo):
136 path = full[len(fullrepo):]
137 if path.startswith('index-cache/'):
138 path = path[len('index-cache/'):]
139 return shorten_hash(path)
143 paths = [repo('objects/pack')]
144 paths += glob.glob(repo('index-cache/*/.'))
148 def auto_midx(objdir):
# Run "bup midx --auto" and then "bup bloom" over objdir as subprocesses.
# Failures are recorded via add_error() rather than raised, so pack
# finalization can proceed even if index maintenance fails.
149 args = [path.exe(), 'midx', '--auto', '--dir', objdir]
151 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
153 # make sure 'args' gets printed to help with debugging
154 add_error('%r: exception: %s' % (args, e))
157 add_error('%r: returned %d' % (args, rv))
159 args = [path.exe(), 'bloom', '--dir', objdir]
161 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
163 # make sure 'args' gets printed to help with debugging
164 add_error('%r: exception: %s' % (args, e))
167 add_error('%r: returned %d' % (args, rv))
170 def mangle_name(name, mode, gitmode):
171 """Mangle a file name to present an abstract name for segmented files.
172 Mangled file names will have the ".bup" extension added to them. If a
173 file's name already ends with ".bup", a ".bupl" extension is added to
174 disambiguate normal files from segmented ones.
# A regular file (mode) stored as a git tree (gitmode) means it was chunked.
176 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
177 assert(stat.S_ISDIR(gitmode))
# name[:-1].endswith('.bup') also catches names one character longer than
# '.bup' (e.g. 'x.bupl'), so their demangled forms can't collide.
179 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
180 return name + '.bupl'
185 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
186 def demangle_name(name, mode):
187 """Remove name mangling from a file name, if necessary.
189 The return value is a tuple (demangled_filename,mode), where mode is one of
192 * BUP_NORMAL : files that should be read as-is from the repository
193 * BUP_CHUNKED : files that were chunked and need to be reassembled
195 For more information on the name mangling algorithm, see mangle_name()
197 if name.endswith('.bupl'):
198 return (name[:-5], BUP_NORMAL)
199 elif name.endswith('.bup'):
200 return (name[:-4], BUP_CHUNKED)
201 elif name.endswith('.bupm'):
# A .bupm entry is chunked iff its tree entry mode marks it a directory.
203 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
205 return (name, BUP_NORMAL)
208 def calc_hash(type, content):
209 """Calculate some content's hash in the Git fashion."""
# Git object id = sha1 of "<type> <len>\0" followed by the content
# (hash accumulation continues on elided lines of this dump).
210 header = '%s %d\0' % (type, len(content))
216 def shalist_item_sort_key(ent):
# Sort key for git tree entries: git orders directories as if their
# names ended with '/' (the directory branch continues on elided lines).
217 (mode, name, id) = ent
# Cheap integer-ness check: fails for non-numeric modes.
218 assert(mode+0 == mode)
219 if stat.S_ISDIR(mode):
225 def tree_encode(shalist):
226 """Generate a git tree object from (mode,name,hash) tuples."""
# git requires tree entries in its specific sort order (see
# shalist_item_sort_key above).
227 shalist = sorted(shalist, key = shalist_item_sort_key)
229 for (mode,name,bin) in shalist:
231 assert(mode+0 == mode)
# bin must be the 20-byte binary sha, not hex.
233 assert(len(bin) == 20)
234 s = '%o %s\0%s' % (mode,name,bin)
235 assert(s[0] != '0') # 0-padded octal is not acceptable in a git tree
240 def tree_decode(buf):
241 """Generate a list of (mode,name,hash) from the git tree object in buf."""
243 while ofs < len(buf):
# Each entry is "<octal mode> <name>\0" followed by a 20-byte binary sha.
244 z = buf.find('\0', ofs)
246 spl = buf[ofs:z].split(' ', 1)
247 assert(len(spl) == 2)
249 sha = buf[z+1:z+1+20]
251 yield (int(mode, 8), name, sha)
254 def _encode_packobj(type, content, compression_level=1):
# Yield a git pack-object: varint header (type + size) then the zlib stream.
255 if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
256 raise ValueError('invalid compression level %s' % compression_level)
# First header byte: low 4 bits of size, then the 3-bit type code.
259 szbits = (sz & 0x0f) | (_typemap[type]<<4)
# High bit set means "more size bytes follow" (git pack varint encoding).
262 if sz: szbits |= 0x80
268 z = zlib.compressobj(compression_level)
270 yield z.compress(content)
274 def _encode_looseobj(type, content, compression_level=1):
# Loose object = zlib("<type> <len>\0" + content); the final z.flush()
# is on an elided line of this dump.
275 z = zlib.compressobj(compression_level)
276 yield z.compress('%s %d\0' % (type, len(content)))
277 yield z.compress(content)
281 def _decode_looseobj(buf):
# Inverse of _encode_looseobj: decompress, then split the "<type> <len>\0"
# header from the payload (parsing happens on elided lines).
283 s = zlib.decompress(buf)
290 assert(type in _typemap)
# Sanity-check the declared length against the actual payload.
291 assert(sz == len(content))
292 return (type, content)
295 def _decode_packobj(buf):
# Inverse of _encode_packobj: read the 3-bit type code and the varint
# size, then decompress the rest of the buffer.
298 type = _typermap[(c & 0x70) >> 4]
# Accumulate 7 bits of size per continuation byte.
305 sz |= (c & 0x7f) << shift
309 return (type, zlib.decompress(buf[i+1:]))
316 def find_offset(self, hash):
317 """Get the offset of an object inside the index file."""
# _idx_from_hash/_ofs_from_idx are supplied by the V1/V2 subclasses.
318 idx = self._idx_from_hash(hash)
320 return self._ofs_from_idx(idx)
323 def exists(self, hash, want_source=False):
324 """Return nonempty if the object exists in this index."""
325 if hash and (self._idx_from_hash(hash) != None):
# With want_source, return this idx file's basename instead of True.
326 return want_source and os.path.basename(self.name) or True
330 return int(self.fanout[255])
332 def _idx_from_hash(self, hash):
# Binary search for a 20-byte sha, bounded by the fanout table entry
# for the hash's first byte; tracks global search statistics.
333 global _total_searches, _total_steps
335 assert(len(hash) == 20)
337 start = self.fanout[b1-1] # range -1..254
338 end = self.fanout[b1] # range 0..255
340 _total_steps += 1 # lookup table is a step
343 mid = start + (end-start)/2
344 v = self._idx_to_hash(mid)
354 class PackIdxV1(PackIdx):
355 """Object representation of a Git pack index (version 1) file."""
356 def __init__(self, filename, f):
358 self.idxnames = [self.name]
359 self.map = mmap_read(f)
# v1 layout: 256-entry fanout of 4-byte big-endian counts, then 24-byte
# entries of (4-byte offset + 20-byte sha).
360 self.fanout = list(struct.unpack('!256I',
361 str(buffer(self.map, 0, 256*4))))
362 self.fanout.append(0) # entry "-1"
363 nsha = self.fanout[255]
365 self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
367 def _ofs_from_idx(self, idx):
368 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
370 def _idx_to_hash(self, idx):
371 return str(self.shatable[idx*24+4 : idx*24+24])
# __iter__ (its def line is elided here) yields each 20-byte sha as a
# zero-copy buffer over the mmap.
374 for i in xrange(self.fanout[255]):
375 yield buffer(self.map, 256*4 + 24*i + 4, 20)
378 class PackIdxV2(PackIdx):
379 """Object representation of a Git pack index (version 2) file."""
380 def __init__(self, filename, f):
382 self.idxnames = [self.name]
383 self.map = mmap_read(f)
# Verify the "\377tOc" magic plus version number 2.
384 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
385 self.fanout = list(struct.unpack('!256I',
386 str(buffer(self.map, 8, 256*4))))
387 self.fanout.append(0) # entry "-1"
388 nsha = self.fanout[255]
389 self.sha_ofs = 8 + 256*4
390 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
# v2 layout after the shas: nsha crc32s (4 bytes each), then 4-byte
# offsets, then 8-byte offsets for entries in packs larger than 2GB.
391 self.ofstable = buffer(self.map,
392 self.sha_ofs + nsha*20 + nsha*4,
394 self.ofs64table = buffer(self.map,
395 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
397 def _ofs_from_idx(self, idx):
398 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
# High bit set means the low 31 bits index into the 64-bit offset table.
400 idx64 = ofs & 0x7fffffff
401 ofs = struct.unpack('!Q',
402 str(buffer(self.ofs64table, idx64*8, 8)))[0]
405 def _idx_to_hash(self, idx):
406 return str(self.shatable[idx*20:(idx+1)*20])
# __iter__ (its def line is elided here) yields each 20-byte sha as a
# zero-copy buffer over the mmap.
409 for i in xrange(self.fanout[255]):
410 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
415 def __init__(self, dir):
417 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
422 self.do_bloom = False
429 assert(_mpi_count == 0)
432 return iter(idxmerge(self.packs))
435 return sum(len(pack) for pack in self.packs)
437 def exists(self, hash, want_source=False):
438 """Return nonempty if the object exists in the index files."""
439 global _total_searches
# self.also holds hashes registered via add() that aren't in any idx yet.
441 if hash in self.also:
# A bloom-filter miss proves absence; a hit must still be confirmed
# against the real pack indexes below.
443 if self.do_bloom and self.bloom:
444 if self.bloom.exists(hash):
445 self.do_bloom = False
447 _total_searches -= 1 # was counted by bloom
449 for i in xrange(len(self.packs)):
451 _total_searches -= 1 # will be incremented by sub-pack
452 ix = p.exists(hash, want_source=want_source)
454 # reorder so most recently used packs are searched first
455 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
460 def refresh(self, skip_midx = False):
461 """Refresh the index list.
462 This method verifies if .midx files were superseded (e.g. all of its
463 contents are in another, bigger .midx file) and removes the superseded
466 If skip_midx is True, all work on .midx files will be skipped and .midx
467 files will be removed from the list.
469 The module-global variable 'ignore_midx' can force this function to
470 always act as if skip_midx was True.
472 self.bloom = None # Always reopen the bloom as it may have been replaced
473 self.do_bloom = False
474 skip_midx = skip_midx or ignore_midx
# Start from the already-open indexes, dropping midxes if they're disabled.
475 d = dict((p.name, p) for p in self.packs
476 if not skip_midx or not isinstance(p, midx.PackMidx))
477 if os.path.exists(self.dir):
# Map each idx file covered by an open midx to that midx.
480 for ix in self.packs:
481 if isinstance(ix, midx.PackMidx):
482 for name in ix.idxnames:
483 d[os.path.join(self.dir, name)] = ix
484 for full in glob.glob(os.path.join(self.dir,'*.midx')):
486 mx = midx.PackMidx(full)
487 (mxd, mxf) = os.path.split(mx.name)
489 for n in mx.idxnames:
490 if not os.path.exists(os.path.join(mxd, n)):
491 log(('warning: index %s missing\n' +
492 ' used by %s\n') % (n, mxf))
# Prefer larger, then newer, midxes so the fewest files cover all idxs.
500 midxl.sort(key=lambda ix:
501 (-len(ix), -xstat.stat(ix.name).st_mtime))
504 for sub in ix.idxnames:
505 found = d.get(os.path.join(self.dir, sub))
506 if not found or isinstance(found, PackIdx):
507 # doesn't exist, or exists but not in a midx
512 for name in ix.idxnames:
513 d[os.path.join(self.dir, name)] = ix
514 elif not ix.force_keep:
515 debug1('midx: removing redundant: %s\n'
516 % os.path.basename(ix.name))
519 for full in glob.glob(os.path.join(self.dir,'*.idx')):
523 except GitError as e:
527 bfull = os.path.join(self.dir, 'bup.bloom')
528 if self.bloom is None and os.path.exists(bfull):
529 self.bloom = bloom.ShaBloom(bfull)
530 self.packs = list(set(d.values()))
531 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
# Only trust the bloom filter if it covers at least everything we have.
532 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
536 debug1('PackIdxList: using %d index%s.\n'
537 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
540 """Insert an additional object in the list."""
544 def open_idx(filename):
# Open a pack index, dispatching on filename extension and header magic
# to PackIdxV2, PackIdxV1, or midx.PackMidx.
545 if filename.endswith('.idx'):
546 f = open(filename, 'rb')
548 if header[0:4] == '\377tOc':
549 version = struct.unpack('!I', header[4:8])[0]
551 return PackIdxV2(filename, f)
553 raise GitError('%s: expected idx file version 2, got %d'
554 % (filename, version))
# v1 files have no magic; they begin directly with the fanout table.
555 elif len(header) == 8 and header[0:4] < '\377tOc':
556 return PackIdxV1(filename, f)
558 raise GitError('%s: unrecognized idx file header' % filename)
559 elif filename.endswith('.midx'):
560 return midx.PackMidx(filename)
562 raise GitError('idx filenames must end with .idx or .midx')
565 def idxmerge(idxlist, final_progress=True):
566 """Generate a list of all the objects reachable in a PackIdxList."""
# Progress callbacks passed through to merge_iter; pfinal only fires
# when final_progress is set (check is on an elided line).
567 def pfunc(count, total):
568 qprogress('Reading indexes: %.2f%% (%d/%d)\r'
569 % (count*100.0/total, count, total))
570 def pfinal(count, total):
572 progress('Reading indexes: %.2f%% (%d/%d), done.\n'
573 % (100, total, total))
574 return merge_iter(idxlist, 10024, pfunc, pfinal)
577 def _make_objcache():
# Default objcache factory for PackWriter: every idx in the repo's pack dir.
578 return PackIdxList(repo('objects/pack'))
580 # bup-gc assumes that it can disable all PackWriter activities
581 # (bloom/midx/cache) via the constructor and close() arguments.
584 """Writes Git objects inside a pack file."""
585 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
586 run_midx=True, on_pack_finish=None):
# objcache_maker may be falsy to disable existence checks entirely
# (bup-gc relies on this; see the comment above the class).
593 self.objcache_maker = objcache_maker
595 self.compression_level = compression_level
596 self.run_midx=run_midx
# Optional callback invoked with the finished pack's name prefix.
597 self.on_pack_finish = on_pack_finish
604 objdir = dir=repo('objects')
605 fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
607 self.file = os.fdopen(fd, 'w+b')
612 self.parentfd = os.open(objdir, os.O_RDONLY)
618 assert(name.endswith('.pack'))
619 self.filename = name[:-5]
620 self.file.write('PACK\0\0\0\2\0\0\0\0')
621 self.idx = list(list() for i in xrange(256))
623 def _raw_write(self, datalist, sha):
626 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
627 # the file never has a *partial* blob. So let's make sure it's
628 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
629 # to our hashsplit algorithm.) f.write() does its own buffering,
630 # but that's okay because we'll flush it in _end().
631 oneblob = ''.join(datalist)
# Python 2 three-argument raise: re-raise as GitError keeping the traceback.
635 raise GitError, e, sys.exc_info()[2]
# Mask to keep crc32 non-negative across Python 2 versions.
637 crc = zlib.crc32(oneblob) & 0xffffffff
638 self._update_idx(sha, crc, nw)
643 def _update_idx(self, sha, crc, size):
# Record (sha, crc, start-offset), bucketed by the sha's first byte so
# _write_pack_idx_v2 can build the 256-entry fanout directly.
646 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
648 def _write(self, sha, type, content):
# NOTE(review): lines elided here; sha appears to be recomputed under
# some condition (e.g. when not supplied) — confirm against the full file.
652 sha = calc_hash(type, content)
653 size, crc = self._raw_write(_encode_packobj(type, content,
654 self.compression_level),
# Roll over to a fresh pack once the size or object-count limit is hit.
656 if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
660 def breakpoint(self):
661 """Clear byte and object counts and return the last processed id."""
# Finish the current pack (honoring run_midx) and reset for the next one.
662 id = self._end(self.run_midx)
663 self.outbytes = self.count = 0
666 def _require_objcache(self):
# Lazily build the object cache; error out if caching was disabled.
667 if self.objcache is None and self.objcache_maker:
668 self.objcache = self.objcache_maker()
669 if self.objcache is None:
# (the raise statement's first line is elided in this dump)
671 "PackWriter not opened or can't check exists w/o objcache")
673 def exists(self, id, want_source=False):
674 """Return non-empty if an object is found in the object cache."""
# Delegates to the PackIdxList built by _require_objcache().
675 self._require_objcache()
676 return self.objcache.exists(id, want_source=want_source)
678 def write(self, sha, type, content):
679 """Write an object to the pack file. Fails if sha exists()."""
# Unconditional write; use maybe_write() for existence-checked writes.
680 self._write(sha, type, content)
682 def maybe_write(self, type, content):
683 """Write an object to the pack file if not present and return its id."""
684 sha = calc_hash(type, content)
685 if not self.exists(sha):
686 self.write(sha, type, content)
687 self._require_objcache()
# Remember the sha locally so duplicates later in this run are skipped.
688 self.objcache.add(sha)
691 def new_blob(self, blob):
692 """Create a blob object in the pack with the supplied content."""
# Dedups via maybe_write; returns the object's id.
693 return self.maybe_write('blob', blob)
695 def new_tree(self, shalist):
696 """Create a tree object in the pack."""
# shalist is a list of (mode, name, binary-sha) tuples; see tree_encode.
697 content = tree_encode(shalist)
698 return self.maybe_write('tree', content)
700 def new_commit(self, tree, parent,
701 author, adate_sec, adate_tz,
702 committer, cdate_sec, cdate_tz,
704 """Create a commit object in the pack. The date_sec values must be
705 epoch-seconds, and if a tz is None, the local timezone is assumed."""
707 adate_str = _git_date_str(adate_sec, adate_tz)
709 adate_str = _local_git_date_str(adate_sec)
711 cdate_str = _git_date_str(cdate_sec, cdate_tz)
713 cdate_str = _local_git_date_str(cdate_sec)
# Assemble the commit body line by line; tree/parent are binary shas
# and get hex-encoded here. Each header is optional.
715 if tree: l.append('tree %s' % tree.encode('hex'))
716 if parent: l.append('parent %s' % parent.encode('hex'))
717 if author: l.append('author %s %s' % (author, adate_str))
718 if committer: l.append('committer %s %s' % (committer, cdate_str))
721 return self.maybe_write('commit', '\n'.join(l))
724 """Remove the pack file from disk."""
733 os.unlink(self.filename + '.pack')
740 def _end(self, run_midx=True):
742 if not f: return None
749 # update object count
751 cp = struct.pack('!i', self.count)
755 # calculate the pack sha1sum
758 for b in chunkyreader(f):
760 packbin = sum.digest()
762 fdatasync(f.fileno())
766 obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
768 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
769 if os.path.exists(self.filename + '.map'):
770 os.unlink(self.filename + '.map')
771 os.rename(self.filename + '.pack', nameprefix + '.pack')
772 os.rename(self.filename + '.idx', nameprefix + '.idx')
774 os.fsync(self.parentfd)
776 os.close(self.parentfd)
779 auto_midx(repo('objects/pack'))
781 if self.on_pack_finish:
782 self.on_pack_finish(nameprefix)
786 def close(self, run_midx=True):
787 """Close the pack file and move it to its definitive path."""
# Thin wrapper: returns whatever _end() returns.
788 return self._end(run_midx=run_midx)
790 def _write_pack_idx_v2(self, filename, idx, packbin):
# Count entries whose pack offset needs the 64-bit overflow table.
793 for entry in section:
794 if entry[2] >= 2**31:
797 # Length: header + fan-out + shas-and-crcs + overflow-offsets
798 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
800 idx_f = open(filename, 'w+b')
802 idx_f.truncate(index_len)
803 fdatasync(idx_f.fileno())
# The C helper fills the mmap'ed idx file in place for speed.
804 idx_map = mmap_readwrite(idx_f, close=False)
806 count = _helpers.write_idx(filename, idx_map, idx, self.count)
807 assert(count == self.count)
# Reopen to compute checksums: skip header+fanout, hash the sorted sha
# list to derive the pack's name, then append the idx's own sha1 trailer.
814 idx_f = open(filename, 'a+b')
819 b = idx_f.read(8 + 4*256)
822 obj_list_sum = Sha1()
823 for b in chunkyreader(idx_f, 20*self.count):
825 obj_list_sum.update(b)
826 namebase = obj_list_sum.hexdigest()
828 for b in chunkyreader(idx_f):
830 idx_f.write(idx_sum.digest())
831 fdatasync(idx_f.fileno())
837 def _gitenv(repo_dir = None):
# Sets GIT_DIR for child git processes. NOTE(review): callers pass the
# result as preexec_fn, so this presumably returns a closure performing
# the assignment below — the surrounding lines are elided; confirm.
841 os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
845 def list_refs(refname=None, repo_dir=None,
846 limit_to_heads=False, limit_to_tags=False):
847 """Yield (refname, hash) tuples for all repository refs unless a ref
848 name is specified. Given a ref name, only include tuples for that
849 particular ref. The limits restrict the result items to
850 refs/heads or refs/tags. If both limits are specified, items from
851 both sources will be included.
854 argv = ['git', 'show-ref']
856 argv.append('--heads')
858 argv.append('--tags')
862 p = subprocess.Popen(argv,
863 preexec_fn = _gitenv(repo_dir),
864 stdout = subprocess.PIPE)
865 out = p.stdout.read().strip()
# show-ref exits nonzero when nothing matched; that's not an error here.
866 rv = p.wait() # not fatal
# Each output line is "<hex-sha> <refname>"; yield (name, binary sha).
870 for d in out.split('\n'):
871 (sha, name) = d.split(' ', 1)
872 yield (name, sha.decode('hex'))
875 def read_ref(refname, repo_dir = None):
876 """Get the commit id of the most recent commit made on a given ref."""
877 refs = list_refs(refname, repo_dir=repo_dir, limit_to_heads=True)
# Take up to two matches so ambiguity can be detected (handling elided).
878 l = tuple(islice(refs, 2))
886 def rev_list(ref, count=None, repo_dir=None):
887 """Generate a list of reachable commits in reverse chronological order.
889 This generator walks through commits, from child to parent, that are
890 reachable via the specified ref and yields a series of tuples of the form
893 If count is a non-zero integer, limit the number of commits to "count"
# Refuse refs that look like options, to avoid argument injection.
896 assert(not ref.startswith('-'))
899 opts += ['-n', str(atoi(count))]
# '--' terminates options so ref can't be misparsed as a path.
900 argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
901 p = subprocess.Popen(argv,
902 preexec_fn = _gitenv(repo_dir),
903 stdout = subprocess.PIPE)
# Output alternates "commit <hex>" lines with the %at (author time) lines.
907 if s.startswith('commit '):
908 commit = s[7:].decode('hex')
912 rv = p.wait() # not fatal
914 raise GitError, 'git rev-list returned error %d' % rv
917 def get_commit_dates(refs, repo_dir=None):
918 """Get the dates for the specified commit refs. For now, every unique
919 string in refs must resolve to a different commit or this
920 function will fail."""
# Returns the author_sec (epoch seconds) for each ref, in input order.
923 commit = get_commit_items(ref, cp(repo_dir))
924 result.append(commit.author_sec)
928 def rev_parse(committish, repo_dir=None):
929 """Resolve the full hash for 'committish', if it exists.
931 Should be roughly equivalent to 'git rev-parse'.
933 Returns the hex value of the hash if it is found, None if 'committish' does
934 not correspond to anything.
# Try resolving as a ref first, then as a literal 40-char hex object id
# checked against the pack indexes.
936 head = read_ref(committish, repo_dir=repo_dir)
938 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
941 pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
943 if len(committish) == 40:
945 hash = committish.decode('hex')
955 def update_ref(refname, newval, oldval, repo_dir=None):
956 """Update a repository reference."""
# Only branch heads and tags may be updated; newval/oldval are binary
# shas and get hex-encoded for the git command line.
959 assert(refname.startswith('refs/heads/') \
960 or refname.startswith('refs/tags/'))
961 p = subprocess.Popen(['git', 'update-ref', refname,
962 newval.encode('hex'), oldval.encode('hex')],
963 preexec_fn = _gitenv(repo_dir))
964 _git_wait('git update-ref', p)
967 def delete_ref(refname, oldvalue=None):
968 """Delete a repository reference (see git update-ref(1))."""
969 assert(refname.startswith('refs/'))
# When oldvalue is given, git verifies the ref still has that value
# before deleting it.
970 oldvalue = [] if not oldvalue else [oldvalue]
971 p = subprocess.Popen(['git', 'update-ref', '-d', refname] + oldvalue,
972 preexec_fn = _gitenv())
973 _git_wait('git update-ref', p)
976 def guess_repo(path=None):
977 """Set the path value in the global variable "repodir".
978 This makes bup look for an existing bup repository, but not fail if a
979 repository doesn't exist. Usually, if you are interacting with a bup
980 repository, you would not be calling this function but using
# When no explicit path is given, BUP_DIR wins over the ~/.bup default.
987 repodir = os.environ.get('BUP_DIR')
989 repodir = os.path.expanduser('~/.bup')
992 def init_repo(path=None):
993 """Create the Git bare repository for bup in a given path."""
995 d = repo() # appends a / to the path
996 parent = os.path.dirname(os.path.dirname(d))
997 if parent and not os.path.exists(parent):
998 raise GitError('parent directory "%s" does not exist\n' % parent)
999 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1000 raise GitError('"%s" exists but is not a directory\n' % d)
1001 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1002 preexec_fn = _gitenv())
1003 _git_wait('git init', p)
1004 # Force the index version configuration in order to ensure bup works
1005 # regardless of the version of the installed Git binary.
1006 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1007 stdout=sys.stderr, preexec_fn = _gitenv())
1008 _git_wait('git config', p)
# Enable reflogs so ref updates leave a recoverable history.
1010 p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
1011 stdout=sys.stderr, preexec_fn = _gitenv())
1012 _git_wait('git config', p)
1015 def check_repo_or_die(path=None):
1016 """Make sure a bup repository exists, and abort if not.
1017 If the path to a particular repository was not specified, this function
1018 initializes the default repository automatically.
# Probing objects/pack/. is a cheap way to tell a real repo exists.
1022 os.stat(repo('objects/pack/.'))
1023 except OSError as e:
1024 if e.errno == errno.ENOENT:
1025 log('error: %r is not a bup repository; run "bup init"\n'
1029 log('error: %s\n' % e)
1035 """Get Git's version and ensure a usable version is installed.
1037 The returned version is formatted as an ordered tuple with each position
1038 representing a digit in the version tag. For example, the following tuple
1039 would represent version 1.6.6.9:
1041 ('1', '6', '6', '9')
1045 p = subprocess.Popen(['git', '--version'],
1046 stdout=subprocess.PIPE)
1047 gvs = p.stdout.read()
1048 _git_wait('git --version', p)
1049 m = re.match(r'git version (\S+.\S+)', gvs)
1051 raise GitError('git --version weird output: %r' % gvs)
1052 _ver = tuple(m.group(1).split('.'))
1053 needed = ('1','5', '3', '1')
1055 raise GitError('git version %s or higher is required; you have %s'
1056 % ('.'.join(needed), '.'.join(_ver)))
1060 def _git_wait(cmd, p):
# Wait for subprocess p and raise GitError (naming cmd) on nonzero exit.
1063 raise GitError('%s returned %d' % (cmd, rv))
1066 def _git_capture(argv):
# Run argv with GIT_DIR set and return its stdout (read on an elided line).
1067 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
1069 _git_wait(repr(argv), p)
1073 class _AbortableIter:
# Iterator wrapper that invokes an abort callback when iteration is
# interrupted, so the underlying cat-file pipe can be resynchronized.
1074 def __init__(self, it, onabort = None):
1076 self.onabort = onabort
1084 return self.it.next()
1085 except StopIteration as e:
1093 """Abort iteration and call the abortion callback, if needed."""
1103 class MissingObject(KeyError):
# Raised when a requested object id is not present in the repository;
# `id` is the binary sha (hex-encoded only for the message).
1104 def __init__(self, id):
1106 KeyError.__init__(self, 'object %r is missing' % id.encode('hex'))
1111 """Link to 'git cat-file' that is used to retrieve blob data."""
1112 def __init__(self, repo_dir = None):
1114 self.repo_dir = repo_dir
# git older than 1.5.6 lacks a usable 'cat-file --batch'; fall back to
# one subprocess per object (the slow path).
1115 wanted = ('1','5','6')
1118 log('warning: git version < %s; bup will be slow.\n'
1121 self.get = self._slow_get
# Fast path: a single long-lived 'git cat-file --batch' subprocess.
1123 self.p = self.inprogress = None
1124 self.get = self._fast_get
1128 self.p.stdout.close()
1129 self.p.stdin.close()
1131 self.inprogress = None
1135 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1136 stdin=subprocess.PIPE,
1137 stdout=subprocess.PIPE,
1140 preexec_fn = _gitenv(self.repo_dir))
1142 def _fast_get(self, id):
# (Re)start the batch subprocess if it was never started or has died.
1143 if not self.p or self.p.poll() != None:
1146 poll_result = self.p.poll()
1147 assert(poll_result == None)
1149 log('_fast_get: opening %r while %r is open\n'
1150 % (id, self.inprogress))
1151 assert(not self.inprogress)
# Reject ids that could corrupt the line-oriented batch protocol.
1152 assert(id.find('\n') < 0)
1153 assert(id.find('\r') < 0)
1154 assert(not id.startswith('-'))
1155 self.inprogress = id
1156 self.p.stdin.write('%s\n' % id)
1157 self.p.stdin.flush()
# Batch header is "<sha> <type> <size>" or "<id> missing".
1158 hdr = self.p.stdout.readline()
1159 if hdr.endswith(' missing\n'):
1160 self.inprogress = None
1161 raise MissingObject(id.decode('hex'))
1162 spl = hdr.split(' ')
1163 if len(spl) != 3 or len(spl[0]) != 40:
1164 raise GitError('expected blob, got %r' % spl)
1165 (hex, type, size) = spl
# _AbortableIter lets an interrupted read resynchronize the pipe.
1167 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1168 onabort = self._abort)
# cat-file terminates each object with a newline; consume it.
1173 readline_result = self.p.stdout.readline()
1174 assert(readline_result == '\n')
1175 self.inprogress = None
1176 except Exception as e:
1180 def _slow_get(self, id):
1181 assert(id.find('\n') < 0)
1182 assert(id.find('\r') < 0)
1183 assert(id[0] != '-')
# Slow path for old git: one subprocess to get the type, another for
# the content itself.
1184 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1187 p = subprocess.Popen(['git', 'cat-file', type, id],
1188 stdout=subprocess.PIPE,
1189 preexec_fn = _gitenv(self.repo_dir))
1190 for blob in chunkyreader(p.stdout):
1192 _git_wait('git cat-file', p)
1194 def _join(self, it):
# Recursively expand an object into the contents of all reachable blobs.
1199 elif type == 'tree':
1200 treefile = ''.join(it)
1201 for (mode, name, sha) in tree_decode(treefile):
1202 for blob in self.join(sha.encode('hex')):
1204 elif type == 'commit':
# A commit contributes the blobs of its root tree; "tree <hex>" is
# always the first header line of a commit object.
1205 treeline = ''.join(it).split('\n')[0]
1206 assert(treeline.startswith('tree '))
1207 for blob in self.join(treeline[5:]):
1210 raise GitError('invalid object type %r: expected blob/tree/commit'
1214 """Generate a list of the content of all blobs that can be reached
1215 from an object. The hash given in 'id' must point to a blob, a tree
1216 or a commit. The content of all blobs that can be seen from trees or
1217 commits will be added to the list.
1220 for d in self._join(self.get(id)):
1222 except StopIteration:
1228 def cp(repo_dir=None):
1229 """Create a CatPipe object or reuse the already existing one."""
# One CatPipe is cached per absolute repo path in the module-global _cp.
1233 repo_dir = os.path.abspath(repo_dir)
1234 cp = _cp.get(repo_dir)
1236 cp = CatPipe(repo_dir)
1241 def tags(repo_dir = None):
1242 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1244 for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
# Strip the 'refs/tags/' prefix to get the bare tag name.
1245 assert(n.startswith('refs/tags/'))
1249 tags[c].append(name) # more than one tag can point at 'c'
1253 WalkItem = namedtuple('WalkItem', ['id', 'type', 'mode',
1254 'path', 'chunk_path', 'data'])
1255 # The path is the mangled path, and if an item represents a fragment
1256 # of a chunked file, the chunk_path will be the chunked subtree path
1257 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1258 # chunked file will have a chunk_path of ['']. So some chunk subtree
1259 # of the file '/foo/bar/baz' might look like this:
1261 # item.path = ['foo', 'bar', 'baz.bup']
1262 # item.chunk_path = ['', '2d3115e', '016b097']
1263 # item.type = 'tree'
1267 def walk_object(cat_pipe, id,
1270 """Yield everything reachable from id via cat_pipe as a WalkItem,
1271 stopping whenever stop_at(id) returns true. Throw MissingObject
1272 if a hash encountered is missing from the repository.
1275 # Maintain the pending stack on the heap to avoid stack overflow
# Each pending entry is (hex id, parent_path, chunk_path, mode).
1276 pending = [(id, [], [], None)]
1278 id, parent_path, chunk_path, mode = pending.pop()
1279 if stop_at and stop_at(id):
1282 item_it = cat_pipe.get(id) # FIXME: use include_data
1283 type = item_it.next()
1284 if type not in ('blob', 'commit', 'tree'):
1285 raise Exception('unexpected repository object type %r' % type)
1287 # FIXME: set the mode based on the type when the mode is None
1288 if type == 'blob' and not include_data:
1289 # Dump data until we can ask cat_pipe not to fetch it
1290 for ignored in item_it:
1294 data = ''.join(item_it)
1296 yield WalkItem(id=id, type=type,
1297 chunk_path=chunk_path, path=parent_path,
1299 data=(data if include_data else None))
# Commits push their parents and then their root tree for traversal.
1301 if type == 'commit':
1302 commit_items = parse_commit(data)
1303 for pid in commit_items.parents:
1304 pending.append((pid, parent_path, chunk_path, mode))
1305 pending.append((commit_items.tree, parent_path, chunk_path,
1306 hashsplit.GIT_MODE_TREE))
1307 elif type == 'tree':
1308 for mode, name, ent_id in tree_decode(data):
1309 demangled, bup_type = demangle_name(name, mode)
# Inside a chunked file, subtree names extend chunk_path rather than
# the real filesystem path; a new chunked file resets chunk_path to [''].
1311 sub_path = parent_path
1312 sub_chunk_path = chunk_path + [name]
1314 sub_path = parent_path + [name]
1315 if bup_type == BUP_CHUNKED:
1316 sub_chunk_path = ['']
1318 sub_chunk_path = chunk_path
1319 pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,