lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   7 from collections import namedtuple
   8 from itertools import islice
   9
  10 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  11 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  12                          fdatasync,
  13                          hostname, localtime, log, merge_iter,
  14                          mmap_read, mmap_readwrite,
  15                          progress, qprogress, stat_if_exists,
  16                          unlink, username, userfullname,
  17                          utc_offset_str)
  18
  19
# Thresholds checked by PackWriter._write(): once either is reached the
# current pack is finished via breakpoint().
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object

verbose = 0  # when nonzero, PackWriter._write() logs a '>' per object
ignore_midx = 0  # when nonzero, PackIdxList.refresh() always skips .midx files
repodir = None  # The default repository, once initialized

# Object type name <-> numeric type code used in the pack object header
# (see _encode_packobj() / _decode_packobj()).
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Lookup statistics, updated by PackIdx._idx_from_hash() and
# PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  32
  33
  34 class GitError(Exception):
  35     pass
  36
  37
  38 def parse_tz_offset(s):
  39     """UTC offset in seconds."""
  40     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  41     if s[0] == '-':
  42         return - tz_off
  43     return tz_off
  44
  45
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
#
# Building blocks for _commit_rx, which parse_commit() uses to split a
# commit object into its header fields and message.
# A name/mail string may not start or end with these characters...
_start_end_char = r'[^ .,:;<>"\'\0\n]'
# ...and may not contain NUL, newline or angle brackets anywhere.
_content_char = r'[^\0\n<>]'
# Either one or two start/end characters, or a longer string bracketed by them.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Sign, two hour digits, two minute digits (first minute digit 0-5).
_tz_rx = r'[-+]\d\d[0-5]\d'
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Named groups: tree, parents, author_name, author_mail, asec, atz,
# committer_name, committer_mail, csec, ctz, message.
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Extracts each parent id from the text captured by the 'parents' group.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')


# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
# The *_offset fields hold the value returned by parse_tz_offset() (seconds).
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
  72
  73 def parse_commit(content):
  74     commit_match = re.match(_commit_rx, content)
  75     if not commit_match:
  76         raise Exception('cannot parse commit %r' % content)
  77     matches = commit_match.groupdict()
  78     return CommitInfo(tree=matches['tree'],
  79                       parents=re.findall(_parent_hash_rx, matches['parents']),
  80                       author_name=matches['author_name'],
  81                       author_mail=matches['author_mail'],
  82                       author_sec=int(matches['asec']),
  83                       author_offset=parse_tz_offset(matches['atz']),
  84                       committer_name=matches['committer_name'],
  85                       committer_mail=matches['committer_mail'],
  86                       committer_sec=int(matches['csec']),
  87                       committer_offset=parse_tz_offset(matches['ctz']),
  88                       message=matches['message'])
  89
  90
  91 def get_commit_items(id, cp):
  92     commit_it = cp.get(id)
  93     assert(commit_it.next() == 'commit')
  94     commit_content = ''.join(commit_it)
  95     return parse_commit(commit_content)
  96
  97
  98 def _local_git_date_str(epoch_sec):
  99     return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 100
 101
 102 def _git_date_str(epoch_sec, tz_offset_sec):
 103     offs =  tz_offset_sec // 60
 104     return '%d %s%02d%02d' \
 105         % (epoch_sec,
 106            '+' if offs >= 0 else '-',
 107            abs(offs) // 60,
 108            abs(offs) % 60)
 109
 110
 111 def repo(sub = '', repo_dir=None):
 112     """Get the path to the git repository or one of its subdirectories."""
 113     global repodir
 114     repo_dir = repo_dir or repodir
 115     if not repo_dir:
 116         raise GitError('You should call check_repo_or_die()')
 117
 118     # If there's a .git subdirectory, then the actual repo is in there.
 119     gd = os.path.join(repo_dir, '.git')
 120     if os.path.exists(gd):
 121         repodir = gd
 122
 123     return os.path.join(repo_dir, sub)
 124
 125
 126 def shorten_hash(s):
 127     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 128                   r'\1\2*\3', s)
 129
 130
 131 def repo_rel(path):
 132     full = os.path.abspath(path)
 133     fullrepo = os.path.abspath(repo(''))
 134     if not fullrepo.endswith('/'):
 135         fullrepo += '/'
 136     if full.startswith(fullrepo):
 137         path = full[len(fullrepo):]
 138     if path.startswith('index-cache/'):
 139         path = path[len('index-cache/'):]
 140     return shorten_hash(path)
 141
 142
 143 def all_packdirs():
 144     paths = [repo('objects/pack')]
 145     paths += glob.glob(repo('index-cache/*/.'))
 146     return paths
 147
 148
 149 def auto_midx(objdir):
 150     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 151     try:
 152         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 153     except OSError as e:
 154         # make sure 'args' gets printed to help with debugging
 155         add_error('%r: exception: %s' % (args, e))
 156         raise
 157     if rv:
 158         add_error('%r: returned %d' % (args, rv))
 159
 160     args = [path.exe(), 'bloom', '--dir', objdir]
 161     try:
 162         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 163     except OSError as e:
 164         # make sure 'args' gets printed to help with debugging
 165         add_error('%r: exception: %s' % (args, e))
 166         raise
 167     if rv:
 168         add_error('%r: returned %d' % (args, rv))
 169
 170
 171 def mangle_name(name, mode, gitmode):
 172     """Mangle a file name to present an abstract name for segmented files.
 173     Mangled file names will have the ".bup" extension added to them. If a
 174     file's name already ends with ".bup", a ".bupl" extension is added to
 175     disambiguate normal files from segmented ones.
 176     """
 177     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 178         assert(stat.S_ISDIR(gitmode))
 179         return name + '.bup'
 180     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 181         return name + '.bupl'
 182     else:
 183         return name
 184
 185
 186 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 187 def demangle_name(name, mode):
 188     """Remove name mangling from a file name, if necessary.
 189
 190     The return value is a tuple (demangled_filename,mode), where mode is one of
 191     the following:
 192
 193     * BUP_NORMAL  : files that should be read as-is from the repository
 194     * BUP_CHUNKED : files that were chunked and need to be reassembled
 195
 196     For more information on the name mangling algorithm, see mangle_name()
 197     """
 198     if name.endswith('.bupl'):
 199         return (name[:-5], BUP_NORMAL)
 200     elif name.endswith('.bup'):
 201         return (name[:-4], BUP_CHUNKED)
 202     elif name.endswith('.bupm'):
 203         return (name[:-5],
 204                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 205     else:
 206         return (name, BUP_NORMAL)
 207
 208
 209 def calc_hash(type, content):
 210     """Calculate some content's hash in the Git fashion."""
 211     header = '%s %d\0' % (type, len(content))
 212     sum = Sha1(header)
 213     sum.update(content)
 214     return sum.digest()
 215
 216
 217 def shalist_item_sort_key(ent):
 218     (mode, name, id) = ent
 219     assert(mode+0 == mode)
 220     if stat.S_ISDIR(mode):
 221         return name + '/'
 222     else:
 223         return name
 224
 225
 226 def tree_encode(shalist):
 227     """Generate a git tree object from (mode,name,hash) tuples."""
 228     shalist = sorted(shalist, key = shalist_item_sort_key)
 229     l = []
 230     for (mode,name,bin) in shalist:
 231         assert(mode)
 232         assert(mode+0 == mode)
 233         assert(name)
 234         assert(len(bin) == 20)
 235         s = '%o %s\0%s' % (mode,name,bin)
 236         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 237         l.append(s)
 238     return ''.join(l)
 239
 240
 241 def tree_decode(buf):
 242     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 243     ofs = 0
 244     while ofs < len(buf):
 245         z = buf.find('\0', ofs)
 246         assert(z > ofs)
 247         spl = buf[ofs:z].split(' ', 1)
 248         assert(len(spl) == 2)
 249         mode,name = spl
 250         sha = buf[z+1:z+1+20]
 251         ofs = z+1+20
 252         yield (int(mode, 8), name, sha)
 253
 254
 255 def _encode_packobj(type, content, compression_level=1):
 256     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 257         raise ValueError('invalid compression level %s' % compression_level)
 258     szout = ''
 259     sz = len(content)
 260     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 261     sz >>= 4
 262     while 1:
 263         if sz: szbits |= 0x80
 264         szout += chr(szbits)
 265         if not sz:
 266             break
 267         szbits = sz & 0x7f
 268         sz >>= 7
 269     z = zlib.compressobj(compression_level)
 270     yield szout
 271     yield z.compress(content)
 272     yield z.flush()
 273
 274
 275 def _encode_looseobj(type, content, compression_level=1):
 276     z = zlib.compressobj(compression_level)
 277     yield z.compress('%s %d\0' % (type, len(content)))
 278     yield z.compress(content)
 279     yield z.flush()
 280
 281
 282 def _decode_looseobj(buf):
 283     assert(buf);
 284     s = zlib.decompress(buf)
 285     i = s.find('\0')
 286     assert(i > 0)
 287     l = s[:i].split(' ')
 288     type = l[0]
 289     sz = int(l[1])
 290     content = s[i+1:]
 291     assert(type in _typemap)
 292     assert(sz == len(content))
 293     return (type, content)
 294
 295
 296 def _decode_packobj(buf):
 297     assert(buf)
 298     c = ord(buf[0])
 299     type = _typermap[(c & 0x70) >> 4]
 300     sz = c & 0x0f
 301     shift = 4
 302     i = 0
 303     while c & 0x80:
 304         i += 1
 305         c = ord(buf[i])
 306         sz |= (c & 0x7f) << shift
 307         shift += 7
 308         if not (c & 0x80):
 309             break
 310     return (type, zlib.decompress(buf[i+1:]))
 311
 312
 313 class PackIdx:
 314     def __init__(self):
 315         assert(0)
 316
 317     def find_offset(self, hash):
 318         """Get the offset of an object inside the index file."""
 319         idx = self._idx_from_hash(hash)
 320         if idx != None:
 321             return self._ofs_from_idx(idx)
 322         return None
 323
 324     def exists(self, hash, want_source=False):
 325         """Return nonempty if the object exists in this index."""
 326         if hash and (self._idx_from_hash(hash) != None):
 327             return want_source and os.path.basename(self.name) or True
 328         return None
 329
 330     def __len__(self):
 331         return int(self.fanout[255])
 332
 333     def _idx_from_hash(self, hash):
 334         global _total_searches, _total_steps
 335         _total_searches += 1
 336         assert(len(hash) == 20)
 337         b1 = ord(hash[0])
 338         start = self.fanout[b1-1] # range -1..254
 339         end = self.fanout[b1] # range 0..255
 340         want = str(hash)
 341         _total_steps += 1  # lookup table is a step
 342         while start < end:
 343             _total_steps += 1
 344             mid = start + (end-start)/2
 345             v = self._idx_to_hash(mid)
 346             if v < want:
 347                 start = mid+1
 348             elif v > want:
 349                 end = mid
 350             else: # got it!
 351                 return mid
 352         return None
 353
 354
 355 class PackIdxV1(PackIdx):
 356     """Object representation of a Git pack index (version 1) file."""
 357     def __init__(self, filename, f):
 358         self.name = filename
 359         self.idxnames = [self.name]
 360         self.map = mmap_read(f)
 361         self.fanout = list(struct.unpack('!256I',
 362                                          str(buffer(self.map, 0, 256*4))))
 363         self.fanout.append(0)  # entry "-1"
 364         nsha = self.fanout[255]
 365         self.sha_ofs = 256*4
 366         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 367
 368     def _ofs_from_idx(self, idx):
 369         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 370
 371     def _idx_to_hash(self, idx):
 372         return str(self.shatable[idx*24+4 : idx*24+24])
 373
 374     def __iter__(self):
 375         for i in xrange(self.fanout[255]):
 376             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 377
 378
 379 class PackIdxV2(PackIdx):
 380     """Object representation of a Git pack index (version 2) file."""
 381     def __init__(self, filename, f):
 382         self.name = filename
 383         self.idxnames = [self.name]
 384         self.map = mmap_read(f)
 385         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 386         self.fanout = list(struct.unpack('!256I',
 387                                          str(buffer(self.map, 8, 256*4))))
 388         self.fanout.append(0)  # entry "-1"
 389         nsha = self.fanout[255]
 390         self.sha_ofs = 8 + 256*4
 391         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 392         self.ofstable = buffer(self.map,
 393                                self.sha_ofs + nsha*20 + nsha*4,
 394                                nsha*4)
 395         self.ofs64table = buffer(self.map,
 396                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 397
 398     def _ofs_from_idx(self, idx):
 399         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 400         if ofs & 0x80000000:
 401             idx64 = ofs & 0x7fffffff
 402             ofs = struct.unpack('!Q',
 403                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 404         return ofs
 405
 406     def _idx_to_hash(self, idx):
 407         return str(self.shatable[idx*20:(idx+1)*20])
 408
 409     def __iter__(self):
 410         for i in xrange(self.fanout[255]):
 411             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 412
 413
# Number of live PackIdxList instances; __init__ asserts it is 0 before
# incrementing it, so only one instance may exist at a time.
_mpi_count = 0
class PackIdxList:
    """Searchable collection of the index objects found in one directory.

    self.packs holds the open index objects (.idx files and, unless
    skipped, .midx files), self.also holds hashes added via add(), and
    self.bloom is the optional 'bup.bloom' filter for the directory.
    """
    def __init__(self, dir):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        # Merged iteration over every index in self.packs (see idxmerge()).
        return iter(idxmerge(self.packs))

    def __len__(self):
        # Sum of the sizes of the individual indexes.
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Hashes registered through add() return True.  When the bloom filter
        is in use and reports the hash absent, None is returned without
        searching the indexes.  Otherwise each index is asked in turn and
        the first nonempty answer is returned; None means not found.
        """
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # bloom says "maybe present": stop consulting it until a
                # full search of the indexes misses again
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # nothing found in any index: consult the bloom filter next time
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file path to the index object that covers it; it starts
        # from the currently open indexes (minus midxes when skipping them).
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # every .idx named by an already-open midx is covered by it
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # open midx files not yet known; delete those that refer to
                # a missing .idx
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # consider the largest midx first, newest first among equals
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        ix.close()
                        unlink(ix.name)
            # open every .idx that is not already covered; an unreadable
            # one is reported through add_error() and skipped
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # several paths may map to the same object, so de-duplicate,
            # then order the indexes from largest to smallest
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # only use the bloom filter when it is valid and holds at least
            # as many entries as the indexes do
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 543
 544
 545 def open_idx(filename):
 546     if filename.endswith('.idx'):
 547         f = open(filename, 'rb')
 548         header = f.read(8)
 549         if header[0:4] == '\377tOc':
 550             version = struct.unpack('!I', header[4:8])[0]
 551             if version == 2:
 552                 return PackIdxV2(filename, f)
 553             else:
 554                 raise GitError('%s: expected idx file version 2, got %d'
 555                                % (filename, version))
 556         elif len(header) == 8 and header[0:4] < '\377tOc':
 557             return PackIdxV1(filename, f)
 558         else:
 559             raise GitError('%s: unrecognized idx file header' % filename)
 560     elif filename.endswith('.midx'):
 561         return midx.PackMidx(filename)
 562     else:
 563         raise GitError('idx filenames must end with .idx or .midx')
 564
 565
 566 def idxmerge(idxlist, final_progress=True):
 567     """Generate a list of all the objects reachable in a PackIdxList."""
 568     def pfunc(count, total):
 569         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 570                   % (count*100.0/total, count, total))
 571     def pfinal(count, total):
 572         if final_progress:
 573             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 574                      % (100, total, total))
 575     return merge_iter(idxlist, 10024, pfunc, pfinal)
 576
 577
 578 def _make_objcache():
 579     return PackIdxList(repo('objects/pack'))
 580
 581 # bup-gc assumes that it can disable all PackWriter activities
 582 # (bloom/midx/cache) via the constructor and close() arguments.
 583
 584 class PackWriter:
 585     """Writes Git objects inside a pack file."""
 586     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 587                  run_midx=True, on_pack_finish=None):
 588         self.file = None
 589         self.parentfd = None
 590         self.count = 0
 591         self.outbytes = 0
 592         self.filename = None
 593         self.idx = None
 594         self.objcache_maker = objcache_maker
 595         self.objcache = None
 596         self.compression_level = compression_level
 597         self.run_midx=run_midx
 598         self.on_pack_finish = on_pack_finish
 599
 600     def __del__(self):
 601         self.close()
 602
 603     def _open(self):
 604         if not self.file:
 605             objdir = dir=repo('objects')
 606             fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
 607             try:
 608                 self.file = os.fdopen(fd, 'w+b')
 609             except:
 610                 os.close(fd)
 611                 raise
 612             try:
 613                 self.parentfd = os.open(objdir, os.O_RDONLY)
 614             except:
 615                 f = self.file
 616                 self.file = None
 617                 f.close()
 618                 raise
 619             assert(name.endswith('.pack'))
 620             self.filename = name[:-5]
 621             self.file.write('PACK\0\0\0\2\0\0\0\0')
 622             self.idx = list(list() for i in xrange(256))
 623
 624     def _raw_write(self, datalist, sha):
 625         self._open()
 626         f = self.file
 627         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 628         # the file never has a *partial* blob.  So let's make sure it's
 629         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 630         # to our hashsplit algorithm.)  f.write() does its own buffering,
 631         # but that's okay because we'll flush it in _end().
 632         oneblob = ''.join(datalist)
 633         try:
 634             f.write(oneblob)
 635         except IOError as e:
 636             raise GitError, e, sys.exc_info()[2]
 637         nw = len(oneblob)
 638         crc = zlib.crc32(oneblob) & 0xffffffff
 639         self._update_idx(sha, crc, nw)
 640         self.outbytes += nw
 641         self.count += 1
 642         return nw, crc
 643
 644     def _update_idx(self, sha, crc, size):
 645         assert(sha)
 646         if self.idx:
 647             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 648
 649     def _write(self, sha, type, content):
 650         if verbose:
 651             log('>')
 652         if not sha:
 653             sha = calc_hash(type, content)
 654         size, crc = self._raw_write(_encode_packobj(type, content,
 655                                                     self.compression_level),
 656                                     sha=sha)
 657         if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
 658             self.breakpoint()
 659         return sha
 660
 661     def breakpoint(self):
 662         """Clear byte and object counts and return the last processed id."""
 663         id = self._end(self.run_midx)
 664         self.outbytes = self.count = 0
 665         return id
 666
 667     def _require_objcache(self):
 668         if self.objcache is None and self.objcache_maker:
 669             self.objcache = self.objcache_maker()
 670         if self.objcache is None:
 671             raise GitError(
 672                     "PackWriter not opened or can't check exists w/o objcache")
 673
 674     def exists(self, id, want_source=False):
 675         """Return non-empty if an object is found in the object cache."""
 676         self._require_objcache()
 677         return self.objcache.exists(id, want_source=want_source)
 678
 679     def just_write(self, sha, type, content):
 680         """Write an object to the pack file, bypassing the objcache.  Fails if
 681         sha exists()."""
 682         self._write(sha, type, content)
 683
 684     def maybe_write(self, type, content):
 685         """Write an object to the pack file if not present and return its id."""
 686         sha = calc_hash(type, content)
 687         if not self.exists(sha):
 688             self.just_write(sha, type, content)
 689             self._require_objcache()
 690             self.objcache.add(sha)
 691         return sha
 692
 693     def new_blob(self, blob):
 694         """Create a blob object in the pack with the supplied content."""
 695         return self.maybe_write('blob', blob)
 696
 697     def new_tree(self, shalist):
 698         """Create a tree object in the pack."""
 699         content = tree_encode(shalist)
 700         return self.maybe_write('tree', content)
 701
 702     def new_commit(self, tree, parent,
 703                    author, adate_sec, adate_tz,
 704                    committer, cdate_sec, cdate_tz,
 705                    msg):
 706         """Create a commit object in the pack.  The date_sec values must be
 707         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 708         if adate_tz:
 709             adate_str = _git_date_str(adate_sec, adate_tz)
 710         else:
 711             adate_str = _local_git_date_str(adate_sec)
 712         if cdate_tz:
 713             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 714         else:
 715             cdate_str = _local_git_date_str(cdate_sec)
 716         l = []
 717         if tree: l.append('tree %s' % tree.encode('hex'))
 718         if parent: l.append('parent %s' % parent.encode('hex'))
 719         if author: l.append('author %s %s' % (author, adate_str))
 720         if committer: l.append('committer %s %s' % (committer, cdate_str))
 721         l.append('')
 722         l.append(msg)
 723         return self.maybe_write('commit', '\n'.join(l))
 724
 725     def abort(self):
 726         """Remove the pack file from disk."""
 727         f = self.file
 728         if f:
 729             pfd = self.parentfd
 730             self.file = None
 731             self.parentfd = None
 732             self.idx = None
 733             try:
 734                 try:
 735                     os.unlink(self.filename + '.pack')
 736                 finally:
 737                     f.close()
 738             finally:
 739                 if pfd is not None:
 740                     os.close(pfd)
 741
 742     def _end(self, run_midx=True):
 743         f = self.file
 744         if not f: return None
 745         self.file = None
 746         try:
 747             self.objcache = None
 748             idx = self.idx
 749             self.idx = None
 750
 751             # update object count
 752             f.seek(8)
 753             cp = struct.pack('!i', self.count)
 754             assert(len(cp) == 4)
 755             f.write(cp)
 756
 757             # calculate the pack sha1sum
 758             f.seek(0)
 759             sum = Sha1()
 760             for b in chunkyreader(f):
 761                 sum.update(b)
 762             packbin = sum.digest()
 763             f.write(packbin)
 764             fdatasync(f.fileno())
 765         finally:
 766             f.close()
 767
 768         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 769
 770         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 771         if os.path.exists(self.filename + '.map'):
 772             os.unlink(self.filename + '.map')
 773         os.rename(self.filename + '.pack', nameprefix + '.pack')
 774         os.rename(self.filename + '.idx', nameprefix + '.idx')
 775         try:
 776             os.fsync(self.parentfd)
 777         finally:
 778             os.close(self.parentfd)
 779
 780         if run_midx:
 781             auto_midx(repo('objects/pack'))
 782
 783         if self.on_pack_finish:
 784             self.on_pack_finish(nameprefix)
 785
 786         return nameprefix
 787
 788     def close(self, run_midx=True):
 789         """Close the pack file and move it to its definitive path."""
 790         return self._end(run_midx=run_midx)
 791
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write the index for this pack to filename and return its name base.

        idx is iterated as a sequence of sections, each a sequence of
        entries; entry[2] is compared against 2**31 to count the entries
        that need a 64-bit offset slot (presumably entry[2] is the
        object's offset in the pack -- confirm against the code that
        builds idx).  packbin is appended to the index after the tables.

        Returns the hex SHA-1 of the 20 * self.count bytes that follow
        the 8-byte header and the 4 * 256 byte fan-out table.
        """
        # Count the entries whose third field does not fit in 31 bits;
        # each one gets an extra 8 bytes in the index.
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            # Size the file first so it can be mapped and filled in place.
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                # The helper fills the mapping and reports how many
                # objects it wrote; that must match our own count.
                count = _helpers.write_idx(filename, idx_map, idx, self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        # Reopen in append mode: every write below lands at the end of
        # the file, while seek()/read() let us checksum what is there.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            # idx_sum covers the whole file; start with header + fan-out.
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            # The next 20 * count bytes feed both checksums; the digest
            # of just these bytes becomes the returned name base.
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            # Everything that remains (including packbin written above)
            # goes into the whole-file checksum only.
            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            # Append the whole-file checksum as the final bytes.
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()
 837
 838
 839 def _gitenv(repo_dir = None):
 840     if not repo_dir:
 841         repo_dir = repo()
 842     def env():
 843         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 844     return env
 845
 846
 847 def list_refs(refnames=None, repo_dir=None,
 848               limit_to_heads=False, limit_to_tags=False):
 849     """Yield (refname, hash) tuples for all repository refs unless
 850     refnames are specified.  In that case, only include tuples for
 851     those refs.  The limits restrict the result items to refs/heads or
 852     refs/tags.  If both limits are specified, items from both sources
 853     will be included.
 854
 855     """
 856     argv = ['git', 'show-ref']
 857     if limit_to_heads:
 858         argv.append('--heads')
 859     if limit_to_tags:
 860         argv.append('--tags')
 861     argv.append('--')
 862     if refnames:
 863         argv += refnames
 864     p = subprocess.Popen(argv,
 865                          preexec_fn = _gitenv(repo_dir),
 866                          stdout = subprocess.PIPE)
 867     out = p.stdout.read().strip()
 868     rv = p.wait()  # not fatal
 869     if rv:
 870         assert(not out)
 871     if out:
 872         for d in out.split('\n'):
 873             (sha, name) = d.split(' ', 1)
 874             yield (name, sha.decode('hex'))
 875
 876
 877 def read_ref(refname, repo_dir = None):
 878     """Get the commit id of the most recent commit made on a given ref."""
 879     refs = list_refs(refnames=[refname], repo_dir=repo_dir, limit_to_heads=True)
 880     l = tuple(islice(refs, 2))
 881     if l:
 882         assert(len(l) == 1)
 883         return l[0][1]
 884     else:
 885         return None
 886
 887
 888 def rev_list(ref, count=None, repo_dir=None):
 889     """Generate a list of reachable commits in reverse chronological order.
 890
 891     This generator walks through commits, from child to parent, that are
 892     reachable via the specified ref and yields a series of tuples of the form
 893     (date,hash).
 894
 895     If count is a non-zero integer, limit the number of commits to "count"
 896     objects.
 897     """
 898     assert(not ref.startswith('-'))
 899     opts = []
 900     if count:
 901         opts += ['-n', str(atoi(count))]
 902     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 903     p = subprocess.Popen(argv,
 904                          preexec_fn = _gitenv(repo_dir),
 905                          stdout = subprocess.PIPE)
 906     commit = None
 907     for row in p.stdout:
 908         s = row.strip()
 909         if s.startswith('commit '):
 910             commit = s[7:].decode('hex')
 911         else:
 912             date = int(s)
 913             yield (date, commit)
 914     rv = p.wait()  # not fatal
 915     if rv:
 916         raise GitError, 'git rev-list returned error %d' % rv
 917
 918
 919 def get_commit_dates(refs, repo_dir=None):
 920     """Get the dates for the specified commit refs.  For now, every unique
 921        string in refs must resolve to a different commit or this
 922        function will fail."""
 923     result = []
 924     for ref in refs:
 925         commit = get_commit_items(ref, cp(repo_dir))
 926         result.append(commit.author_sec)
 927     return result
 928
 929
 930 def rev_parse(committish, repo_dir=None):
 931     """Resolve the full hash for 'committish', if it exists.
 932
 933     Should be roughly equivalent to 'git rev-parse'.
 934
 935     Returns the hex value of the hash if it is found, None if 'committish' does
 936     not correspond to anything.
 937     """
 938     head = read_ref(committish, repo_dir=repo_dir)
 939     if head:
 940         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 941         return head
 942
 943     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
 944
 945     if len(committish) == 40:
 946         try:
 947             hash = committish.decode('hex')
 948         except TypeError:
 949             return None
 950
 951         if pL.exists(hash):
 952             return hash
 953
 954     return None
 955
 956
 957 def update_ref(refname, newval, oldval, repo_dir=None):
 958     """Update a repository reference."""
 959     if not oldval:
 960         oldval = ''
 961     assert(refname.startswith('refs/heads/') \
 962            or refname.startswith('refs/tags/'))
 963     p = subprocess.Popen(['git', 'update-ref', refname,
 964                           newval.encode('hex'), oldval.encode('hex')],
 965                          preexec_fn = _gitenv(repo_dir))
 966     _git_wait('git update-ref', p)
 967
 968
 969 def delete_ref(refname, oldvalue=None):
 970     """Delete a repository reference (see git update-ref(1))."""
 971     assert(refname.startswith('refs/'))
 972     oldvalue = [] if not oldvalue else [oldvalue]
 973     p = subprocess.Popen(['git', 'update-ref', '-d', refname] + oldvalue,
 974                          preexec_fn = _gitenv())
 975     _git_wait('git update-ref', p)
 976
 977
 978 def guess_repo(path=None):
 979     """Set the path value in the global variable "repodir".
 980     This makes bup look for an existing bup repository, but not fail if a
 981     repository doesn't exist. Usually, if you are interacting with a bup
 982     repository, you would not be calling this function but using
 983     check_repo_or_die().
 984     """
 985     global repodir
 986     if path:
 987         repodir = path
 988     if not repodir:
 989         repodir = os.environ.get('BUP_DIR')
 990         if not repodir:
 991             repodir = os.path.expanduser('~/.bup')
 992
 993
 994 def init_repo(path=None):
 995     """Create the Git bare repository for bup in a given path."""
 996     guess_repo(path)
 997     d = repo()  # appends a / to the path
 998     parent = os.path.dirname(os.path.dirname(d))
 999     if parent and not os.path.exists(parent):
1000         raise GitError('parent directory "%s" does not exist\n' % parent)
1001     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1002         raise GitError('"%s" exists but is not a directory\n' % d)
1003     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1004                          preexec_fn = _gitenv())
1005     _git_wait('git init', p)
1006     # Force the index version configuration in order to ensure bup works
1007     # regardless of the version of the installed Git binary.
1008     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1009                          stdout=sys.stderr, preexec_fn = _gitenv())
1010     _git_wait('git config', p)
1011     # Enable the reflog
1012     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
1013                          stdout=sys.stderr, preexec_fn = _gitenv())
1014     _git_wait('git config', p)
1015
1016
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not.

    Returns normally when <repo>/objects/pack is a directory.  Exits
    with status 15 when neither that path nor the repository top can be
    stat'ed, and with status 14 in every other case.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + '/objects/pack')
    if pack_st and stat.S_ISDIR(pack_st.st_mode):
        return
    # Only look at the top directory when objects/pack is absent.
    if not pack_st and not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %r is not a repository\n' % top)
    sys.exit(14)
1032
1033
_ver = None


def _parse_git_version(gvs):
    """Return the dotted version in 'git --version' output as a tuple of
    strings; raise GitError if the output is not recognised."""
    # The dot is escaped so the capture cannot run across whitespace
    # into a suffix such as " (Apple Git-130)".
    m = re.match(r'git version (\S+\.\S+)', gvs)
    if not m:
        raise GitError('git --version weird output: %r' % gvs)
    return tuple(m.group(1).split('.'))


def _ver_key(parts):
    """Return the leading numeric components of a version tuple as ints,
    stopping at the first component that does not start with a digit."""
    key = []
    for part in parts:
        digits = re.match(r'\d+', part)
        if not digits:
            break
        key.append(int(digits.group(0)))
    return tuple(key)


def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')

    The version is read from 'git --version' on the first call and
    cached in the module-level _ver.  Raises GitError if that output
    cannot be parsed or if the version is older than 1.5.3.1.
    """
    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        _ver = _parse_git_version(gvs)
    needed = ('1','5', '3', '1')
    # Compare numerically: as strings, '10' would sort before '5'.
    if _ver_key(_ver) < _ver_key(needed):
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver
1059
1060
def _git_wait(cmd, p):
    """Wait for process p; raise GitError naming cmd if it exits non-zero."""
    status = p.wait()
    if status == 0:
        return
    raise GitError('%s returned %d' % (cmd, status))
1065
1066
def _git_capture(argv):
    """Run argv with GIT_DIR set for the default repository and return
    everything it wrote to stdout.

    A non-zero exit status is reported by _git_wait(), labelled with
    repr(argv).
    """
    proc = subprocess.Popen(argv,
                            stdout=subprocess.PIPE,
                            preexec_fn = _gitenv())
    captured = proc.stdout.read()
    _git_wait(repr(argv), proc)
    return captured
1072
1073
1074 class _AbortableIter:
1075     def __init__(self, it, onabort = None):
1076         self.it = it
1077         self.onabort = onabort
1078         self.done = None
1079
1080     def __iter__(self):
1081         return self
1082
1083     def next(self):
1084         try:
1085             return self.it.next()
1086         except StopIteration as e:
1087             self.done = True
1088             raise
1089         except:
1090             self.abort()
1091             raise
1092
1093     def abort(self):
1094         """Abort iteration and call the abortion callback, if needed."""
1095         if not self.done:
1096             self.done = True
1097             if self.onabort:
1098                 self.onabort()
1099
1100     def __del__(self):
1101         self.abort()
1102
1103
class MissingObject(KeyError):
    """KeyError raised for an object id that could not be found.

    The id is kept on the instance as .id; the message embeds its
    hex-encoded form.
    """
    def __init__(self, id):
        self.id = id
        message = 'object %r is missing' % id.encode('hex')
        KeyError.__init__(self, message)
1108
1109
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    self.get is bound in __init__ to _fast_get (one long-lived
    'git cat-file --batch' process) or, for git older than 1.5.6, to
    _slow_get (two git processes per object).  Both are generators that
    yield the object type first and then the object's content in chunks.
    """
    def __init__(self, repo_dir = None):
        global _ver_warned
        self.repo_dir = repo_dir
        # Always define these so _abort()/restart() are safe to call
        # whichever getter ends up selected.
        self.p = self.inprogress = None
        wanted = ('1','5','6')
        if ver() < wanted:
            if not _ver_warned:
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            self.get = self._fast_get

    def _abort(self):
        """Close the pipes of the batch process, if any, and forget it."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        """Drop any existing batch process and start a fresh one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        """Yield the type of object id, then its content in chunks, using
        the shared 'git cat-file --batch' process.

        Raises MissingObject when git answers '... missing', and GitError
        when the header line does not have three fields with a
        40-character first field.  Only one object may be in flight at a
        time; self.inprogress tracks it.
        """
        # (Re)start the batch process if there is none or it has exited.
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        # The id is written verbatim as one request line, so it must not
        # contain line breaks or look like an option.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise MissingObject(id.decode('hex'))
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        # If reading the body fails part-way, the pipe is out of sync;
        # the onabort hook discards the process so the next get restarts.
        it = _AbortableIter(chunkyreader(self.p.stdout, int(size)),
                           onabort = self._abort)
        try:
            yield type
            for blob in it:
                yield blob
            # The content is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type of object id, then its content in chunks, by
        running 'git cat-file -t' and 'git cat-file <type>' separately.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # Ask for the type in self.repo_dir, the same repository the
        # content is read from below (not the default repository).
        argv = ['git', 'cat-file', '-t', id]
        p = subprocess.Popen(argv,
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        type = p.stdout.read().strip()
        _git_wait(repr(argv), p)
        yield type

        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield blob content for the object whose get() iterator is it.

        Blobs are passed through; trees are decoded and each entry is
        joined recursively; for commits the tree named on the first line
        is joined.  Any other type raises GitError.
        """
        type = it.next()
        if type == 'blob':
            for blob in it:
                yield blob
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif type == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % type)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1225
1226
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating and caching it on first use.

    A false repo_dir falls back to the global repodir and then to
    repo().  Instances are cached in _cp under the absolute path.
    """
    global _cp, repodir
    key = os.path.abspath(repo_dir or repodir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = CatPipe(key)
        _cp[key] = pipe
    return pipe
1240
1241
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Tag names have their 'refs/tags/' prefix removed.
    """
    by_hash = {}
    for refname, sha in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(refname.startswith('refs/tags/'))
        # more than one tag can point at the same hash
        by_hash.setdefault(sha, []).append(refname[10:])
    return by_hash
1252
1253
# WalkItem.path holds the mangled path.  When an item is a fragment of
# a chunked file, chunk_path holds the chunked subtree path for that
# chunk, e.g. ['', '2d3115e', ...]; the top-level item of a chunked
# file has a chunk_path of [''].  So some chunk subtree of the file
# '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
WalkItem = namedtuple('WalkItem', 'id type mode path chunk_path data')
1266
1267
def walk_object(cat_pipe, id,
                stop_at=None,
                include_data=None):
    """Yield everything reachable from id via cat_pipe as a WalkItem,
    stopping whenever stop_at(id) returns true.  Throw MissingObject
    if a hash encountered is missing from the repository, and don't
    read or return blob content in the data field unless include_data
    is set.

    Objects are fetched with cat_pipe.get(id), which must return an
    iterator yielding the object type followed by its content.  An
    object whose type is not blob, commit or tree raises Exception.
    MissingObject is not raised here directly; it propagates from
    cat_pipe.get().
    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (id, parent_path, chunk_path, mode).
    pending = [(id, [], [], None)]
    while len(pending):
        id, parent_path, chunk_path, mode = pending.pop()
        if stop_at and stop_at(id):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(id=id, type='blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = cat_pipe.get(id)
        # The first value from the iterator is the object type.
        type = item_it.next()
        if type not in ('blob', 'commit', 'tree'):
            raise Exception('unexpected repository object type %r' % type)

        # FIXME: set the mode based on the type when the mode is None
        if type == 'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full because their
            # content is needed below to find what they reference.
            data = ''.join(item_it)

        yield WalkItem(id=id, type=type,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if type == 'commit':
            commit_items = parse_commit(data)
            # Parents inherit this commit's path, chunk_path and mode.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            # Pushed last, so the commit's tree is popped before its
            # parents.
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif type == 'tree':
            for mode, name, ent_id in tree_decode(data):
                # Only bup_type is used below; paths keep the mangled name.
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the file path stays
                    # fixed and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file.
                        sub_chunk_path = ['']
                    else:
                        sub_chunk_path = chunk_path
                # Tree entry ids are hex-encoded before being queued.
                pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,
                                mode))