1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
7 from collections import namedtuple
8 from itertools import islice
10 from bup import _helpers, hashsplit, path, midx, bloom, xstat
11 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
13 hostname, localtime, log, merge_iter,
14 mmap_read, mmap_readwrite,
15 progress, qprogress, unlink, username, userfullname,
19 max_pack_size = 1000*1000*1000 # larger packs will slow down pruning
20 max_pack_objects = 200*1000 # cache memory usage is about 83 bytes per object
26 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
27 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
33 class GitError(Exception):
37 def parse_tz_offset(s):
38 """UTC offset in seconds."""
39 tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
45 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
46 # Make sure that's authoritative.
47 _start_end_char = r'[^ .,:;<>"\'\0\n]'
48 _content_char = r'[^\0\n<>]'
49 _safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
51 _start_end_char, _content_char, _start_end_char)
52 _tz_rx = r'[-+]\d\d[0-5]\d'
53 _parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
54 _commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
55 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
56 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)
58 (?P<message>(?:.|\n)*)''' % (_parent_rx,
59 _safe_str_rx, _safe_str_rx, _tz_rx,
60 _safe_str_rx, _safe_str_rx, _tz_rx))
61 _parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
64 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
65 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
66 'author_name', 'author_mail',
67 'author_sec', 'author_offset',
68 'committer_name', 'committer_mail',
69 'committer_sec', 'committer_offset',
72 def parse_commit(content):
73 commit_match = re.match(_commit_rx, content)
75 raise Exception('cannot parse commit %r' % content)
76 matches = commit_match.groupdict()
77 return CommitInfo(tree=matches['tree'],
78 parents=re.findall(_parent_hash_rx, matches['parents']),
79 author_name=matches['author_name'],
80 author_mail=matches['author_mail'],
81 author_sec=int(matches['asec']),
82 author_offset=parse_tz_offset(matches['atz']),
83 committer_name=matches['committer_name'],
84 committer_mail=matches['committer_mail'],
85 committer_sec=int(matches['csec']),
86 committer_offset=parse_tz_offset(matches['ctz']),
87 message=matches['message'])
def get_commit_items(id, cp):
    """Fetch the commit named by id through cat-pipe cp and parse it.

    Returns a CommitInfo namedtuple (see parse_commit()).
    """
    item_it = cp.get(id)
    kind = item_it.next()
    assert(kind == 'commit')
    return parse_commit(''.join(item_it))
def _local_git_date_str(epoch_sec):
    """Return epoch_sec formatted as a git date using the local TZ offset."""
    offset = utc_offset_str(epoch_sec)
    return '%d %s' % (epoch_sec, offset)
101 def _git_date_str(epoch_sec, tz_offset_sec):
102 offs = tz_offset_sec // 60
103 return '%d %s%02d%02d' \
105 '+' if offs >= 0 else '-',
110 def repo(sub = '', repo_dir=None):
111 """Get the path to the git repository or one of its subdirectories."""
113 repo_dir = repo_dir or repodir
115 raise GitError('You should call check_repo_or_die()')
117 # If there's a .git subdirectory, then the actual repo is in there.
118 gd = os.path.join(repo_dir, '.git')
119 if os.path.exists(gd):
122 return os.path.join(repo_dir, sub)
126 return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
131 full = os.path.abspath(path)
132 fullrepo = os.path.abspath(repo(''))
133 if not fullrepo.endswith('/'):
135 if full.startswith(fullrepo):
136 path = full[len(fullrepo):]
137 if path.startswith('index-cache/'):
138 path = path[len('index-cache/'):]
139 return shorten_hash(path)
143 paths = [repo('objects/pack')]
144 paths += glob.glob(repo('index-cache/*/.'))
148 def auto_midx(objdir):
149 args = [path.exe(), 'midx', '--auto', '--dir', objdir]
151 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
153 # make sure 'args' gets printed to help with debugging
154 add_error('%r: exception: %s' % (args, e))
157 add_error('%r: returned %d' % (args, rv))
159 args = [path.exe(), 'bloom', '--dir', objdir]
161 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
163 # make sure 'args' gets printed to help with debugging
164 add_error('%r: exception: %s' % (args, e))
167 add_error('%r: returned %d' % (args, rv))
170 def mangle_name(name, mode, gitmode):
171 """Mangle a file name to present an abstract name for segmented files.
172 Mangled file names will have the ".bup" extension added to them. If a
173 file's name already ends with ".bup", a ".bupl" extension is added to
174 disambiguate normal files from segmented ones.
176 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
177 assert(stat.S_ISDIR(gitmode))
179 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
180 return name + '.bupl'
185 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
186 def demangle_name(name, mode):
187 """Remove name mangling from a file name, if necessary.
189 The return value is a tuple (demangled_filename,mode), where mode is one of
192 * BUP_NORMAL : files that should be read as-is from the repository
193 * BUP_CHUNKED : files that were chunked and need to be reassembled
195 For more information on the name mangling algorithm, see mangle_name()
197 if name.endswith('.bupl'):
198 return (name[:-5], BUP_NORMAL)
199 elif name.endswith('.bup'):
200 return (name[:-4], BUP_CHUNKED)
201 elif name.endswith('.bupm'):
203 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
205 return (name, BUP_NORMAL)
208 def calc_hash(type, content):
209 """Calculate some content's hash in the Git fashion."""
210 header = '%s %d\0' % (type, len(content))
216 def shalist_item_sort_key(ent):
217 (mode, name, id) = ent
218 assert(mode+0 == mode)
219 if stat.S_ISDIR(mode):
225 def tree_encode(shalist):
226 """Generate a git tree object from (mode,name,hash) tuples."""
227 shalist = sorted(shalist, key = shalist_item_sort_key)
229 for (mode,name,bin) in shalist:
231 assert(mode+0 == mode)
233 assert(len(bin) == 20)
234 s = '%o %s\0%s' % (mode,name,bin)
235 assert(s[0] != '0') # 0-padded octal is not acceptable in a git tree
240 def tree_decode(buf):
241 """Generate a list of (mode,name,hash) from the git tree object in buf."""
243 while ofs < len(buf):
244 z = buf.find('\0', ofs)
246 spl = buf[ofs:z].split(' ', 1)
247 assert(len(spl) == 2)
249 sha = buf[z+1:z+1+20]
251 yield (int(mode, 8), name, sha)
254 def _encode_packobj(type, content, compression_level=1):
257 szbits = (sz & 0x0f) | (_typemap[type]<<4)
260 if sz: szbits |= 0x80
266 if compression_level > 9:
267 compression_level = 9
268 elif compression_level < 0:
269 compression_level = 0
270 z = zlib.compressobj(compression_level)
272 yield z.compress(content)
276 def _encode_looseobj(type, content, compression_level=1):
277 z = zlib.compressobj(compression_level)
278 yield z.compress('%s %d\0' % (type, len(content)))
279 yield z.compress(content)
283 def _decode_looseobj(buf):
285 s = zlib.decompress(buf)
292 assert(type in _typemap)
293 assert(sz == len(content))
294 return (type, content)
297 def _decode_packobj(buf):
300 type = _typermap[(c & 0x70) >> 4]
307 sz |= (c & 0x7f) << shift
311 return (type, zlib.decompress(buf[i+1:]))
318 def find_offset(self, hash):
319 """Get the offset of an object inside the index file."""
320 idx = self._idx_from_hash(hash)
322 return self._ofs_from_idx(idx)
325 def exists(self, hash, want_source=False):
326 """Return nonempty if the object exists in this index."""
327 if hash and (self._idx_from_hash(hash) != None):
328 return want_source and os.path.basename(self.name) or True
332 return int(self.fanout[255])
334 def _idx_from_hash(self, hash):
335 global _total_searches, _total_steps
337 assert(len(hash) == 20)
339 start = self.fanout[b1-1] # range -1..254
340 end = self.fanout[b1] # range 0..255
342 _total_steps += 1 # lookup table is a step
345 mid = start + (end-start)/2
346 v = self._idx_to_hash(mid)
356 class PackIdxV1(PackIdx):
357 """Object representation of a Git pack index (version 1) file."""
358 def __init__(self, filename, f):
360 self.idxnames = [self.name]
361 self.map = mmap_read(f)
362 self.fanout = list(struct.unpack('!256I',
363 str(buffer(self.map, 0, 256*4))))
364 self.fanout.append(0) # entry "-1"
365 nsha = self.fanout[255]
367 self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
369 def _ofs_from_idx(self, idx):
370 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
372 def _idx_to_hash(self, idx):
373 return str(self.shatable[idx*24+4 : idx*24+24])
376 for i in xrange(self.fanout[255]):
377 yield buffer(self.map, 256*4 + 24*i + 4, 20)
380 class PackIdxV2(PackIdx):
381 """Object representation of a Git pack index (version 2) file."""
382 def __init__(self, filename, f):
384 self.idxnames = [self.name]
385 self.map = mmap_read(f)
386 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
387 self.fanout = list(struct.unpack('!256I',
388 str(buffer(self.map, 8, 256*4))))
389 self.fanout.append(0) # entry "-1"
390 nsha = self.fanout[255]
391 self.sha_ofs = 8 + 256*4
392 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
393 self.ofstable = buffer(self.map,
394 self.sha_ofs + nsha*20 + nsha*4,
396 self.ofs64table = buffer(self.map,
397 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
399 def _ofs_from_idx(self, idx):
400 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
402 idx64 = ofs & 0x7fffffff
403 ofs = struct.unpack('!Q',
404 str(buffer(self.ofs64table, idx64*8, 8)))[0]
407 def _idx_to_hash(self, idx):
408 return str(self.shatable[idx*20:(idx+1)*20])
411 for i in xrange(self.fanout[255]):
412 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
417 def __init__(self, dir):
419 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
424 self.do_bloom = False
431 assert(_mpi_count == 0)
434 return iter(idxmerge(self.packs))
437 return sum(len(pack) for pack in self.packs)
439 def exists(self, hash, want_source=False):
440 """Return nonempty if the object exists in the index files."""
441 global _total_searches
443 if hash in self.also:
445 if self.do_bloom and self.bloom:
446 if self.bloom.exists(hash):
447 self.do_bloom = False
449 _total_searches -= 1 # was counted by bloom
451 for i in xrange(len(self.packs)):
453 _total_searches -= 1 # will be incremented by sub-pack
454 ix = p.exists(hash, want_source=want_source)
456 # reorder so most recently used packs are searched first
457 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
462 def refresh(self, skip_midx = False):
463 """Refresh the index list.
464 This method verifies if .midx files were superseded (e.g. all of its
465 contents are in another, bigger .midx file) and removes the superseded
468 If skip_midx is True, all work on .midx files will be skipped and .midx
469 files will be removed from the list.
471 The module-global variable 'ignore_midx' can force this function to
472 always act as if skip_midx was True.
474 self.bloom = None # Always reopen the bloom as it may have been relaced
475 self.do_bloom = False
476 skip_midx = skip_midx or ignore_midx
477 d = dict((p.name, p) for p in self.packs
478 if not skip_midx or not isinstance(p, midx.PackMidx))
479 if os.path.exists(self.dir):
482 for ix in self.packs:
483 if isinstance(ix, midx.PackMidx):
484 for name in ix.idxnames:
485 d[os.path.join(self.dir, name)] = ix
486 for full in glob.glob(os.path.join(self.dir,'*.midx')):
488 mx = midx.PackMidx(full)
489 (mxd, mxf) = os.path.split(mx.name)
491 for n in mx.idxnames:
492 if not os.path.exists(os.path.join(mxd, n)):
493 log(('warning: index %s missing\n' +
494 ' used by %s\n') % (n, mxf))
502 midxl.sort(key=lambda ix:
503 (-len(ix), -xstat.stat(ix.name).st_mtime))
506 for sub in ix.idxnames:
507 found = d.get(os.path.join(self.dir, sub))
508 if not found or isinstance(found, PackIdx):
509 # doesn't exist, or exists but not in a midx
514 for name in ix.idxnames:
515 d[os.path.join(self.dir, name)] = ix
516 elif not ix.force_keep:
517 debug1('midx: removing redundant: %s\n'
518 % os.path.basename(ix.name))
521 for full in glob.glob(os.path.join(self.dir,'*.idx')):
525 except GitError as e:
529 bfull = os.path.join(self.dir, 'bup.bloom')
530 if self.bloom is None and os.path.exists(bfull):
531 self.bloom = bloom.ShaBloom(bfull)
532 self.packs = list(set(d.values()))
533 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
534 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
538 debug1('PackIdxList: using %d index%s.\n'
539 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
542 """Insert an additional object in the list."""
546 def open_idx(filename):
547 if filename.endswith('.idx'):
548 f = open(filename, 'rb')
550 if header[0:4] == '\377tOc':
551 version = struct.unpack('!I', header[4:8])[0]
553 return PackIdxV2(filename, f)
555 raise GitError('%s: expected idx file version 2, got %d'
556 % (filename, version))
557 elif len(header) == 8 and header[0:4] < '\377tOc':
558 return PackIdxV1(filename, f)
560 raise GitError('%s: unrecognized idx file header' % filename)
561 elif filename.endswith('.midx'):
562 return midx.PackMidx(filename)
564 raise GitError('idx filenames must end with .idx or .midx')
567 def idxmerge(idxlist, final_progress=True):
568 """Generate a list of all the objects reachable in a PackIdxList."""
569 def pfunc(count, total):
570 qprogress('Reading indexes: %.2f%% (%d/%d)\r'
571 % (count*100.0/total, count, total))
572 def pfinal(count, total):
574 progress('Reading indexes: %.2f%% (%d/%d), done.\n'
575 % (100, total, total))
576 return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default PackWriter objcache factory: an index list over objects/pack."""
    pack_dir = repo('objects/pack')
    return PackIdxList(pack_dir)
582 # bup-gc assumes that it can disable all PackWriter activities
583 # (bloom/midx/cache) via the constructor and close() arguments.
586 """Writes Git objects inside a pack file."""
587 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
588 run_midx=True, on_pack_finish=None):
595 self.objcache_maker = objcache_maker
597 self.compression_level = compression_level
598 self.run_midx=run_midx
599 self.on_pack_finish = on_pack_finish
606 objdir = dir=repo('objects')
607 fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
609 self.file = os.fdopen(fd, 'w+b')
614 self.parentfd = os.open(objdir, os.O_RDONLY)
620 assert(name.endswith('.pack'))
621 self.filename = name[:-5]
622 self.file.write('PACK\0\0\0\2\0\0\0\0')
623 self.idx = list(list() for i in xrange(256))
625 def _raw_write(self, datalist, sha):
628 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
629 # the file never has a *partial* blob. So let's make sure it's
630 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
631 # to our hashsplit algorithm.) f.write() does its own buffering,
632 # but that's okay because we'll flush it in _end().
633 oneblob = ''.join(datalist)
637 raise GitError, e, sys.exc_info()[2]
639 crc = zlib.crc32(oneblob) & 0xffffffff
640 self._update_idx(sha, crc, nw)
645 def _update_idx(self, sha, crc, size):
648 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
650 def _write(self, sha, type, content):
654 sha = calc_hash(type, content)
655 size, crc = self._raw_write(_encode_packobj(type, content,
656 self.compression_level),
658 if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
662 def breakpoint(self):
663 """Clear byte and object counts and return the last processed id."""
664 id = self._end(self.run_midx)
665 self.outbytes = self.count = 0
668 def _require_objcache(self):
669 if self.objcache is None and self.objcache_maker:
670 self.objcache = self.objcache_maker()
671 if self.objcache is None:
673 "PackWriter not opened or can't check exists w/o objcache")
675 def exists(self, id, want_source=False):
676 """Return non-empty if an object is found in the object cache."""
677 self._require_objcache()
678 return self.objcache.exists(id, want_source=want_source)
    def write(self, sha, type, content):
        """Write an object to the pack file.  Fails if sha exists()."""
        # Thin wrapper: _write() encodes and appends the object and updates
        # the in-memory pack index.
        self._write(sha, type, content)
684 def maybe_write(self, type, content):
685 """Write an object to the pack file if not present and return its id."""
686 sha = calc_hash(type, content)
687 if not self.exists(sha):
688 self.write(sha, type, content)
689 self._require_objcache()
690 self.objcache.add(sha)
693 def new_blob(self, blob):
694 """Create a blob object in the pack with the supplied content."""
695 return self.maybe_write('blob', blob)
697 def new_tree(self, shalist):
698 """Create a tree object in the pack."""
699 content = tree_encode(shalist)
700 return self.maybe_write('tree', content)
702 def new_commit(self, tree, parent,
703 author, adate_sec, adate_tz,
704 committer, cdate_sec, cdate_tz,
706 """Create a commit object in the pack. The date_sec values must be
707 epoch-seconds, and if a tz is None, the local timezone is assumed."""
709 adate_str = _git_date_str(adate_sec, adate_tz)
711 adate_str = _local_git_date_str(adate_sec)
713 cdate_str = _git_date_str(cdate_sec, cdate_tz)
715 cdate_str = _local_git_date_str(cdate_sec)
717 if tree: l.append('tree %s' % tree.encode('hex'))
718 if parent: l.append('parent %s' % parent.encode('hex'))
719 if author: l.append('author %s %s' % (author, adate_str))
720 if committer: l.append('committer %s %s' % (committer, cdate_str))
723 return self.maybe_write('commit', '\n'.join(l))
726 """Remove the pack file from disk."""
735 os.unlink(self.filename + '.pack')
742 def _end(self, run_midx=True):
744 if not f: return None
751 # update object count
753 cp = struct.pack('!i', self.count)
757 # calculate the pack sha1sum
760 for b in chunkyreader(f):
762 packbin = sum.digest()
764 fdatasync(f.fileno())
768 obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
770 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
771 if os.path.exists(self.filename + '.map'):
772 os.unlink(self.filename + '.map')
773 os.rename(self.filename + '.pack', nameprefix + '.pack')
774 os.rename(self.filename + '.idx', nameprefix + '.idx')
776 os.fsync(self.parentfd)
778 os.close(self.parentfd)
781 auto_midx(repo('objects/pack'))
783 if self.on_pack_finish:
784 self.on_pack_finish(nameprefix)
788 def close(self, run_midx=True):
789 """Close the pack file and move it to its definitive path."""
790 return self._end(run_midx=run_midx)
792 def _write_pack_idx_v2(self, filename, idx, packbin):
795 for entry in section:
796 if entry[2] >= 2**31:
799 # Length: header + fan-out + shas-and-crcs + overflow-offsets
800 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
802 idx_f = open(filename, 'w+b')
804 idx_f.truncate(index_len)
805 fdatasync(idx_f.fileno())
806 idx_map = mmap_readwrite(idx_f, close=False)
808 count = _helpers.write_idx(filename, idx_map, idx, self.count)
809 assert(count == self.count)
816 idx_f = open(filename, 'a+b')
821 b = idx_f.read(8 + 4*256)
824 obj_list_sum = Sha1()
825 for b in chunkyreader(idx_f, 20*self.count):
827 obj_list_sum.update(b)
828 namebase = obj_list_sum.hexdigest()
830 for b in chunkyreader(idx_f):
832 idx_f.write(idx_sum.digest())
833 fdatasync(idx_f.fileno())
839 def _gitenv(repo_dir = None):
843 os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
847 def list_refs(refname=None, repo_dir=None,
848 limit_to_heads=False, limit_to_tags=False):
849 """Yield (refname, hash) tuples for all repository refs unless a ref
850 name is specified. Given a ref name, only include tuples for that
851 particular ref. The limits restrict the result items to
852 refs/heads or refs/tags. If both limits are specified, items from
853 both sources will be included.
856 argv = ['git', 'show-ref']
858 argv.append('--heads')
860 argv.append('--tags')
864 p = subprocess.Popen(argv,
865 preexec_fn = _gitenv(repo_dir),
866 stdout = subprocess.PIPE)
867 out = p.stdout.read().strip()
868 rv = p.wait() # not fatal
872 for d in out.split('\n'):
873 (sha, name) = d.split(' ', 1)
874 yield (name, sha.decode('hex'))
877 def read_ref(refname, repo_dir = None):
878 """Get the commit id of the most recent commit made on a given ref."""
879 refs = list_refs(refname, repo_dir=repo_dir, limit_to_heads=True)
880 l = tuple(islice(refs, 2))
888 def rev_list(ref, count=None, repo_dir=None):
889 """Generate a list of reachable commits in reverse chronological order.
891 This generator walks through commits, from child to parent, that are
892 reachable via the specified ref and yields a series of tuples of the form
895 If count is a non-zero integer, limit the number of commits to "count"
898 assert(not ref.startswith('-'))
901 opts += ['-n', str(atoi(count))]
902 argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
903 p = subprocess.Popen(argv,
904 preexec_fn = _gitenv(repo_dir),
905 stdout = subprocess.PIPE)
909 if s.startswith('commit '):
910 commit = s[7:].decode('hex')
914 rv = p.wait() # not fatal
916 raise GitError, 'git rev-list returned error %d' % rv
919 def get_commit_dates(refs, repo_dir=None):
920 """Get the dates for the specified commit refs. For now, every unique
921 string in refs must resolve to a different commit or this
922 function will fail."""
925 commit = get_commit_items(ref, cp(repo_dir))
926 result.append(commit.author_sec)
930 def rev_parse(committish, repo_dir=None):
931 """Resolve the full hash for 'committish', if it exists.
933 Should be roughly equivalent to 'git rev-parse'.
935 Returns the hex value of the hash if it is found, None if 'committish' does
936 not correspond to anything.
938 head = read_ref(committish, repo_dir=repo_dir)
940 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
943 pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
945 if len(committish) == 40:
947 hash = committish.decode('hex')
957 def update_ref(refname, newval, oldval, repo_dir=None):
958 """Update a repository reference."""
961 assert(refname.startswith('refs/heads/') \
962 or refname.startswith('refs/tags/'))
963 p = subprocess.Popen(['git', 'update-ref', refname,
964 newval.encode('hex'), oldval.encode('hex')],
965 preexec_fn = _gitenv(repo_dir))
966 _git_wait('git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Remove a repository reference (see git update-ref(1)).

    If oldvalue is provided, git verifies the ref currently holds that
    value before deleting it.
    """
    assert(refname.startswith('refs/'))
    argv = ['git', 'update-ref', '-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    p = subprocess.Popen(argv, preexec_fn = _gitenv())
    _git_wait('git update-ref', p)
978 def guess_repo(path=None):
979 """Set the path value in the global variable "repodir".
980 This makes bup look for an existing bup repository, but not fail if a
981 repository doesn't exist. Usually, if you are interacting with a bup
982 repository, you would not be calling this function but using
989 repodir = os.environ.get('BUP_DIR')
991 repodir = os.path.expanduser('~/.bup')
994 def init_repo(path=None):
995 """Create the Git bare repository for bup in a given path."""
997 d = repo() # appends a / to the path
998 parent = os.path.dirname(os.path.dirname(d))
999 if parent and not os.path.exists(parent):
1000 raise GitError('parent directory "%s" does not exist\n' % parent)
1001 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1002 raise GitError('"%s" exists but is not a directory\n' % d)
1003 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1004 preexec_fn = _gitenv())
1005 _git_wait('git init', p)
1006 # Force the index version configuration in order to ensure bup works
1007 # regardless of the version of the installed Git binary.
1008 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1009 stdout=sys.stderr, preexec_fn = _gitenv())
1010 _git_wait('git config', p)
1012 p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
1013 stdout=sys.stderr, preexec_fn = _gitenv())
1014 _git_wait('git config', p)
1017 def check_repo_or_die(path=None):
1018 """Make sure a bup repository exists, and abort if not.
1019 If the path to a particular repository was not specified, this function
1020 initializes the default repository automatically.
1024 os.stat(repo('objects/pack/.'))
1025 except OSError as e:
1026 if e.errno == errno.ENOENT:
1027 log('error: %r is not a bup repository; run "bup init"\n'
1031 log('error: %s\n' % e)
1037 """Get Git's version and ensure a usable version is installed.
1039 The returned version is formatted as an ordered tuple with each position
1040 representing a digit in the version tag. For example, the following tuple
1041 would represent version 1.6.6.9:
1043 ('1', '6', '6', '9')
1047 p = subprocess.Popen(['git', '--version'],
1048 stdout=subprocess.PIPE)
1049 gvs = p.stdout.read()
1050 _git_wait('git --version', p)
1051 m = re.match(r'git version (\S+.\S+)', gvs)
1053 raise GitError('git --version weird output: %r' % gvs)
1054 _ver = tuple(m.group(1).split('.'))
1055 needed = ('1','5', '3', '1')
1057 raise GitError('git version %s or higher is required; you have %s'
1058 % ('.'.join(needed), '.'.join(_ver)))
1062 def _git_wait(cmd, p):
1065 raise GitError('%s returned %d' % (cmd, rv))
1068 def _git_capture(argv):
1069 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
1071 _git_wait(repr(argv), p)
1075 class _AbortableIter:
1076 def __init__(self, it, onabort = None):
1078 self.onabort = onabort
1086 return self.it.next()
1087 except StopIteration as e:
1095 """Abort iteration and call the abortion callback, if needed."""
1107 """Link to 'git cat-file' that is used to retrieve blob data."""
1108 def __init__(self, repo_dir = None):
1110 self.repo_dir = repo_dir
1111 wanted = ('1','5','6')
1114 log('warning: git version < %s; bup will be slow.\n'
1117 self.get = self._slow_get
1119 self.p = self.inprogress = None
1120 self.get = self._fast_get
1124 self.p.stdout.close()
1125 self.p.stdin.close()
1127 self.inprogress = None
1131 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1132 stdin=subprocess.PIPE,
1133 stdout=subprocess.PIPE,
1136 preexec_fn = _gitenv(self.repo_dir))
1138 def _fast_get(self, id):
1139 if not self.p or self.p.poll() != None:
1142 poll_result = self.p.poll()
1143 assert(poll_result == None)
1145 log('_fast_get: opening %r while %r is open\n'
1146 % (id, self.inprogress))
1147 assert(not self.inprogress)
1148 assert(id.find('\n') < 0)
1149 assert(id.find('\r') < 0)
1150 assert(not id.startswith('-'))
1151 self.inprogress = id
1152 self.p.stdin.write('%s\n' % id)
1153 self.p.stdin.flush()
1154 hdr = self.p.stdout.readline()
1155 if hdr.endswith(' missing\n'):
1156 self.inprogress = None
1157 raise KeyError('blob %r is missing' % id)
1158 spl = hdr.split(' ')
1159 if len(spl) != 3 or len(spl[0]) != 40:
1160 raise GitError('expected blob, got %r' % spl)
1161 (hex, type, size) = spl
1163 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1164 onabort = self._abort)
1169 readline_result = self.p.stdout.readline()
1170 assert(readline_result == '\n')
1171 self.inprogress = None
1172 except Exception as e:
1176 def _slow_get(self, id):
1177 assert(id.find('\n') < 0)
1178 assert(id.find('\r') < 0)
1179 assert(id[0] != '-')
1180 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1183 p = subprocess.Popen(['git', 'cat-file', type, id],
1184 stdout=subprocess.PIPE,
1185 preexec_fn = _gitenv(self.repo_dir))
1186 for blob in chunkyreader(p.stdout):
1188 _git_wait('git cat-file', p)
1190 def _join(self, it):
1195 elif type == 'tree':
1196 treefile = ''.join(it)
1197 for (mode, name, sha) in tree_decode(treefile):
1198 for blob in self.join(sha.encode('hex')):
1200 elif type == 'commit':
1201 treeline = ''.join(it).split('\n')[0]
1202 assert(treeline.startswith('tree '))
1203 for blob in self.join(treeline[5:]):
1206 raise GitError('invalid object type %r: expected blob/tree/commit'
1210 """Generate a list of the content of all blobs that can be reached
1211 from an object. The hash given in 'id' must point to a blob, a tree
1212 or a commit. The content of all blobs that can be seen from trees or
1213 commits will be added to the list.
1216 for d in self._join(self.get(id)):
1218 except StopIteration:
1224 def cp(repo_dir=None):
1225 """Create a CatPipe object or reuse the already existing one."""
1229 repo_dir = os.path.abspath(repo_dir)
1230 cp = _cp.get(repo_dir)
1232 cp = CatPipe(repo_dir)
1237 def tags(repo_dir = None):
1238 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1240 for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
1241 assert(n.startswith('refs/tags/'))
1245 tags[c].append(name) # more than one tag can point at 'c'
1249 WalkItem = namedtuple('WalkItem', ['id', 'type', 'mode',
1250 'path', 'chunk_path', 'data'])
1251 # The path is the mangled path, and if an item represents a fragment
1252 # of a chunked file, the chunk_path will be the chunked subtree path
1253 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1254 # chunked file will have a chunk_path of ['']. So some chunk subtree
1255 # of the file '/foo/bar/baz' might look like this:
1257 # item.path = ['foo', 'bar', 'baz.bup']
1258 # item.chunk_path = ['', '2d3115e', '016b097']
1259 # item.type = 'tree'
1263 def _walk_object(cat_pipe, id,
1264 parent_path, chunk_path,
1269 if stop_at and stop_at(id):
1272 item_it = cat_pipe.get(id) # FIXME: use include_data
1273 type = item_it.next()
1275 if type not in ('blob', 'commit', 'tree'):
1276 raise Exception('unexpected repository object type %r' % type)
1278 # FIXME: set the mode based on the type when the mode is None
1280 if type == 'blob' and not include_data:
1281 # Dump data until we can ask cat_pipe not to fetch it
1282 for ignored in item_it:
1286 data = ''.join(item_it)
1288 yield WalkItem(id=id, type=type,
1289 chunk_path=chunk_path, path=parent_path,
1291 data=(data if include_data else None))
1293 if type == 'commit':
1294 commit_items = parse_commit(data)
1295 tree_id = commit_items.tree
1296 for x in _walk_object(cat_pipe, tree_id, parent_path, chunk_path,
1297 mode=hashsplit.GIT_MODE_TREE,
1299 include_data=include_data):
1301 parents = commit_items.parents
1303 for x in _walk_object(cat_pipe, pid, parent_path, chunk_path,
1304 mode=mode, # Same mode as this child
1306 include_data=include_data):
1308 elif type == 'tree':
1309 for mode, name, ent_id in tree_decode(data):
1310 demangled, bup_type = demangle_name(name, mode)
1312 sub_path = parent_path
1313 sub_chunk_path = chunk_path + [name]
1315 sub_path = parent_path + [name]
1316 if bup_type == BUP_CHUNKED:
1317 sub_chunk_path = ['']
1319 sub_chunk_path = chunk_path
1320 for x in _walk_object(cat_pipe, ent_id.encode('hex'),
1321 sub_path, sub_chunk_path,
1324 include_data=include_data):
1328 def walk_object(cat_pipe, id,
1331 """Yield everything reachable from id via cat_pipe as a WalkItem,
1332 stopping whenever stop_at(id) returns true."""
1333 return _walk_object(cat_pipe, id, [], [],
1335 include_data=include_data)