lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   7 from collections import namedtuple
   8 from itertools import islice
   9
  10 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  11 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  12                          fdatasync,
  13                          hostname, localtime, log, merge_iter,
  14                          mmap_read, mmap_readwrite,
  15                          progress, qprogress, unlink, username, userfullname,
  16                          utc_offset_str)
  17
  18
  19 max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
  20 max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
  21
  22 verbose = 0
  23 ignore_midx = 0
  24 repodir = None
  25
  26 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  27 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  28
  29 _total_searches = 0
  30 _total_steps = 0
  31
  32
  33 class GitError(Exception):
  34     pass
  35
  36
  37 def parse_tz_offset(s):
  38     """UTC offset in seconds."""
  39     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  40     if s[0] == '-':
  41         return - tz_off
  42     return tz_off
  43
  44
# Regular expressions used by parse_commit() to pick apart a commit object.
#
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# A name/mail string may not start or end with any of these characters...
_start_end_char = r'[^ .,:;<>"\'\0\n]'
# ...and may not contain NUL, newline, '<' or '>' anywhere.
_content_char = r'[^\0\n<>]'
# Either one or two start/end characters, or a start character, any number
# of content characters, and an end character.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Sign, two digits, then a digit in 0-5 followed by one more digit.
_tz_rx = r'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Whole commit: tree line, zero or more parent lines, author line, committer
# line, a blank line, then the message (everything that remains).
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Extracts the individual hashes from the text captured by the "parents" group.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  62
  63
  64 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
  65 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
  66                                        'author_name', 'author_mail',
  67                                        'author_sec', 'author_offset',
  68                                        'committer_name', 'committer_mail',
  69                                        'committer_sec', 'committer_offset',
  70                                        'message'])
  71
  72 def parse_commit(content):
  73     commit_match = re.match(_commit_rx, content)
  74     if not commit_match:
  75         raise Exception('cannot parse commit %r' % content)
  76     matches = commit_match.groupdict()
  77     return CommitInfo(tree=matches['tree'],
  78                       parents=re.findall(_parent_hash_rx, matches['parents']),
  79                       author_name=matches['author_name'],
  80                       author_mail=matches['author_mail'],
  81                       author_sec=int(matches['asec']),
  82                       author_offset=parse_tz_offset(matches['atz']),
  83                       committer_name=matches['committer_name'],
  84                       committer_mail=matches['committer_mail'],
  85                       committer_sec=int(matches['csec']),
  86                       committer_offset=parse_tz_offset(matches['ctz']),
  87                       message=matches['message'])
  88
  89
  90 def get_commit_items(id, cp):
  91     commit_it = cp.get(id)
  92     assert(commit_it.next() == 'commit')
  93     commit_content = ''.join(commit_it)
  94     return parse_commit(commit_content)
  95
  96
  97 def _local_git_date_str(epoch_sec):
  98     return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
  99
 100
 101 def _git_date_str(epoch_sec, tz_offset_sec):
 102     offs =  tz_offset_sec // 60
 103     return '%d %s%02d%02d' \
 104         % (epoch_sec,
 105            '+' if offs >= 0 else '-',
 106            abs(offs) // 60,
 107            abs(offs) % 60)
 108
 109
 110 def repo(sub = '', repo_dir=None):
 111     """Get the path to the git repository or one of its subdirectories."""
 112     global repodir
 113     repo_dir = repo_dir or repodir
 114     if not repo_dir:
 115         raise GitError('You should call check_repo_or_die()')
 116
 117     # If there's a .git subdirectory, then the actual repo is in there.
 118     gd = os.path.join(repo_dir, '.git')
 119     if os.path.exists(gd):
 120         repodir = gd
 121
 122     return os.path.join(repo_dir, sub)
 123
 124
 125 def shorten_hash(s):
 126     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 127                   r'\1\2*\3', s)
 128
 129
 130 def repo_rel(path):
 131     full = os.path.abspath(path)
 132     fullrepo = os.path.abspath(repo(''))
 133     if not fullrepo.endswith('/'):
 134         fullrepo += '/'
 135     if full.startswith(fullrepo):
 136         path = full[len(fullrepo):]
 137     if path.startswith('index-cache/'):
 138         path = path[len('index-cache/'):]
 139     return shorten_hash(path)
 140
 141
 142 def all_packdirs():
 143     paths = [repo('objects/pack')]
 144     paths += glob.glob(repo('index-cache/*/.'))
 145     return paths
 146
 147
 148 def auto_midx(objdir):
 149     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 150     try:
 151         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 152     except OSError as e:
 153         # make sure 'args' gets printed to help with debugging
 154         add_error('%r: exception: %s' % (args, e))
 155         raise
 156     if rv:
 157         add_error('%r: returned %d' % (args, rv))
 158
 159     args = [path.exe(), 'bloom', '--dir', objdir]
 160     try:
 161         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 162     except OSError as e:
 163         # make sure 'args' gets printed to help with debugging
 164         add_error('%r: exception: %s' % (args, e))
 165         raise
 166     if rv:
 167         add_error('%r: returned %d' % (args, rv))
 168
 169
 170 def mangle_name(name, mode, gitmode):
 171     """Mangle a file name to present an abstract name for segmented files.
 172     Mangled file names will have the ".bup" extension added to them. If a
 173     file's name already ends with ".bup", a ".bupl" extension is added to
 174     disambiguate normal files from segmented ones.
 175     """
 176     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 177         assert(stat.S_ISDIR(gitmode))
 178         return name + '.bup'
 179     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 180         return name + '.bupl'
 181     else:
 182         return name
 183
 184
 185 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 186 def demangle_name(name, mode):
 187     """Remove name mangling from a file name, if necessary.
 188
 189     The return value is a tuple (demangled_filename,mode), where mode is one of
 190     the following:
 191
 192     * BUP_NORMAL  : files that should be read as-is from the repository
 193     * BUP_CHUNKED : files that were chunked and need to be reassembled
 194
 195     For more information on the name mangling algorithm, see mangle_name()
 196     """
 197     if name.endswith('.bupl'):
 198         return (name[:-5], BUP_NORMAL)
 199     elif name.endswith('.bup'):
 200         return (name[:-4], BUP_CHUNKED)
 201     elif name.endswith('.bupm'):
 202         return (name[:-5],
 203                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 204     else:
 205         return (name, BUP_NORMAL)
 206
 207
 208 def calc_hash(type, content):
 209     """Calculate some content's hash in the Git fashion."""
 210     header = '%s %d\0' % (type, len(content))
 211     sum = Sha1(header)
 212     sum.update(content)
 213     return sum.digest()
 214
 215
 216 def shalist_item_sort_key(ent):
 217     (mode, name, id) = ent
 218     assert(mode+0 == mode)
 219     if stat.S_ISDIR(mode):
 220         return name + '/'
 221     else:
 222         return name
 223
 224
 225 def tree_encode(shalist):
 226     """Generate a git tree object from (mode,name,hash) tuples."""
 227     shalist = sorted(shalist, key = shalist_item_sort_key)
 228     l = []
 229     for (mode,name,bin) in shalist:
 230         assert(mode)
 231         assert(mode+0 == mode)
 232         assert(name)
 233         assert(len(bin) == 20)
 234         s = '%o %s\0%s' % (mode,name,bin)
 235         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 236         l.append(s)
 237     return ''.join(l)
 238
 239
 240 def tree_decode(buf):
 241     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 242     ofs = 0
 243     while ofs < len(buf):
 244         z = buf.find('\0', ofs)
 245         assert(z > ofs)
 246         spl = buf[ofs:z].split(' ', 1)
 247         assert(len(spl) == 2)
 248         mode,name = spl
 249         sha = buf[z+1:z+1+20]
 250         ofs = z+1+20
 251         yield (int(mode, 8), name, sha)
 252
 253
 254 def _encode_packobj(type, content, compression_level=1):
 255     szout = ''
 256     sz = len(content)
 257     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 258     sz >>= 4
 259     while 1:
 260         if sz: szbits |= 0x80
 261         szout += chr(szbits)
 262         if not sz:
 263             break
 264         szbits = sz & 0x7f
 265         sz >>= 7
 266     if compression_level > 9:
 267         compression_level = 9
 268     elif compression_level < 0:
 269         compression_level = 0
 270     z = zlib.compressobj(compression_level)
 271     yield szout
 272     yield z.compress(content)
 273     yield z.flush()
 274
 275
 276 def _encode_looseobj(type, content, compression_level=1):
 277     z = zlib.compressobj(compression_level)
 278     yield z.compress('%s %d\0' % (type, len(content)))
 279     yield z.compress(content)
 280     yield z.flush()
 281
 282
 283 def _decode_looseobj(buf):
 284     assert(buf);
 285     s = zlib.decompress(buf)
 286     i = s.find('\0')
 287     assert(i > 0)
 288     l = s[:i].split(' ')
 289     type = l[0]
 290     sz = int(l[1])
 291     content = s[i+1:]
 292     assert(type in _typemap)
 293     assert(sz == len(content))
 294     return (type, content)
 295
 296
 297 def _decode_packobj(buf):
 298     assert(buf)
 299     c = ord(buf[0])
 300     type = _typermap[(c & 0x70) >> 4]
 301     sz = c & 0x0f
 302     shift = 4
 303     i = 0
 304     while c & 0x80:
 305         i += 1
 306         c = ord(buf[i])
 307         sz |= (c & 0x7f) << shift
 308         shift += 7
 309         if not (c & 0x80):
 310             break
 311     return (type, zlib.decompress(buf[i+1:]))
 312
 313
 314 class PackIdx:
 315     def __init__(self):
 316         assert(0)
 317
 318     def find_offset(self, hash):
 319         """Get the offset of an object inside the index file."""
 320         idx = self._idx_from_hash(hash)
 321         if idx != None:
 322             return self._ofs_from_idx(idx)
 323         return None
 324
 325     def exists(self, hash, want_source=False):
 326         """Return nonempty if the object exists in this index."""
 327         if hash and (self._idx_from_hash(hash) != None):
 328             return want_source and os.path.basename(self.name) or True
 329         return None
 330
 331     def __len__(self):
 332         return int(self.fanout[255])
 333
 334     def _idx_from_hash(self, hash):
 335         global _total_searches, _total_steps
 336         _total_searches += 1
 337         assert(len(hash) == 20)
 338         b1 = ord(hash[0])
 339         start = self.fanout[b1-1] # range -1..254
 340         end = self.fanout[b1] # range 0..255
 341         want = str(hash)
 342         _total_steps += 1  # lookup table is a step
 343         while start < end:
 344             _total_steps += 1
 345             mid = start + (end-start)/2
 346             v = self._idx_to_hash(mid)
 347             if v < want:
 348                 start = mid+1
 349             elif v > want:
 350                 end = mid
 351             else: # got it!
 352                 return mid
 353         return None
 354
 355
 356 class PackIdxV1(PackIdx):
 357     """Object representation of a Git pack index (version 1) file."""
 358     def __init__(self, filename, f):
 359         self.name = filename
 360         self.idxnames = [self.name]
 361         self.map = mmap_read(f)
 362         self.fanout = list(struct.unpack('!256I',
 363                                          str(buffer(self.map, 0, 256*4))))
 364         self.fanout.append(0)  # entry "-1"
 365         nsha = self.fanout[255]
 366         self.sha_ofs = 256*4
 367         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 368
 369     def _ofs_from_idx(self, idx):
 370         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 371
 372     def _idx_to_hash(self, idx):
 373         return str(self.shatable[idx*24+4 : idx*24+24])
 374
 375     def __iter__(self):
 376         for i in xrange(self.fanout[255]):
 377             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 378
 379
 380 class PackIdxV2(PackIdx):
 381     """Object representation of a Git pack index (version 2) file."""
 382     def __init__(self, filename, f):
 383         self.name = filename
 384         self.idxnames = [self.name]
 385         self.map = mmap_read(f)
 386         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 387         self.fanout = list(struct.unpack('!256I',
 388                                          str(buffer(self.map, 8, 256*4))))
 389         self.fanout.append(0)  # entry "-1"
 390         nsha = self.fanout[255]
 391         self.sha_ofs = 8 + 256*4
 392         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 393         self.ofstable = buffer(self.map,
 394                                self.sha_ofs + nsha*20 + nsha*4,
 395                                nsha*4)
 396         self.ofs64table = buffer(self.map,
 397                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 398
 399     def _ofs_from_idx(self, idx):
 400         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 401         if ofs & 0x80000000:
 402             idx64 = ofs & 0x7fffffff
 403             ofs = struct.unpack('!Q',
 404                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 405         return ofs
 406
 407     def _idx_to_hash(self, idx):
 408         return str(self.shatable[idx*20:(idx+1)*20])
 409
 410     def __iter__(self):
 411         for i in xrange(self.fanout[255]):
 412             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 413
 414
 415 _mpi_count = 0
 416 class PackIdxList:
 417     def __init__(self, dir):
 418         global _mpi_count
 419         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 420         _mpi_count += 1
 421         self.dir = dir
 422         self.also = set()
 423         self.packs = []
 424         self.do_bloom = False
 425         self.bloom = None
 426         self.refresh()
 427
 428     def __del__(self):
 429         global _mpi_count
 430         _mpi_count -= 1
 431         assert(_mpi_count == 0)
 432
 433     def __iter__(self):
 434         return iter(idxmerge(self.packs))
 435
 436     def __len__(self):
 437         return sum(len(pack) for pack in self.packs)
 438
 439     def exists(self, hash, want_source=False):
 440         """Return nonempty if the object exists in the index files."""
 441         global _total_searches
 442         _total_searches += 1
 443         if hash in self.also:
 444             return True
 445         if self.do_bloom and self.bloom:
 446             if self.bloom.exists(hash):
 447                 self.do_bloom = False
 448             else:
 449                 _total_searches -= 1  # was counted by bloom
 450                 return None
 451         for i in xrange(len(self.packs)):
 452             p = self.packs[i]
 453             _total_searches -= 1  # will be incremented by sub-pack
 454             ix = p.exists(hash, want_source=want_source)
 455             if ix:
 456                 # reorder so most recently used packs are searched first
 457                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 458                 return ix
 459         self.do_bloom = True
 460         return None
 461
 462     def refresh(self, skip_midx = False):
 463         """Refresh the index list.
 464         This method verifies if .midx files were superseded (e.g. all of its
 465         contents are in another, bigger .midx file) and removes the superseded
 466         files.
 467
 468         If skip_midx is True, all work on .midx files will be skipped and .midx
 469         files will be removed from the list.
 470
 471         The module-global variable 'ignore_midx' can force this function to
 472         always act as if skip_midx was True.
 473         """
 474         self.bloom = None # Always reopen the bloom as it may have been relaced
 475         self.do_bloom = False
 476         skip_midx = skip_midx or ignore_midx
 477         d = dict((p.name, p) for p in self.packs
 478                  if not skip_midx or not isinstance(p, midx.PackMidx))
 479         if os.path.exists(self.dir):
 480             if not skip_midx:
 481                 midxl = []
 482                 for ix in self.packs:
 483                     if isinstance(ix, midx.PackMidx):
 484                         for name in ix.idxnames:
 485                             d[os.path.join(self.dir, name)] = ix
 486                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 487                     if not d.get(full):
 488                         mx = midx.PackMidx(full)
 489                         (mxd, mxf) = os.path.split(mx.name)
 490                         broken = False
 491                         for n in mx.idxnames:
 492                             if not os.path.exists(os.path.join(mxd, n)):
 493                                 log(('warning: index %s missing\n' +
 494                                     '  used by %s\n') % (n, mxf))
 495                                 broken = True
 496                         if broken:
 497                             mx.close()
 498                             del mx
 499                             unlink(full)
 500                         else:
 501                             midxl.append(mx)
 502                 midxl.sort(key=lambda ix:
 503                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 504                 for ix in midxl:
 505                     any_needed = False
 506                     for sub in ix.idxnames:
 507                         found = d.get(os.path.join(self.dir, sub))
 508                         if not found or isinstance(found, PackIdx):
 509                             # doesn't exist, or exists but not in a midx
 510                             any_needed = True
 511                             break
 512                     if any_needed:
 513                         d[ix.name] = ix
 514                         for name in ix.idxnames:
 515                             d[os.path.join(self.dir, name)] = ix
 516                     elif not ix.force_keep:
 517                         debug1('midx: removing redundant: %s\n'
 518                                % os.path.basename(ix.name))
 519                         ix.close()
 520                         unlink(ix.name)
 521             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 522                 if not d.get(full):
 523                     try:
 524                         ix = open_idx(full)
 525                     except GitError as e:
 526                         add_error(e)
 527                         continue
 528                     d[full] = ix
 529             bfull = os.path.join(self.dir, 'bup.bloom')
 530             if self.bloom is None and os.path.exists(bfull):
 531                 self.bloom = bloom.ShaBloom(bfull)
 532             self.packs = list(set(d.values()))
 533             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 534             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 535                 self.do_bloom = True
 536             else:
 537                 self.bloom = None
 538         debug1('PackIdxList: using %d index%s.\n'
 539             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 540
 541     def add(self, hash):
 542         """Insert an additional object in the list."""
 543         self.also.add(hash)
 544
 545
 546 def open_idx(filename):
 547     if filename.endswith('.idx'):
 548         f = open(filename, 'rb')
 549         header = f.read(8)
 550         if header[0:4] == '\377tOc':
 551             version = struct.unpack('!I', header[4:8])[0]
 552             if version == 2:
 553                 return PackIdxV2(filename, f)
 554             else:
 555                 raise GitError('%s: expected idx file version 2, got %d'
 556                                % (filename, version))
 557         elif len(header) == 8 and header[0:4] < '\377tOc':
 558             return PackIdxV1(filename, f)
 559         else:
 560             raise GitError('%s: unrecognized idx file header' % filename)
 561     elif filename.endswith('.midx'):
 562         return midx.PackMidx(filename)
 563     else:
 564         raise GitError('idx filenames must end with .idx or .midx')
 565
 566
 567 def idxmerge(idxlist, final_progress=True):
 568     """Generate a list of all the objects reachable in a PackIdxList."""
 569     def pfunc(count, total):
 570         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 571                   % (count*100.0/total, count, total))
 572     def pfinal(count, total):
 573         if final_progress:
 574             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 575                      % (100, total, total))
 576     return merge_iter(idxlist, 10024, pfunc, pfinal)
 577
 578
 579 def _make_objcache():
 580     return PackIdxList(repo('objects/pack'))
 581
 582 class PackWriter:
 583     """Writes Git objects inside a pack file."""
 584     def __init__(self, objcache_maker=_make_objcache, compression_level=1):
 585         self.file = None
 586         self.parentfd = None
 587         self.count = 0
 588         self.outbytes = 0
 589         self.filename = None
 590         self.idx = None
 591         self.objcache_maker = objcache_maker
 592         self.objcache = None
 593         self.compression_level = compression_level
 594
 595     def __del__(self):
 596         self.close()
 597
 598     def _open(self):
 599         if not self.file:
 600             objdir = dir=repo('objects')
 601             fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
 602             try:
 603                 self.file = os.fdopen(fd, 'w+b')
 604             except:
 605                 os.close(fd)
 606                 raise
 607             try:
 608                 self.parentfd = os.open(objdir, os.O_RDONLY)
 609             except:
 610                 f = self.file
 611                 self.file = None
 612                 f.close()
 613                 raise
 614             assert(name.endswith('.pack'))
 615             self.filename = name[:-5]
 616             self.file.write('PACK\0\0\0\2\0\0\0\0')
 617             self.idx = list(list() for i in xrange(256))
 618
 619     def _raw_write(self, datalist, sha):
 620         self._open()
 621         f = self.file
 622         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 623         # the file never has a *partial* blob.  So let's make sure it's
 624         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 625         # to our hashsplit algorithm.)  f.write() does its own buffering,
 626         # but that's okay because we'll flush it in _end().
 627         oneblob = ''.join(datalist)
 628         try:
 629             f.write(oneblob)
 630         except IOError as e:
 631             raise GitError, e, sys.exc_info()[2]
 632         nw = len(oneblob)
 633         crc = zlib.crc32(oneblob) & 0xffffffff
 634         self._update_idx(sha, crc, nw)
 635         self.outbytes += nw
 636         self.count += 1
 637         return nw, crc
 638
 639     def _update_idx(self, sha, crc, size):
 640         assert(sha)
 641         if self.idx:
 642             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 643
 644     def _write(self, sha, type, content):
 645         if verbose:
 646             log('>')
 647         if not sha:
 648             sha = calc_hash(type, content)
 649         size, crc = self._raw_write(_encode_packobj(type, content,
 650                                                     self.compression_level),
 651                                     sha=sha)
 652         if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
 653             self.breakpoint()
 654         return sha
 655
 656     def breakpoint(self):
 657         """Clear byte and object counts and return the last processed id."""
 658         id = self._end()
 659         self.outbytes = self.count = 0
 660         return id
 661
 662     def _require_objcache(self):
 663         if self.objcache is None and self.objcache_maker:
 664             self.objcache = self.objcache_maker()
 665         if self.objcache is None:
 666             raise GitError(
 667                     "PackWriter not opened or can't check exists w/o objcache")
 668
 669     def exists(self, id, want_source=False):
 670         """Return non-empty if an object is found in the object cache."""
 671         self._require_objcache()
 672         return self.objcache.exists(id, want_source=want_source)
 673
 674     def maybe_write(self, type, content):
 675         """Write an object to the pack file if not present and return its id."""
 676         sha = calc_hash(type, content)
 677         if not self.exists(sha):
 678             self._write(sha, type, content)
 679             self._require_objcache()
 680             self.objcache.add(sha)
 681         return sha
 682
 683     def new_blob(self, blob):
 684         """Create a blob object in the pack with the supplied content."""
 685         return self.maybe_write('blob', blob)
 686
 687     def new_tree(self, shalist):
 688         """Create a tree object in the pack."""
 689         content = tree_encode(shalist)
 690         return self.maybe_write('tree', content)
 691
 692     def new_commit(self, tree, parent,
 693                    author, adate_sec, adate_tz,
 694                    committer, cdate_sec, cdate_tz,
 695                    msg):
 696         """Create a commit object in the pack.  The date_sec values must be
 697         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 698         if adate_tz:
 699             adate_str = _git_date_str(adate_sec, adate_tz)
 700         else:
 701             adate_str = _local_git_date_str(adate_sec)
 702         if cdate_tz:
 703             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 704         else:
 705             cdate_str = _local_git_date_str(cdate_sec)
 706         l = []
 707         if tree: l.append('tree %s' % tree.encode('hex'))
 708         if parent: l.append('parent %s' % parent.encode('hex'))
 709         if author: l.append('author %s %s' % (author, adate_str))
 710         if committer: l.append('committer %s %s' % (committer, cdate_str))
 711         l.append('')
 712         l.append(msg)
 713         return self.maybe_write('commit', '\n'.join(l))
 714
 715     def abort(self):
 716         """Remove the pack file from disk."""
 717         f = self.file
 718         if f:
 719             pfd = self.parentfd
 720             self.file = None
 721             self.parentfd = None
 722             self.idx = None
 723             try:
 724                 try:
 725                     os.unlink(self.filename + '.pack')
 726                 finally:
 727                     f.close()
 728             finally:
 729                 if pfd is not None:
 730                     os.close(pfd)
 731
 732     def _end(self, run_midx=True):
 733         f = self.file
 734         if not f: return None
 735         self.file = None
 736         try:
 737             self.objcache = None
 738             idx = self.idx
 739             self.idx = None
 740
 741             # update object count
 742             f.seek(8)
 743             cp = struct.pack('!i', self.count)
 744             assert(len(cp) == 4)
 745             f.write(cp)
 746
 747             # calculate the pack sha1sum
 748             f.seek(0)
 749             sum = Sha1()
 750             for b in chunkyreader(f):
 751                 sum.update(b)
 752             packbin = sum.digest()
 753             f.write(packbin)
 754             fdatasync(f.fileno())
 755         finally:
 756             f.close()
 757
 758         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 759
 760         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 761         if os.path.exists(self.filename + '.map'):
 762             os.unlink(self.filename + '.map')
 763         os.rename(self.filename + '.pack', nameprefix + '.pack')
 764         os.rename(self.filename + '.idx', nameprefix + '.idx')
 765         try:
 766             os.fsync(self.parentfd)
 767         finally:
 768             os.close(self.parentfd)
 769
 770         if run_midx:
 771             auto_midx(repo('objects/pack'))
 772         return nameprefix
 773
 774     def close(self, run_midx=True):
 775         """Close the pack file and move it to its definitive path."""
 776         return self._end(run_midx=run_midx)
 777
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write the index for this pack to filename and return its name base.

        idx is iterated as a sequence of sections, each a sequence of
        entries; entry[2] is compared against 2**31 to count the entries
        that need a slot in the overflow-offsets area.  packbin is appended
        to the file after the tables written by _helpers.write_idx().

        The return value is the hex SHA-1 of the 20*self.count bytes that
        follow the 8-byte header and the 256-entry fan-out table.
        """
        # Count the entries whose third field does not fit in 31 bits; each
        # of them adds 8 bytes to the index length computed below.
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            # Size the file up front so it can be mapped and filled in place.
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                # write_idx fills the mapping and reports how many objects
                # it wrote; that must match what this writer has counted.
                count = _helpers.write_idx(filename, idx_map, idx, self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        # Second pass: reopen in append mode so both writes below land at
        # the end of the file, while the reads in between start from 0.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            # idx_sum covers everything in the file before the final digest.
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            # The next 20*self.count bytes feed both checksums; their digest
            # alone becomes the returned name base.
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            # Hash the remainder of the file (through the appended packbin).
            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()
 823
 824
 825 def _gitenv(repo_dir = None):
 826     if not repo_dir:
 827         repo_dir = repo()
 828     def env():
 829         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 830     return env
 831
 832
 833 def list_refs(refname=None, repo_dir=None,
 834               limit_to_heads=False, limit_to_tags=False):
 835     """Yield (refname, hash) tuples for all repository refs unless a ref
 836     name is specified.  Given a ref name, only include tuples for that
 837     particular ref.  The limits restrict the result items to
 838     refs/heads or refs/tags.  If both limits are specified, items from
 839     both sources will be included.
 840
 841     """
 842     argv = ['git', 'show-ref']
 843     if limit_to_heads:
 844         argv.append('--heads')
 845     if limit_to_tags:
 846         argv.append('--tags')
 847     argv.append('--')
 848     if refname:
 849         argv += [refname]
 850     p = subprocess.Popen(argv,
 851                          preexec_fn = _gitenv(repo_dir),
 852                          stdout = subprocess.PIPE)
 853     out = p.stdout.read().strip()
 854     rv = p.wait()  # not fatal
 855     if rv:
 856         assert(not out)
 857     if out:
 858         for d in out.split('\n'):
 859             (sha, name) = d.split(' ', 1)
 860             yield (name, sha.decode('hex'))
 861
 862
 863 def read_ref(refname, repo_dir = None):
 864     """Get the commit id of the most recent commit made on a given ref."""
 865     refs = list_refs(refname, repo_dir=repo_dir, limit_to_heads=True)
 866     l = tuple(islice(refs, 2))
 867     if l:
 868         assert(len(l) == 1)
 869         return l[0][1]
 870     else:
 871         return None
 872
 873
 874 def rev_list(ref, count=None, repo_dir=None):
 875     """Generate a list of reachable commits in reverse chronological order.
 876
 877     This generator walks through commits, from child to parent, that are
 878     reachable via the specified ref and yields a series of tuples of the form
 879     (date,hash).
 880
 881     If count is a non-zero integer, limit the number of commits to "count"
 882     objects.
 883     """
 884     assert(not ref.startswith('-'))
 885     opts = []
 886     if count:
 887         opts += ['-n', str(atoi(count))]
 888     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 889     p = subprocess.Popen(argv,
 890                          preexec_fn = _gitenv(repo_dir),
 891                          stdout = subprocess.PIPE)
 892     commit = None
 893     for row in p.stdout:
 894         s = row.strip()
 895         if s.startswith('commit '):
 896             commit = s[7:].decode('hex')
 897         else:
 898             date = int(s)
 899             yield (date, commit)
 900     rv = p.wait()  # not fatal
 901     if rv:
 902         raise GitError, 'git rev-list returned error %d' % rv
 903
 904
 905 def get_commit_dates(refs, repo_dir=None):
 906     """Get the dates for the specified commit refs.  For now, every unique
 907        string in refs must resolve to a different commit or this
 908        function will fail."""
 909     result = []
 910     for ref in refs:
 911         commit = get_commit_items(ref, cp(repo_dir))
 912         result.append(commit.author_sec)
 913     return result
 914
 915
 916 def rev_parse(committish, repo_dir=None):
 917     """Resolve the full hash for 'committish', if it exists.
 918
 919     Should be roughly equivalent to 'git rev-parse'.
 920
 921     Returns the hex value of the hash if it is found, None if 'committish' does
 922     not correspond to anything.
 923     """
 924     head = read_ref(committish, repo_dir=repo_dir)
 925     if head:
 926         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 927         return head
 928
 929     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
 930
 931     if len(committish) == 40:
 932         try:
 933             hash = committish.decode('hex')
 934         except TypeError:
 935             return None
 936
 937         if pL.exists(hash):
 938             return hash
 939
 940     return None
 941
 942
 943 def update_ref(refname, newval, oldval, repo_dir=None):
 944     """Update a repository reference."""
 945     if not oldval:
 946         oldval = ''
 947     assert(refname.startswith('refs/heads/') \
 948            or refname.startswith('refs/tags/'))
 949     p = subprocess.Popen(['git', 'update-ref', refname,
 950                           newval.encode('hex'), oldval.encode('hex')],
 951                          preexec_fn = _gitenv(repo_dir))
 952     _git_wait('git update-ref', p)
 953
 954
 955 def delete_ref(refname, oldvalue=None):
 956     """Delete a repository reference (see git update-ref(1))."""
 957     assert(refname.startswith('refs/'))
 958     oldvalue = [] if not oldvalue else [oldvalue]
 959     p = subprocess.Popen(['git', 'update-ref', '-d', refname] + oldvalue,
 960                          preexec_fn = _gitenv())
 961     _git_wait('git update-ref', p)
 962
 963
 964 def guess_repo(path=None):
 965     """Set the path value in the global variable "repodir".
 966     This makes bup look for an existing bup repository, but not fail if a
 967     repository doesn't exist. Usually, if you are interacting with a bup
 968     repository, you would not be calling this function but using
 969     check_repo_or_die().
 970     """
 971     global repodir
 972     if path:
 973         repodir = path
 974     if not repodir:
 975         repodir = os.environ.get('BUP_DIR')
 976         if not repodir:
 977             repodir = os.path.expanduser('~/.bup')
 978
 979
 980 def init_repo(path=None):
 981     """Create the Git bare repository for bup in a given path."""
 982     guess_repo(path)
 983     d = repo()  # appends a / to the path
 984     parent = os.path.dirname(os.path.dirname(d))
 985     if parent and not os.path.exists(parent):
 986         raise GitError('parent directory "%s" does not exist\n' % parent)
 987     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 988         raise GitError('"%s" exists but is not a directory\n' % d)
 989     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 990                          preexec_fn = _gitenv())
 991     _git_wait('git init', p)
 992     # Force the index version configuration in order to ensure bup works
 993     # regardless of the version of the installed Git binary.
 994     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 995                          stdout=sys.stderr, preexec_fn = _gitenv())
 996     _git_wait('git config', p)
 997     # Enable the reflog
 998     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 999                          stdout=sys.stderr, preexec_fn = _gitenv())
1000     _git_wait('git config', p)
1001
1002
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.

    The repository path is settled with guess_repo(path), then its
    'objects/pack/.' entry is stat()ed.  If that entry is missing, an
    error suggesting "bup init" is logged and the process exits with
    status 15; any other OSError is logged and exits with status 14.
    """
    guess_repo(path)
    try:
        os.stat(repo('objects/pack/.'))
    except OSError as e:
        if e.errno != errno.ENOENT:
            log('error: %s\n' % e)
            sys.exit(14)
        log('error: %r is not a bup repository; run "bup init"\n'
            % repo())
        sys.exit(15)
1019
1020
_ver = None


def _ver_sort_key(version):
    """Turn a version tuple of strings into a tuple of ints for comparing.

    Each component contributes its leading run of digits; a component
    with no leading digits counts as -1.
    """
    key = []
    for part in version:
        m = re.match(r'\d+', part)
        key.append(int(m.group(0)) if m else -1)
    return tuple(key)


def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')

    The result of running 'git --version' is cached in the module global
    _ver, so git is only invoked on the first call.

    Raises GitError if the output of 'git --version' cannot be parsed or
    if the version is older than 1.5.3.1.
    """
    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        m = re.match(r'git version (\S+.\S+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    # Compare numerically: as strings, '10' sorts before '5', which would
    # reject a version such as 1.10.0 as too old.
    if _ver_sort_key(_ver) < _ver_sort_key(needed):
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver
1046
1047
def _git_wait(cmd, p):
    """Wait for process p and raise GitError if it exited non-zero.

    cmd is only used to label the error message.
    """
    status = p.wait()
    if status == 0:
        return
    raise GitError('%s returned %d' % (cmd, status))
1052
1053
def _git_capture(argv):
    """Run argv with GIT_DIR set by _gitenv() and return all of its stdout.

    The command's exit status is checked with _git_wait(), labelled with
    repr(argv).
    """
    proc = subprocess.Popen(argv,
                            stdout=subprocess.PIPE,
                            preexec_fn = _gitenv())
    captured = proc.stdout.read()
    _git_wait(repr(argv), proc)
    return captured
1059
1060
class _AbortableIter:
    """Iterator wrapper that fires a callback if iteration ends abnormally.

    'it' is the wrapped iterator; 'onabort', if set, is called at most once
    when the wrapper is aborted before the wrapped iterator was exhausted.
    """
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def next(self):
        """Return the next wrapped item; abort on anything but exhaustion."""
        try:
            return self.it.next()
        except StopIteration:
            # Normal exhaustion: mark finished so abort() becomes a no-op.
            self.done = True
            raise
        except:
            self.abort()
            raise

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if self.done:
            return
        self.done = True
        if self.onabort:
            self.onabort()

    def __del__(self):
        # An iterator dropped before exhaustion counts as aborted.
        self.abort()
1089
1090
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    In __init__, get(id) is bound to _fast_get (one long-lived
    'git cat-file --batch' process) or, for git older than 1.5.6, to
    _slow_get (new 'git cat-file' processes for every object).  Both are
    generators that yield the object type first and then the object's
    data in chunks.
    """
    def __init__(self, repo_dir = None):
        global _ver_warned
        self.repo_dir = repo_dir
        # Always defined, so _abort() is safe whichever getter is in use.
        self.p = self.inprogress = None
        wanted = ('1','5','6')
        if ver() < wanted:
            if not _ver_warned:
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            self.get = self._fast_get

    def _abort(self):
        """Close the pipes of the batch process, if any, and forget it."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        """Drop any current batch process and start a new one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        """Yield the type, then the data chunks, of object id.

        Raises KeyError if git reports the object as missing and GitError
        if the reply header is malformed.  Only one object may be in
        progress at a time.
        """
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            # The reply cannot be trusted any more: drop the process so the
            # next get() starts a fresh one instead of tripping over a
            # stale 'inprogress' marker.
            self._abort()
            raise GitError('expected blob, got %r' % spl)
        obj_type = spl[1]

        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                           onabort = self._abort)
        try:
            yield obj_type
            for blob in it:
                yield blob
            # The reply ends with a single newline after the data.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type, then the data chunks, of object id.

        Runs one 'git cat-file -t' and one 'git cat-file <type>' process
        per object, both against self.repo_dir.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        # Ask for the type in the same repository the data is read from.
        type_argv = ['git', 'cat-file', '-t', id]
        p = subprocess.Popen(type_argv,
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        obj_type = p.stdout.read().strip()
        _git_wait(repr(type_argv), p)
        yield obj_type

        p = subprocess.Popen(['git', 'cat-file', obj_type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield blob data reachable from the object that 'it' describes.

        'it' yields an object type followed by that object's data.  Blobs
        are passed through; trees and commits are read in full and then
        followed recursively through join().
        """
        obj_type = next(it)
        if obj_type == 'blob':
            for blob in it:
                yield blob
        elif obj_type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif obj_type == 'commit':
            # Only the first line ('tree <hex>') of a commit is needed.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % obj_type)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1206
1207
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating and caching it if needed.

    Pipes are cached in the module-level _cp dictionary, keyed by the
    absolute repository path; repo() supplies the path when repo_dir is
    empty.
    """
    key = os.path.abspath(repo_dir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1221
1222
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Names are taken from list_refs(limit_to_tags=True) with the leading
    'refs/tags/' removed.
    """
    prefix = 'refs/tags/'
    by_hash = {}
    for refname, sha in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(refname.startswith('refs/tags/'))
        # more than one tag can point at the same hash
        by_hash.setdefault(sha, []).append(refname[len(prefix):])
    return by_hash
1233
1234
# One object encountered while walking a repository.
#
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
WalkItem = namedtuple('WalkItem',
                      'id type mode path chunk_path data')
1247
1248
def _walk_object(cat_pipe, id,
                 parent_path, chunk_path,
                 mode=None,
                 stop_at=None,
                 include_data=None):
    """Yield a WalkItem for id and, recursively, everything below it.

    Nothing is yielded for an id that stop_at (if given) returns true
    for.  A commit is followed into its tree and then into each of its
    parents; a tree is followed into each of its entries.  Items carry
    their data only when include_data is true.  An object type other
    than blob, commit or tree raises Exception.
    """

    if stop_at and stop_at(id):
        return

    obj_it = cat_pipe.get(id)  # FIXME: use include_data
    kind = obj_it.next()

    if kind not in ('blob', 'commit', 'tree'):
        raise Exception('unexpected repository object type %r' % kind)

    # FIXME: set the mode based on the type when the mode is None

    if include_data or kind != 'blob':
        # Commits and trees are always read in full: they are parsed below.
        payload = ''.join(obj_it)
    else:
        # Dump data until we can ask cat_pipe not to fetch it
        for ignored in obj_it:
            pass
        payload = None

    yield  WalkItem(id=id, type=kind,
                    chunk_path=chunk_path, path=parent_path,
                    mode=mode,
                    data=(payload if include_data else None))

    def descend(child_id, child_path, child_chunk_path, child_mode):
        # Recurse with the same pipe, stop condition and data setting.
        return _walk_object(cat_pipe, child_id,
                            child_path, child_chunk_path,
                            mode=child_mode,
                            stop_at=stop_at,
                            include_data=include_data)

    if kind == 'commit':
        commit_items = parse_commit(payload)
        for item in descend(commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE):
            yield item
        for parent_id in commit_items.parents:
            # Parents are walked with the same mode as this child.
            for item in descend(parent_id, parent_path, chunk_path, mode):
                yield item
    elif kind == 'tree':
        for ent_mode, ent_name, ent_id in tree_decode(payload):
            demangled, bup_type = demangle_name(ent_name, ent_mode)
            if chunk_path:
                # Already inside a chunked file: only the chunk path grows.
                sub_path = parent_path
                sub_chunk_path = chunk_path + [ent_name]
            elif bup_type == BUP_CHUNKED:
                sub_path = parent_path + [ent_name]
                sub_chunk_path = ['']
            else:
                sub_path = parent_path + [ent_name]
                sub_chunk_path = chunk_path
            for item in descend(ent_id.encode('hex'),
                                sub_path, sub_chunk_path, ent_mode):
                yield item
1312
1313
def walk_object(cat_pipe, id,
                stop_at=None,
                include_data=None):
    """Yield everything reachable from id via cat_pipe as a WalkItem,
    stopping whenever stop_at(id) returns true.

    The walk starts with an empty path and an empty chunk path.
    """
    root_path = []
    root_chunk_path = []
    return _walk_object(cat_pipe, id, root_path, root_chunk_path,
                        stop_at=stop_at,
                        include_data=include_data)