lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   7 from collections import namedtuple
   8 from itertools import islice
   9
  10 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  11 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  12                          fdatasync,
  13                          hostname, localtime, log, merge_iter,
  14                          mmap_read, mmap_readwrite,
  15                          progress, qprogress, stat_if_exists,
  16                          unlink, username, userfullname,
  17                          utc_offset_str)
  18
  19 verbose = 0
  20 ignore_midx = 0
  21 repodir = None  # The default repository, once initialized
  22
  23 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  24 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  25
  26 _total_searches = 0
  27 _total_steps = 0
  28
  29
  30 class GitError(Exception):
  31     pass
  32
  33
  34 def _git_wait(cmd, p):
  35     rv = p.wait()
  36     if rv != 0:
  37         raise GitError('%s returned %d' % (cmd, rv))
  38
  39 def _git_capture(argv):
  40     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
  41     r = p.stdout.read()
  42     _git_wait(repr(argv), p)
  43     return r
  44
  45
  46 def parse_tz_offset(s):
  47     """UTC offset in seconds."""
  48     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  49     if s[0] == '-':
  50         return - tz_off
  51     return tz_off
  52
  53
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
# Characters allowed at the start and end of a name/mail string: anything
# except space, common punctuation, angle brackets, quotes, NUL and newline.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
# Characters allowed in the middle: anything except NUL, newline and
# angle brackets.
_content_char = r'[^\0\n<>]'
# A "safe" string is either one or two start/end characters, or a longer
# run bracketed by start/end characters.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone offset: a sign, two hour digits and two minute digits (00-59).
_tz_rx = r'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# A whole commit object: the tree line, zero or more parent lines, the
# author and committer lines, a blank line, and then the message (which
# takes everything that remains).
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Used by parse_commit() to pull each hash out of the 'parents' group above.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  71
  72
  73 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
  74 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
  75                                        'author_name', 'author_mail',
  76                                        'author_sec', 'author_offset',
  77                                        'committer_name', 'committer_mail',
  78                                        'committer_sec', 'committer_offset',
  79                                        'message'])
  80
  81 def parse_commit(content):
  82     commit_match = re.match(_commit_rx, content)
  83     if not commit_match:
  84         raise Exception('cannot parse commit %r' % content)
  85     matches = commit_match.groupdict()
  86     return CommitInfo(tree=matches['tree'],
  87                       parents=re.findall(_parent_hash_rx, matches['parents']),
  88                       author_name=matches['author_name'],
  89                       author_mail=matches['author_mail'],
  90                       author_sec=int(matches['asec']),
  91                       author_offset=parse_tz_offset(matches['atz']),
  92                       committer_name=matches['committer_name'],
  93                       committer_mail=matches['committer_mail'],
  94                       committer_sec=int(matches['csec']),
  95                       committer_offset=parse_tz_offset(matches['ctz']),
  96                       message=matches['message'])
  97
  98
  99 def get_commit_items(id, cp):
 100     commit_it = cp.get(id)
 101     assert(commit_it.next() == 'commit')
 102     commit_content = ''.join(commit_it)
 103     return parse_commit(commit_content)
 104
 105
 106 def _local_git_date_str(epoch_sec):
 107     return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 108
 109
 110 def _git_date_str(epoch_sec, tz_offset_sec):
 111     offs =  tz_offset_sec // 60
 112     return '%d %s%02d%02d' \
 113         % (epoch_sec,
 114            '+' if offs >= 0 else '-',
 115            abs(offs) // 60,
 116            abs(offs) % 60)
 117
 118
 119 def repo(sub = '', repo_dir=None):
 120     """Get the path to the git repository or one of its subdirectories."""
 121     global repodir
 122     repo_dir = repo_dir or repodir
 123     if not repo_dir:
 124         raise GitError('You should call check_repo_or_die()')
 125
 126     # If there's a .git subdirectory, then the actual repo is in there.
 127     gd = os.path.join(repo_dir, '.git')
 128     if os.path.exists(gd):
 129         repodir = gd
 130
 131     return os.path.join(repo_dir, sub)
 132
 133
 134 def shorten_hash(s):
 135     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 136                   r'\1\2*\3', s)
 137
 138
 139 def repo_rel(path):
 140     full = os.path.abspath(path)
 141     fullrepo = os.path.abspath(repo(''))
 142     if not fullrepo.endswith('/'):
 143         fullrepo += '/'
 144     if full.startswith(fullrepo):
 145         path = full[len(fullrepo):]
 146     if path.startswith('index-cache/'):
 147         path = path[len('index-cache/'):]
 148     return shorten_hash(path)
 149
 150
 151 def all_packdirs():
 152     paths = [repo('objects/pack')]
 153     paths += glob.glob(repo('index-cache/*/.'))
 154     return paths
 155
 156
 157 def auto_midx(objdir):
 158     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 159     try:
 160         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 161     except OSError as e:
 162         # make sure 'args' gets printed to help with debugging
 163         add_error('%r: exception: %s' % (args, e))
 164         raise
 165     if rv:
 166         add_error('%r: returned %d' % (args, rv))
 167
 168     args = [path.exe(), 'bloom', '--dir', objdir]
 169     try:
 170         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 171     except OSError as e:
 172         # make sure 'args' gets printed to help with debugging
 173         add_error('%r: exception: %s' % (args, e))
 174         raise
 175     if rv:
 176         add_error('%r: returned %d' % (args, rv))
 177
 178
 179 def mangle_name(name, mode, gitmode):
 180     """Mangle a file name to present an abstract name for segmented files.
 181     Mangled file names will have the ".bup" extension added to them. If a
 182     file's name already ends with ".bup", a ".bupl" extension is added to
 183     disambiguate normal files from segmented ones.
 184     """
 185     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 186         assert(stat.S_ISDIR(gitmode))
 187         return name + '.bup'
 188     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 189         return name + '.bupl'
 190     else:
 191         return name
 192
 193
 194 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 195 def demangle_name(name, mode):
 196     """Remove name mangling from a file name, if necessary.
 197
 198     The return value is a tuple (demangled_filename,mode), where mode is one of
 199     the following:
 200
 201     * BUP_NORMAL  : files that should be read as-is from the repository
 202     * BUP_CHUNKED : files that were chunked and need to be reassembled
 203
 204     For more information on the name mangling algorithm, see mangle_name()
 205     """
 206     if name.endswith('.bupl'):
 207         return (name[:-5], BUP_NORMAL)
 208     elif name.endswith('.bup'):
 209         return (name[:-4], BUP_CHUNKED)
 210     elif name.endswith('.bupm'):
 211         return (name[:-5],
 212                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 213     else:
 214         return (name, BUP_NORMAL)
 215
 216
 217 def calc_hash(type, content):
 218     """Calculate some content's hash in the Git fashion."""
 219     header = '%s %d\0' % (type, len(content))
 220     sum = Sha1(header)
 221     sum.update(content)
 222     return sum.digest()
 223
 224
 225 def shalist_item_sort_key(ent):
 226     (mode, name, id) = ent
 227     assert(mode+0 == mode)
 228     if stat.S_ISDIR(mode):
 229         return name + '/'
 230     else:
 231         return name
 232
 233
 234 def tree_encode(shalist):
 235     """Generate a git tree object from (mode,name,hash) tuples."""
 236     shalist = sorted(shalist, key = shalist_item_sort_key)
 237     l = []
 238     for (mode,name,bin) in shalist:
 239         assert(mode)
 240         assert(mode+0 == mode)
 241         assert(name)
 242         assert(len(bin) == 20)
 243         s = '%o %s\0%s' % (mode,name,bin)
 244         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 245         l.append(s)
 246     return ''.join(l)
 247
 248
 249 def tree_decode(buf):
 250     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 251     ofs = 0
 252     while ofs < len(buf):
 253         z = buf.find('\0', ofs)
 254         assert(z > ofs)
 255         spl = buf[ofs:z].split(' ', 1)
 256         assert(len(spl) == 2)
 257         mode,name = spl
 258         sha = buf[z+1:z+1+20]
 259         ofs = z+1+20
 260         yield (int(mode, 8), name, sha)
 261
 262
 263 def _encode_packobj(type, content, compression_level=1):
 264     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 265         raise ValueError('invalid compression level %s' % compression_level)
 266     szout = ''
 267     sz = len(content)
 268     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 269     sz >>= 4
 270     while 1:
 271         if sz: szbits |= 0x80
 272         szout += chr(szbits)
 273         if not sz:
 274             break
 275         szbits = sz & 0x7f
 276         sz >>= 7
 277     z = zlib.compressobj(compression_level)
 278     yield szout
 279     yield z.compress(content)
 280     yield z.flush()
 281
 282
 283 def _encode_looseobj(type, content, compression_level=1):
 284     z = zlib.compressobj(compression_level)
 285     yield z.compress('%s %d\0' % (type, len(content)))
 286     yield z.compress(content)
 287     yield z.flush()
 288
 289
 290 def _decode_looseobj(buf):
 291     assert(buf);
 292     s = zlib.decompress(buf)
 293     i = s.find('\0')
 294     assert(i > 0)
 295     l = s[:i].split(' ')
 296     type = l[0]
 297     sz = int(l[1])
 298     content = s[i+1:]
 299     assert(type in _typemap)
 300     assert(sz == len(content))
 301     return (type, content)
 302
 303
 304 def _decode_packobj(buf):
 305     assert(buf)
 306     c = ord(buf[0])
 307     type = _typermap[(c & 0x70) >> 4]
 308     sz = c & 0x0f
 309     shift = 4
 310     i = 0
 311     while c & 0x80:
 312         i += 1
 313         c = ord(buf[i])
 314         sz |= (c & 0x7f) << shift
 315         shift += 7
 316         if not (c & 0x80):
 317             break
 318     return (type, zlib.decompress(buf[i+1:]))
 319
 320
 321 class PackIdx:
 322     def __init__(self):
 323         assert(0)
 324
 325     def find_offset(self, hash):
 326         """Get the offset of an object inside the index file."""
 327         idx = self._idx_from_hash(hash)
 328         if idx != None:
 329             return self._ofs_from_idx(idx)
 330         return None
 331
 332     def exists(self, hash, want_source=False):
 333         """Return nonempty if the object exists in this index."""
 334         if hash and (self._idx_from_hash(hash) != None):
 335             return want_source and os.path.basename(self.name) or True
 336         return None
 337
 338     def __len__(self):
 339         return int(self.fanout[255])
 340
 341     def _idx_from_hash(self, hash):
 342         global _total_searches, _total_steps
 343         _total_searches += 1
 344         assert(len(hash) == 20)
 345         b1 = ord(hash[0])
 346         start = self.fanout[b1-1] # range -1..254
 347         end = self.fanout[b1] # range 0..255
 348         want = str(hash)
 349         _total_steps += 1  # lookup table is a step
 350         while start < end:
 351             _total_steps += 1
 352             mid = start + (end-start)/2
 353             v = self._idx_to_hash(mid)
 354             if v < want:
 355                 start = mid+1
 356             elif v > want:
 357                 end = mid
 358             else: # got it!
 359                 return mid
 360         return None
 361
 362
 363 class PackIdxV1(PackIdx):
 364     """Object representation of a Git pack index (version 1) file."""
 365     def __init__(self, filename, f):
 366         self.name = filename
 367         self.idxnames = [self.name]
 368         self.map = mmap_read(f)
 369         self.fanout = list(struct.unpack('!256I',
 370                                          str(buffer(self.map, 0, 256*4))))
 371         self.fanout.append(0)  # entry "-1"
 372         nsha = self.fanout[255]
 373         self.sha_ofs = 256*4
 374         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 375
 376     def _ofs_from_idx(self, idx):
 377         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 378
 379     def _idx_to_hash(self, idx):
 380         return str(self.shatable[idx*24+4 : idx*24+24])
 381
 382     def __iter__(self):
 383         for i in xrange(self.fanout[255]):
 384             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 385
 386
 387 class PackIdxV2(PackIdx):
 388     """Object representation of a Git pack index (version 2) file."""
 389     def __init__(self, filename, f):
 390         self.name = filename
 391         self.idxnames = [self.name]
 392         self.map = mmap_read(f)
 393         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 394         self.fanout = list(struct.unpack('!256I',
 395                                          str(buffer(self.map, 8, 256*4))))
 396         self.fanout.append(0)  # entry "-1"
 397         nsha = self.fanout[255]
 398         self.sha_ofs = 8 + 256*4
 399         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 400         self.ofstable = buffer(self.map,
 401                                self.sha_ofs + nsha*20 + nsha*4,
 402                                nsha*4)
 403         self.ofs64table = buffer(self.map,
 404                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 405
 406     def _ofs_from_idx(self, idx):
 407         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 408         if ofs & 0x80000000:
 409             idx64 = ofs & 0x7fffffff
 410             ofs = struct.unpack('!Q',
 411                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 412         return ofs
 413
 414     def _idx_to_hash(self, idx):
 415         return str(self.shatable[idx*20:(idx+1)*20])
 416
 417     def __iter__(self):
 418         for i in xrange(self.fanout[255]):
 419             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 420
 421
# Number of live PackIdxList instances; the constructor asserts it is 0.
_mpi_count = 0
class PackIdxList:
    """Searchable collection of the pack indexes found in one directory.

    Holds the .idx and .midx objects loaded from self.dir (see refresh()),
    an optional bloom filter used to short-circuit lookups, and a set of
    extra hashes registered through add().
    """
    def __init__(self, dir):
        """Load the indexes in dir; only one instance may exist at a time."""
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()       # hashes added via add()
        self.packs = []         # loaded index objects, see refresh()
        self.do_bloom = False   # whether exists() should consult the bloom
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the merged contents of all loaded indexes."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Sum of the lengths of all loaded indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Hashes registered with add() yield True.  Otherwise the result is
        whatever the first matching index's exists() returns, or None.
        """
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Possible hit: search the indexes, and skip the bloom until
                # a lookup misses again (do_bloom is re-armed below).
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file name to the index object that covers it; start from
        # what is already loaded (minus midxes when they are being skipped).
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # Record every .idx covered by an already-loaded midx.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open midx files not seen yet; delete any that refer to a
                # missing .idx.
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the biggest (then newest) midx files first.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        ix.close()
                        unlink(ix.name)
            # Load every .idx not already covered; unreadable ones are
            # reported through add_error() and skipped.
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # Several names may map to the same index object; keep each
            # object once, largest first.
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # Only use the bloom when it is valid and holds at least as many
            # entries as the indexes do.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 551
 552
 553 def open_idx(filename):
 554     if filename.endswith('.idx'):
 555         f = open(filename, 'rb')
 556         header = f.read(8)
 557         if header[0:4] == '\377tOc':
 558             version = struct.unpack('!I', header[4:8])[0]
 559             if version == 2:
 560                 return PackIdxV2(filename, f)
 561             else:
 562                 raise GitError('%s: expected idx file version 2, got %d'
 563                                % (filename, version))
 564         elif len(header) == 8 and header[0:4] < '\377tOc':
 565             return PackIdxV1(filename, f)
 566         else:
 567             raise GitError('%s: unrecognized idx file header' % filename)
 568     elif filename.endswith('.midx'):
 569         return midx.PackMidx(filename)
 570     else:
 571         raise GitError('idx filenames must end with .idx or .midx')
 572
 573
 574 def idxmerge(idxlist, final_progress=True):
 575     """Generate a list of all the objects reachable in a PackIdxList."""
 576     def pfunc(count, total):
 577         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 578                   % (count*100.0/total, count, total))
 579     def pfinal(count, total):
 580         if final_progress:
 581             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 582                      % (100, total, total))
 583     return merge_iter(idxlist, 10024, pfunc, pfinal)
 584
 585
 586 def _make_objcache():
 587     return PackIdxList(repo('objects/pack'))
 588
 589 # bup-gc assumes that it can disable all PackWriter activities
 590 # (bloom/midx/cache) via the constructor and close() arguments.
 591
 592 class PackWriter:
 593     """Writes Git objects inside a pack file."""
 594     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 595                  run_midx=True, on_pack_finish=None,
 596                  max_pack_size=None, max_pack_objects=None):
 597         self.repo_dir = repo()
 598         self.file = None
 599         self.parentfd = None
 600         self.count = 0
 601         self.outbytes = 0
 602         self.filename = None
 603         self.idx = None
 604         self.objcache_maker = objcache_maker
 605         self.objcache = None
 606         self.compression_level = compression_level
 607         self.run_midx=run_midx
 608         self.on_pack_finish = on_pack_finish
 609         # larger packs will slow down pruning
 610         self.max_pack_size = max_pack_size if max_pack_size \
 611                              else 1000 * 1000 * 1000
 612         # cache memory usage is about 83 bytes per object
 613         self.max_pack_objects = max_pack_objects if max_pack_objects \
 614                                 else max(1, self.max_pack_size // 5000)
 615
 616     def __del__(self):
 617         self.close()
 618
 619     def _open(self):
 620         if not self.file:
 621             objdir = dir = os.path.join(self.repo_dir, 'objects')
 622             fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
 623             try:
 624                 self.file = os.fdopen(fd, 'w+b')
 625             except:
 626                 os.close(fd)
 627                 raise
 628             try:
 629                 self.parentfd = os.open(objdir, os.O_RDONLY)
 630             except:
 631                 f = self.file
 632                 self.file = None
 633                 f.close()
 634                 raise
 635             assert(name.endswith('.pack'))
 636             self.filename = name[:-5]
 637             self.file.write('PACK\0\0\0\2\0\0\0\0')
 638             self.idx = list(list() for i in xrange(256))
 639
 640     def _raw_write(self, datalist, sha):
 641         self._open()
 642         f = self.file
 643         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 644         # the file never has a *partial* blob.  So let's make sure it's
 645         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 646         # to our hashsplit algorithm.)  f.write() does its own buffering,
 647         # but that's okay because we'll flush it in _end().
 648         oneblob = ''.join(datalist)
 649         try:
 650             f.write(oneblob)
 651         except IOError as e:
 652             raise GitError, e, sys.exc_info()[2]
 653         nw = len(oneblob)
 654         crc = zlib.crc32(oneblob) & 0xffffffff
 655         self._update_idx(sha, crc, nw)
 656         self.outbytes += nw
 657         self.count += 1
 658         return nw, crc
 659
 660     def _update_idx(self, sha, crc, size):
 661         assert(sha)
 662         if self.idx:
 663             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 664
 665     def _write(self, sha, type, content):
 666         if verbose:
 667             log('>')
 668         if not sha:
 669             sha = calc_hash(type, content)
 670         size, crc = self._raw_write(_encode_packobj(type, content,
 671                                                     self.compression_level),
 672                                     sha=sha)
 673         if self.outbytes >= self.max_pack_size \
 674            or self.count >= self.max_pack_objects:
 675             self.breakpoint()
 676         return sha
 677
 678     def breakpoint(self):
 679         """Clear byte and object counts and return the last processed id."""
 680         id = self._end(self.run_midx)
 681         self.outbytes = self.count = 0
 682         return id
 683
 684     def _require_objcache(self):
 685         if self.objcache is None and self.objcache_maker:
 686             self.objcache = self.objcache_maker()
 687         if self.objcache is None:
 688             raise GitError(
 689                     "PackWriter not opened or can't check exists w/o objcache")
 690
 691     def exists(self, id, want_source=False):
 692         """Return non-empty if an object is found in the object cache."""
 693         self._require_objcache()
 694         return self.objcache.exists(id, want_source=want_source)
 695
 696     def just_write(self, sha, type, content):
 697         """Write an object to the pack file, bypassing the objcache.  Fails if
 698         sha exists()."""
 699         self._write(sha, type, content)
 700
 701     def maybe_write(self, type, content):
 702         """Write an object to the pack file if not present and return its id."""
 703         sha = calc_hash(type, content)
 704         if not self.exists(sha):
 705             self.just_write(sha, type, content)
 706             self._require_objcache()
 707             self.objcache.add(sha)
 708         return sha
 709
 710     def new_blob(self, blob):
 711         """Create a blob object in the pack with the supplied content."""
 712         return self.maybe_write('blob', blob)
 713
 714     def new_tree(self, shalist):
 715         """Create a tree object in the pack."""
 716         content = tree_encode(shalist)
 717         return self.maybe_write('tree', content)
 718
 719     def new_commit(self, tree, parent,
 720                    author, adate_sec, adate_tz,
 721                    committer, cdate_sec, cdate_tz,
 722                    msg):
 723         """Create a commit object in the pack.  The date_sec values must be
 724         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 725         if adate_tz:
 726             adate_str = _git_date_str(adate_sec, adate_tz)
 727         else:
 728             adate_str = _local_git_date_str(adate_sec)
 729         if cdate_tz:
 730             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 731         else:
 732             cdate_str = _local_git_date_str(cdate_sec)
 733         l = []
 734         if tree: l.append('tree %s' % tree.encode('hex'))
 735         if parent: l.append('parent %s' % parent.encode('hex'))
 736         if author: l.append('author %s %s' % (author, adate_str))
 737         if committer: l.append('committer %s %s' % (committer, cdate_str))
 738         l.append('')
 739         l.append(msg)
 740         return self.maybe_write('commit', '\n'.join(l))
 741
 742     def abort(self):
 743         """Remove the pack file from disk."""
 744         f = self.file
 745         if f:
 746             pfd = self.parentfd
 747             self.file = None
 748             self.parentfd = None
 749             self.idx = None
 750             try:
 751                 try:
 752                     os.unlink(self.filename + '.pack')
 753                 finally:
 754                     f.close()
 755             finally:
 756                 if pfd is not None:
 757                     os.close(pfd)
 758
 759     def _end(self, run_midx=True):
 760         f = self.file
 761         if not f: return None
 762         self.file = None
 763         try:
 764             self.objcache = None
 765             idx = self.idx
 766             self.idx = None
 767
 768             # update object count
 769             f.seek(8)
 770             cp = struct.pack('!i', self.count)
 771             assert(len(cp) == 4)
 772             f.write(cp)
 773
 774             # calculate the pack sha1sum
 775             f.seek(0)
 776             sum = Sha1()
 777             for b in chunkyreader(f):
 778                 sum.update(b)
 779             packbin = sum.digest()
 780             f.write(packbin)
 781             fdatasync(f.fileno())
 782         finally:
 783             f.close()
 784
 785         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 786         nameprefix = os.path.join(self.repo_dir,
 787                                   'objects/pack/pack-' +  obj_list_sha)
 788         if os.path.exists(self.filename + '.map'):
 789             os.unlink(self.filename + '.map')
 790         os.rename(self.filename + '.pack', nameprefix + '.pack')
 791         os.rename(self.filename + '.idx', nameprefix + '.idx')
 792         try:
 793             os.fsync(self.parentfd)
 794         finally:
 795             os.close(self.parentfd)
 796
 797         if run_midx:
 798             auto_midx(os.path.join(self.repo_dir, 'objects/pack'))
 799
 800         if self.on_pack_finish:
 801             self.on_pack_finish(nameprefix)
 802
 803         return nameprefix
 804
 805     def close(self, run_midx=True):
 806         """Close the pack file and move it to its definitive path."""
 807         return self._end(run_midx=run_midx)
 808
 809     def _write_pack_idx_v2(self, filename, idx, packbin):
 810         ofs64_count = 0
 811         for section in idx:
 812             for entry in section:
 813                 if entry[2] >= 2**31:
 814                     ofs64_count += 1
 815
 816         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 817         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 818         idx_map = None
 819         idx_f = open(filename, 'w+b')
 820         try:
 821             idx_f.truncate(index_len)
 822             fdatasync(idx_f.fileno())
 823             idx_map = mmap_readwrite(idx_f, close=False)
 824             try:
 825                 count = _helpers.write_idx(filename, idx_map, idx, self.count)
 826                 assert(count == self.count)
 827                 idx_map.flush()
 828             finally:
 829                 idx_map.close()
 830         finally:
 831             idx_f.close()
 832
 833         idx_f = open(filename, 'a+b')
 834         try:
 835             idx_f.write(packbin)
 836             idx_f.seek(0)
 837             idx_sum = Sha1()
 838             b = idx_f.read(8 + 4*256)
 839             idx_sum.update(b)
 840
 841             obj_list_sum = Sha1()
 842             for b in chunkyreader(idx_f, 20*self.count):
 843                 idx_sum.update(b)
 844                 obj_list_sum.update(b)
 845             namebase = obj_list_sum.hexdigest()
 846
 847             for b in chunkyreader(idx_f):
 848                 idx_sum.update(b)
 849             idx_f.write(idx_sum.digest())
 850             fdatasync(idx_f.fileno())
 851             return namebase
 852         finally:
 853             idx_f.close()
 854
 855
 856 def _gitenv(repo_dir = None):
 857     if not repo_dir:
 858         repo_dir = repo()
 859     def env():
 860         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 861     return env
 862
 863
 864 def list_refs(refnames=None, repo_dir=None,
 865               limit_to_heads=False, limit_to_tags=False):
 866     """Yield (refname, hash) tuples for all repository refs unless
 867     refnames are specified.  In that case, only include tuples for
 868     those refs.  The limits restrict the result items to refs/heads or
 869     refs/tags.  If both limits are specified, items from both sources
 870     will be included.
 871
 872     """
 873     argv = ['git', 'show-ref']
 874     if limit_to_heads:
 875         argv.append('--heads')
 876     if limit_to_tags:
 877         argv.append('--tags')
 878     argv.append('--')
 879     if refnames:
 880         argv += refnames
 881     p = subprocess.Popen(argv,
 882                          preexec_fn = _gitenv(repo_dir),
 883                          stdout = subprocess.PIPE)
 884     out = p.stdout.read().strip()
 885     rv = p.wait()  # not fatal
 886     if rv:
 887         assert(not out)
 888     if out:
 889         for d in out.split('\n'):
 890             (sha, name) = d.split(' ', 1)
 891             yield (name, sha.decode('hex'))
 892
 893
 894 def read_ref(refname, repo_dir = None):
 895     """Get the commit id of the most recent commit made on a given ref."""
 896     refs = list_refs(refnames=[refname], repo_dir=repo_dir, limit_to_heads=True)
 897     l = tuple(islice(refs, 2))
 898     if l:
 899         assert(len(l) == 1)
 900         return l[0][1]
 901     else:
 902         return None
 903
 904
 905 def rev_list(ref, count=None, repo_dir=None):
 906     """Generate a list of reachable commits in reverse chronological order.
 907
 908     This generator walks through commits, from child to parent, that are
 909     reachable via the specified ref and yields a series of tuples of the form
 910     (date,hash).
 911
 912     If count is a non-zero integer, limit the number of commits to "count"
 913     objects.
 914     """
 915     assert(not ref.startswith('-'))
 916     opts = []
 917     if count:
 918         opts += ['-n', str(atoi(count))]
 919     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 920     p = subprocess.Popen(argv,
 921                          preexec_fn = _gitenv(repo_dir),
 922                          stdout = subprocess.PIPE)
 923     commit = None
 924     for row in p.stdout:
 925         s = row.strip()
 926         if s.startswith('commit '):
 927             commit = s[7:].decode('hex')
 928         else:
 929             date = int(s)
 930             yield (date, commit)
 931     rv = p.wait()  # not fatal
 932     if rv:
 933         raise GitError, 'git rev-list returned error %d' % rv
 934
 935
 936 def get_commit_dates(refs, repo_dir=None):
 937     """Get the dates for the specified commit refs.  For now, every unique
 938        string in refs must resolve to a different commit or this
 939        function will fail."""
 940     result = []
 941     for ref in refs:
 942         commit = get_commit_items(ref, cp(repo_dir))
 943         result.append(commit.author_sec)
 944     return result
 945
 946
 947 def rev_parse(committish, repo_dir=None):
 948     """Resolve the full hash for 'committish', if it exists.
 949
 950     Should be roughly equivalent to 'git rev-parse'.
 951
 952     Returns the hex value of the hash if it is found, None if 'committish' does
 953     not correspond to anything.
 954     """
 955     head = read_ref(committish, repo_dir=repo_dir)
 956     if head:
 957         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 958         return head
 959
 960     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
 961
 962     if len(committish) == 40:
 963         try:
 964             hash = committish.decode('hex')
 965         except TypeError:
 966             return None
 967
 968         if pL.exists(hash):
 969             return hash
 970
 971     return None
 972
 973
 974 def update_ref(refname, newval, oldval, repo_dir=None):
 975     """Update a repository reference."""
 976     if not oldval:
 977         oldval = ''
 978     assert(refname.startswith('refs/heads/') \
 979            or refname.startswith('refs/tags/'))
 980     p = subprocess.Popen(['git', 'update-ref', refname,
 981                           newval.encode('hex'), oldval.encode('hex')],
 982                          preexec_fn = _gitenv(repo_dir))
 983     _git_wait('git update-ref', p)
 984
 985
 986 def delete_ref(refname, oldvalue=None):
 987     """Delete a repository reference (see git update-ref(1))."""
 988     assert(refname.startswith('refs/'))
 989     oldvalue = [] if not oldvalue else [oldvalue]
 990     p = subprocess.Popen(['git', 'update-ref', '-d', refname] + oldvalue,
 991                          preexec_fn = _gitenv())
 992     _git_wait('git update-ref', p)
 993
 994
 995 def guess_repo(path=None):
 996     """Set the path value in the global variable "repodir".
 997     This makes bup look for an existing bup repository, but not fail if a
 998     repository doesn't exist. Usually, if you are interacting with a bup
 999     repository, you would not be calling this function but using
1000     check_repo_or_die().
1001     """
1002     global repodir
1003     if path:
1004         repodir = path
1005     if not repodir:
1006         repodir = os.environ.get('BUP_DIR')
1007         if not repodir:
1008             repodir = os.path.expanduser('~/.bup')
1009
1010
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the parent of the repository path does not
    exist, if the path exists but is not a directory, or (via
    _git_wait) if any of the git commands fails.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    # Two dirname() calls: one for the trailing slash, one for the
    # repository directory itself.
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)

    steps = (
        ('git init', ['git', '--bare', 'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        ('git config', ['git', 'config', 'pack.indexVersion', '2']),
        # Enable the reflog
        ('git config', ['git', 'config', 'core.logAllRefUpdates', 'true']),
    )
    for label, argv in steps:
        proc = subprocess.Popen(argv, stdout=sys.stderr,
                                preexec_fn = _gitenv())
        _git_wait(label, proc)
1032
1033
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not.

    The repository is accepted when <repo>/objects/pack is a
    directory.  Otherwise an error is logged and the process exits:
    with status 15 if the repository path itself does not exist, or 14
    if something is there but it is not a repository.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + '/objects/pack')
    if pack_st and stat.S_ISDIR(pack_st.st_mode):
        return
    if not pack_st and not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %r is not a repository\n' % top)
    sys.exit(14)
1049
1050
_ver = None
def ver():
    """Get Git's version and ensure a usable version is installed.

    The version is obtained from 'git --version' and, once parsed, is
    cached in the module global _ver so later calls do not run git
    again.

    The returned version is formatted as an ordered tuple of strings,
    one per dot-separated component of the version tag. For example,
    the following tuple would represent version 1.6.6.9:

        ('1', '6', '6', '9')

    Raises GitError if the output of 'git --version' cannot be parsed
    or if the installed version is older than 1.5.3.1.
    """
    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        # The dot must be escaped.  An unescaped '.' also matches a
        # space, so output such as "git version 2.39.3 (Apple Git-146)"
        # was captured as "2.39.3 (Apple" and polluted the last
        # component of the tuple.
        m = re.match(r'git version (\S+\.\S+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    # NOTE: the components are strings, so this comparison is
    # lexicographic per component rather than numeric.
    if _ver < needed:
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver
1076
1077
1078 class _AbortableIter:
1079     def __init__(self, it, onabort = None):
1080         self.it = it
1081         self.onabort = onabort
1082         self.done = None
1083
1084     def __iter__(self):
1085         return self
1086
1087     def next(self):
1088         try:
1089             return self.it.next()
1090         except StopIteration as e:
1091             self.done = True
1092             raise
1093         except:
1094             self.abort()
1095             raise
1096
1097     def abort(self):
1098         """Abort iteration and call the abortion callback, if needed."""
1099         if not self.done:
1100             self.done = True
1101             if self.onabort:
1102                 self.onabort()
1103
1104     def __del__(self):
1105         self.abort()
1106
1107
class MissingObject(KeyError):
    """Raised when an object id cannot be found in the repository.

    The offending id is kept, as passed in, in the 'id' attribute; the
    exception message shows it hex-encoded.
    """
    def __init__(self, id):
        self.id = id
        message = 'object %r is missing' % id.encode('hex')
        KeyError.__init__(self, message)
1112
1113
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    The constructor selects an implementation for self.get() based on
    the installed git version: for git >= 1.5.6 a single long-running
    'git cat-file --batch' subprocess is reused across requests
    (_fast_get); for older versions a new git process is spawned for
    every object (_slow_get), and a warning is logged once per process.

    self.get(id) returns an iterator whose first item is the object's
    type name and whose remaining items are chunks of its content.
    With the batch implementation only one such iterator may be in
    progress at a time.
    """
    def __init__(self, repo_dir = None):
        global _ver_warned
        self.repo_dir = repo_dir
        wanted = ('1','5','6')
        if ver() < wanted:
            if not _ver_warned:
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            self.p = self.inprogress = None
            self.get = self._fast_get

    def _abort(self):
        """Drop the batch subprocess (if any) and clear in-progress state."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        """Discard any existing batch subprocess and start a new one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        """Yield the type, then the content chunks, of object 'id'.

        Uses the shared 'git cat-file --batch' subprocess, (re)starting
        it if needed.  Raises MissingObject if git reports the object
        as missing and GitError if the response header is malformed.
        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        # The id is written verbatim on a line of its own, so it must
        # not contain line breaks or look like an option.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise MissingObject(id.decode('hex'))
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            # The state of the pipe is unknown after an unexpected
            # header.  Drop the subprocess so the next get() starts
            # fresh instead of failing the "not inprogress" assertion.
            self._abort()
            raise GitError('expected blob, got %r' % spl)
        objtype = spl[1]

        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                           onabort = self._abort)
        try:
            yield objtype
            for blob in it:
                yield blob
            # The content is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type, then the content chunks, of object 'id'.

        Spawns one git process to learn the type and another to read
        the content.  Raises GitError (via _git_wait) if either fails.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # Query the type in this pipe's own repository.  The
        # _git_capture() helper always targets the default repository,
        # which is wrong when repo_dir was given.
        argv = ['git', 'cat-file', '-t', id]
        p = subprocess.Popen(argv,
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        objtype = p.stdout.read().strip()
        _git_wait(repr(argv), p)
        yield objtype

        p = subprocess.Popen(['git', 'cat-file', objtype, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield blob content reachable from the object behind 'it'.

        'it' is an iterator as returned by self.get(): the type first,
        then content chunks.  Trees are expanded entry by entry and
        commits through their tree.  Raises GitError for any other
        object type.
        """
        objtype = it.next()
        if objtype == 'blob':
            for blob in it:
                yield blob
        elif objtype == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif objtype == 'commit':
            # The first line of the commit is expected to be
            # "tree <hex>".
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % objtype)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1228             log('booger!\n')
1229
1230
_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one.

    Pipes are cached in the module-level _cp dict, keyed by the
    absolute repository path.  When repo_dir is not given, the global
    repodir is used, falling back to repo().
    """
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    pipe = _cp.get(repo_dir)
    if pipe:
        return pipe
    pipe = CatPipe(repo_dir)
    _cp[repo_dir] = pipe
    return pipe
1244
1245
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Tag names are given without the leading 'refs/tags/'.  Several
    tags may point at the same hash, hence the list values.
    """
    by_hash = {}
    for refname, target in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(refname.startswith('refs/tags/'))
        short_name = refname[10:]  # strip 'refs/tags/'
        by_hash.setdefault(target, []).append(short_name)
    return by_hash
1256
1257
# One item produced by walk_object().
#
# path is the mangled path.  When an item represents a fragment of a
# chunked file, chunk_path is the chunked subtree path for that
# fragment, e.g. ['', '2d3115e', ...]; the top-level item of a chunked
# file has a chunk_path of [''].  So some chunk subtree of the file
# '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
WalkItem = namedtuple(
    'WalkItem',
    ('id', 'type', 'mode', 'path', 'chunk_path', 'data'))
1268 #   item.type = 'tree'
1269 #   ...
1270
1271
def walk_object(cat_pipe, id,
                stop_at=None,
                include_data=None):
    """Yield a WalkItem for everything reachable from id via cat_pipe.

    Traversal is pruned wherever stop_at(id) returns true: that object
    is neither yielded nor descended into.  Blob content is only read
    into the item's data field when include_data is set; otherwise
    data is None for every item.

    Raises MissingObject if a hash encountered is missing from the
    repository, and Exception for any object that is not a blob,
    commit or tree.
    """
    # An explicit work list rather than recursion, so that deep
    # histories and trees cannot overflow the interpreter stack.
    todo = [(id, [], [], None)]
    while todo:
        oid, item_path, chunk_path, mode = todo.pop()
        if stop_at and stop_at(oid):
            continue

        if mode and stat.S_ISREG(mode) and not include_data:
            # A regular file is a leaf in the graph, so there is no
            # need to fetch it unless the caller asked for the data.
            yield WalkItem(id=oid, type='blob',
                           chunk_path=chunk_path, path=item_path,
                           mode=mode,
                           data=None)
            continue

        reader = cat_pipe.get(oid)
        kind = reader.next()
        if kind not in ('blob', 'commit', 'tree'):
            raise Exception('unexpected repository object type %r' % kind)

        # FIXME: set the mode based on the type when the mode is None
        if include_data or kind != 'blob':
            data = ''.join(reader)
        else:
            # Dump data until we can ask cat_pipe not to fetch it
            for _unused in reader:
                pass
            data = None

        if include_data:
            reported = data
        else:
            reported = None
        yield WalkItem(id=oid, type=kind,
                       chunk_path=chunk_path, path=item_path,
                       mode=mode,
                       data=reported)

        if kind == 'commit':
            commit = parse_commit(data)
            for parent_id in commit.parents:
                todo.append((parent_id, item_path, chunk_path, mode))
            todo.append((commit.tree, item_path, chunk_path,
                         hashsplit.GIT_MODE_TREE))
        elif kind == 'tree':
            for ent_mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, ent_mode)
                if chunk_path:
                    # Already inside a chunked file: the file's path
                    # stays put and only the chunk path grows.
                    sub_path = item_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = item_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = ['']
                    else:
                        sub_chunk_path = chunk_path
                todo.append((ent_id.encode('hex'), sub_path, sub_chunk_path,
                             ent_mode))