lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   7 from collections import namedtuple
   8 from itertools import islice
   9
  10 from bup import _helpers, hashsplit, path, midx, bloom, xstat
  11 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  12                          fdatasync,
  13                          hostname, localtime, log, merge_iter,
  14                          mmap_read, mmap_readwrite,
  15                          progress, qprogress, stat_if_exists,
  16                          unlink, username, userfullname,
  17                          utc_offset_str)
  18
  19
# Limits checked after every object written by PackWriter._write(); reaching
# either one finishes the current pack.
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object

verbose = 0  # when true, PackWriter._write() logs a '>' per object
ignore_midx = 0  # when true, PackIdxList.refresh() always skips .midx files
repodir = None  # The default repository, once initialized

# Object type name <-> numeric type code, as stored in the type bits of a
# pack entry's first header byte (see _encode_packobj/_decode_packobj).
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Running totals of index lookups and of the steps they took; updated by
# PackIdx._idx_from_hash() and PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  32
  33
  34 class GitError(Exception):
  35     pass
  36
  37
  38 def _git_wait(cmd, p):
  39     rv = p.wait()
  40     if rv != 0:
  41         raise GitError('%s returned %d' % (cmd, rv))
  42
  43 def _git_capture(argv):
  44     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
  45     r = p.stdout.read()
  46     _git_wait(repr(argv), p)
  47     return r
  48
  49
  50 def parse_tz_offset(s):
  51     """UTC offset in seconds."""
  52     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  53     if s[0] == '-':
  54         return - tz_off
  55     return tz_off
  56
  57
# Building blocks for the regular expression that parses a commit object.
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# A character allowed as the first or last character of a name/mail string.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
# A character allowed anywhere inside a name/mail string.
_content_char = r'[^\0\n<>]'
# A name/mail string: either one or two start/end characters, or a start/end
# character, any number of content characters, and a start/end character.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone offset: a sign, two digits, then two more digits of which the
# first is limited to 0-5.
_tz_rx = r'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" line, newline included.
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# The whole commit: a tree line, zero or more parent lines, the author and
# committer lines (name, <mail>, seconds, timezone), an empty line, and then
# the message, which takes everything that is left.
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Pulls the hash out of each parent line in the 'parents' group above.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  75
  76
# The parsed form of a commit object, as built by parse_commit().
# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
# The *_offset fields are UTC offsets in seconds (see parse_tz_offset()),
# and parents is the list of parent hashes as hex strings.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
  84
  85 def parse_commit(content):
  86     commit_match = re.match(_commit_rx, content)
  87     if not commit_match:
  88         raise Exception('cannot parse commit %r' % content)
  89     matches = commit_match.groupdict()
  90     return CommitInfo(tree=matches['tree'],
  91                       parents=re.findall(_parent_hash_rx, matches['parents']),
  92                       author_name=matches['author_name'],
  93                       author_mail=matches['author_mail'],
  94                       author_sec=int(matches['asec']),
  95                       author_offset=parse_tz_offset(matches['atz']),
  96                       committer_name=matches['committer_name'],
  97                       committer_mail=matches['committer_mail'],
  98                       committer_sec=int(matches['csec']),
  99                       committer_offset=parse_tz_offset(matches['ctz']),
 100                       message=matches['message'])
 101
 102
 103 def get_commit_items(id, cp):
 104     commit_it = cp.get(id)
 105     assert(commit_it.next() == 'commit')
 106     commit_content = ''.join(commit_it)
 107     return parse_commit(commit_content)
 108
 109
 110 def _local_git_date_str(epoch_sec):
 111     return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
 112
 113
 114 def _git_date_str(epoch_sec, tz_offset_sec):
 115     offs =  tz_offset_sec // 60
 116     return '%d %s%02d%02d' \
 117         % (epoch_sec,
 118            '+' if offs >= 0 else '-',
 119            abs(offs) // 60,
 120            abs(offs) % 60)
 121
 122
 123 def repo(sub = '', repo_dir=None):
 124     """Get the path to the git repository or one of its subdirectories."""
 125     global repodir
 126     repo_dir = repo_dir or repodir
 127     if not repo_dir:
 128         raise GitError('You should call check_repo_or_die()')
 129
 130     # If there's a .git subdirectory, then the actual repo is in there.
 131     gd = os.path.join(repo_dir, '.git')
 132     if os.path.exists(gd):
 133         repodir = gd
 134
 135     return os.path.join(repo_dir, sub)
 136
 137
 138 def shorten_hash(s):
 139     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 140                   r'\1\2*\3', s)
 141
 142
 143 def repo_rel(path):
 144     full = os.path.abspath(path)
 145     fullrepo = os.path.abspath(repo(''))
 146     if not fullrepo.endswith('/'):
 147         fullrepo += '/'
 148     if full.startswith(fullrepo):
 149         path = full[len(fullrepo):]
 150     if path.startswith('index-cache/'):
 151         path = path[len('index-cache/'):]
 152     return shorten_hash(path)
 153
 154
 155 def all_packdirs():
 156     paths = [repo('objects/pack')]
 157     paths += glob.glob(repo('index-cache/*/.'))
 158     return paths
 159
 160
 161 def auto_midx(objdir):
 162     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 163     try:
 164         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 165     except OSError as e:
 166         # make sure 'args' gets printed to help with debugging
 167         add_error('%r: exception: %s' % (args, e))
 168         raise
 169     if rv:
 170         add_error('%r: returned %d' % (args, rv))
 171
 172     args = [path.exe(), 'bloom', '--dir', objdir]
 173     try:
 174         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 175     except OSError as e:
 176         # make sure 'args' gets printed to help with debugging
 177         add_error('%r: exception: %s' % (args, e))
 178         raise
 179     if rv:
 180         add_error('%r: returned %d' % (args, rv))
 181
 182
 183 def mangle_name(name, mode, gitmode):
 184     """Mangle a file name to present an abstract name for segmented files.
 185     Mangled file names will have the ".bup" extension added to them. If a
 186     file's name already ends with ".bup", a ".bupl" extension is added to
 187     disambiguate normal files from segmented ones.
 188     """
 189     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 190         assert(stat.S_ISDIR(gitmode))
 191         return name + '.bup'
 192     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 193         return name + '.bupl'
 194     else:
 195         return name
 196
 197
 198 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 199 def demangle_name(name, mode):
 200     """Remove name mangling from a file name, if necessary.
 201
 202     The return value is a tuple (demangled_filename,mode), where mode is one of
 203     the following:
 204
 205     * BUP_NORMAL  : files that should be read as-is from the repository
 206     * BUP_CHUNKED : files that were chunked and need to be reassembled
 207
 208     For more information on the name mangling algorithm, see mangle_name()
 209     """
 210     if name.endswith('.bupl'):
 211         return (name[:-5], BUP_NORMAL)
 212     elif name.endswith('.bup'):
 213         return (name[:-4], BUP_CHUNKED)
 214     elif name.endswith('.bupm'):
 215         return (name[:-5],
 216                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 217     else:
 218         return (name, BUP_NORMAL)
 219
 220
 221 def calc_hash(type, content):
 222     """Calculate some content's hash in the Git fashion."""
 223     header = '%s %d\0' % (type, len(content))
 224     sum = Sha1(header)
 225     sum.update(content)
 226     return sum.digest()
 227
 228
 229 def shalist_item_sort_key(ent):
 230     (mode, name, id) = ent
 231     assert(mode+0 == mode)
 232     if stat.S_ISDIR(mode):
 233         return name + '/'
 234     else:
 235         return name
 236
 237
 238 def tree_encode(shalist):
 239     """Generate a git tree object from (mode,name,hash) tuples."""
 240     shalist = sorted(shalist, key = shalist_item_sort_key)
 241     l = []
 242     for (mode,name,bin) in shalist:
 243         assert(mode)
 244         assert(mode+0 == mode)
 245         assert(name)
 246         assert(len(bin) == 20)
 247         s = '%o %s\0%s' % (mode,name,bin)
 248         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 249         l.append(s)
 250     return ''.join(l)
 251
 252
 253 def tree_decode(buf):
 254     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 255     ofs = 0
 256     while ofs < len(buf):
 257         z = buf.find('\0', ofs)
 258         assert(z > ofs)
 259         spl = buf[ofs:z].split(' ', 1)
 260         assert(len(spl) == 2)
 261         mode,name = spl
 262         sha = buf[z+1:z+1+20]
 263         ofs = z+1+20
 264         yield (int(mode, 8), name, sha)
 265
 266
 267 def _encode_packobj(type, content, compression_level=1):
 268     if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
 269         raise ValueError('invalid compression level %s' % compression_level)
 270     szout = ''
 271     sz = len(content)
 272     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 273     sz >>= 4
 274     while 1:
 275         if sz: szbits |= 0x80
 276         szout += chr(szbits)
 277         if not sz:
 278             break
 279         szbits = sz & 0x7f
 280         sz >>= 7
 281     z = zlib.compressobj(compression_level)
 282     yield szout
 283     yield z.compress(content)
 284     yield z.flush()
 285
 286
 287 def _encode_looseobj(type, content, compression_level=1):
 288     z = zlib.compressobj(compression_level)
 289     yield z.compress('%s %d\0' % (type, len(content)))
 290     yield z.compress(content)
 291     yield z.flush()
 292
 293
 294 def _decode_looseobj(buf):
 295     assert(buf);
 296     s = zlib.decompress(buf)
 297     i = s.find('\0')
 298     assert(i > 0)
 299     l = s[:i].split(' ')
 300     type = l[0]
 301     sz = int(l[1])
 302     content = s[i+1:]
 303     assert(type in _typemap)
 304     assert(sz == len(content))
 305     return (type, content)
 306
 307
 308 def _decode_packobj(buf):
 309     assert(buf)
 310     c = ord(buf[0])
 311     type = _typermap[(c & 0x70) >> 4]
 312     sz = c & 0x0f
 313     shift = 4
 314     i = 0
 315     while c & 0x80:
 316         i += 1
 317         c = ord(buf[i])
 318         sz |= (c & 0x7f) << shift
 319         shift += 7
 320         if not (c & 0x80):
 321             break
 322     return (type, zlib.decompress(buf[i+1:]))
 323
 324
 325 class PackIdx:
 326     def __init__(self):
 327         assert(0)
 328
 329     def find_offset(self, hash):
 330         """Get the offset of an object inside the index file."""
 331         idx = self._idx_from_hash(hash)
 332         if idx != None:
 333             return self._ofs_from_idx(idx)
 334         return None
 335
 336     def exists(self, hash, want_source=False):
 337         """Return nonempty if the object exists in this index."""
 338         if hash and (self._idx_from_hash(hash) != None):
 339             return want_source and os.path.basename(self.name) or True
 340         return None
 341
 342     def __len__(self):
 343         return int(self.fanout[255])
 344
 345     def _idx_from_hash(self, hash):
 346         global _total_searches, _total_steps
 347         _total_searches += 1
 348         assert(len(hash) == 20)
 349         b1 = ord(hash[0])
 350         start = self.fanout[b1-1] # range -1..254
 351         end = self.fanout[b1] # range 0..255
 352         want = str(hash)
 353         _total_steps += 1  # lookup table is a step
 354         while start < end:
 355             _total_steps += 1
 356             mid = start + (end-start)/2
 357             v = self._idx_to_hash(mid)
 358             if v < want:
 359                 start = mid+1
 360             elif v > want:
 361                 end = mid
 362             else: # got it!
 363                 return mid
 364         return None
 365
 366
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file.

    As read here, the file starts with a fanout table of 256 big-endian
    32-bit counts, followed by one 24-byte record per object: a 4-byte
    offset and then the 20-byte hash.
    """
    def __init__(self, filename, f):
        """Map the already opened index file f; filename is kept as name."""
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        # Makes fanout[-1] read as 0, which is where the search range for
        # hashes with first byte 0 starts (see PackIdx._idx_from_hash).
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        """Return the offset stored in the first 4 bytes of record idx."""
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        """Return the 20-byte hash stored in the last 20 bytes of record idx."""
        return str(self.shatable[idx*24+4 : idx*24+24])

    def __iter__(self):
        """Yield each hash in table order as a 20-byte buffer on the map."""
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
 389
 390
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file.

    As read here, the file is an 8-byte header, a fanout table of 256
    big-endian 32-bit counts, a table of 20-byte hashes, a table of 4 bytes
    per object that this class skips, a table of 4-byte offsets, and finally
    the 8-byte offsets used for entries that do not fit in 31 bits.
    """
    def __init__(self, filename, f):
        """Map the already opened index file f; filename is kept as name."""
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        # Makes fanout[-1] read as 0, which is where the search range for
        # hashes with first byte 0 starts (see PackIdx._idx_from_hash).
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        # The nsha*4 bytes between the hashes and the offsets are skipped.
        # NOTE(review): presumably the per-object CRC table -- confirm
        # against the pack index format documentation.
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
                               nsha*4)
        # Everything after the 4-byte offset table.
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        """Return the pack offset of entry idx.

        If the high bit of the 4-byte value is set, the remaining 31 bits
        are an index into the table of 8-byte offsets.
        """
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        return ofs

    def _idx_to_hash(self, idx):
        """Return the 20-byte hash of entry idx."""
        return str(self.shatable[idx*20:(idx+1)*20])

    def __iter__(self):
        """Yield each hash in table order as a 20-byte buffer on the map."""
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 424
 425
# Number of live PackIdxList objects; the constructor asserts there is
# never more than one.
_mpi_count = 0
class PackIdxList:
    """The set of index files (.idx, .midx and the bloom filter) found in
    one directory, searchable as a single index."""
    def __init__(self, dir):
        """Load the indexes found in dir (see refresh())."""
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()  # extra hashes registered through add()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the merged contents of all loaded indexes."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all loaded indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        # Ask the bloom filter first, but only after a previous lookup
        # missed (do_bloom); a positive bloom answer switches it off again
        # until the next miss.
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file path to the index object that covers it; for a midx
        # that includes the paths of the .idx files it contains.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open every midx not loaded yet; one that refers to a
                # missing .idx file is reported and deleted.
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Largest first, and among equally large ones newest first.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        ix.close()
                        unlink(ix.name)
            # Open the .idx files that nothing in d covers yet.
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # Several paths may map to the same midx, hence the set();
            # the indexes are then ordered largest first.
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # The bloom filter is only used when it is valid and holds at
            # least as many entries as the indexes do.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 555
 556
 557 def open_idx(filename):
 558     if filename.endswith('.idx'):
 559         f = open(filename, 'rb')
 560         header = f.read(8)
 561         if header[0:4] == '\377tOc':
 562             version = struct.unpack('!I', header[4:8])[0]
 563             if version == 2:
 564                 return PackIdxV2(filename, f)
 565             else:
 566                 raise GitError('%s: expected idx file version 2, got %d'
 567                                % (filename, version))
 568         elif len(header) == 8 and header[0:4] < '\377tOc':
 569             return PackIdxV1(filename, f)
 570         else:
 571             raise GitError('%s: unrecognized idx file header' % filename)
 572     elif filename.endswith('.midx'):
 573         return midx.PackMidx(filename)
 574     else:
 575         raise GitError('idx filenames must end with .idx or .midx')
 576
 577
 578 def idxmerge(idxlist, final_progress=True):
 579     """Generate a list of all the objects reachable in a PackIdxList."""
 580     def pfunc(count, total):
 581         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 582                   % (count*100.0/total, count, total))
 583     def pfinal(count, total):
 584         if final_progress:
 585             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 586                      % (100, total, total))
 587     return merge_iter(idxlist, 10024, pfunc, pfinal)
 588
 589
 590 def _make_objcache():
 591     return PackIdxList(repo('objects/pack'))
 592
 593 # bup-gc assumes that it can disable all PackWriter activities
 594 # (bloom/midx/cache) via the constructor and close() arguments.
 595
 596 class PackWriter:
 597     """Writes Git objects inside a pack file."""
 598     def __init__(self, objcache_maker=_make_objcache, compression_level=1,
 599                  run_midx=True, on_pack_finish=None):
 600         self.file = None
 601         self.parentfd = None
 602         self.count = 0
 603         self.outbytes = 0
 604         self.filename = None
 605         self.idx = None
 606         self.objcache_maker = objcache_maker
 607         self.objcache = None
 608         self.compression_level = compression_level
 609         self.run_midx=run_midx
 610         self.on_pack_finish = on_pack_finish
 611
 612     def __del__(self):
 613         self.close()
 614
 615     def _open(self):
 616         if not self.file:
 617             objdir = dir=repo('objects')
 618             fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
 619             try:
 620                 self.file = os.fdopen(fd, 'w+b')
 621             except:
 622                 os.close(fd)
 623                 raise
 624             try:
 625                 self.parentfd = os.open(objdir, os.O_RDONLY)
 626             except:
 627                 f = self.file
 628                 self.file = None
 629                 f.close()
 630                 raise
 631             assert(name.endswith('.pack'))
 632             self.filename = name[:-5]
 633             self.file.write('PACK\0\0\0\2\0\0\0\0')
 634             self.idx = list(list() for i in xrange(256))
 635
 636     def _raw_write(self, datalist, sha):
 637         self._open()
 638         f = self.file
 639         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 640         # the file never has a *partial* blob.  So let's make sure it's
 641         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 642         # to our hashsplit algorithm.)  f.write() does its own buffering,
 643         # but that's okay because we'll flush it in _end().
 644         oneblob = ''.join(datalist)
 645         try:
 646             f.write(oneblob)
 647         except IOError as e:
 648             raise GitError, e, sys.exc_info()[2]
 649         nw = len(oneblob)
 650         crc = zlib.crc32(oneblob) & 0xffffffff
 651         self._update_idx(sha, crc, nw)
 652         self.outbytes += nw
 653         self.count += 1
 654         return nw, crc
 655
 656     def _update_idx(self, sha, crc, size):
 657         assert(sha)
 658         if self.idx:
 659             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 660
 661     def _write(self, sha, type, content):
 662         if verbose:
 663             log('>')
 664         if not sha:
 665             sha = calc_hash(type, content)
 666         size, crc = self._raw_write(_encode_packobj(type, content,
 667                                                     self.compression_level),
 668                                     sha=sha)
 669         if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
 670             self.breakpoint()
 671         return sha
 672
 673     def breakpoint(self):
 674         """Clear byte and object counts and return the last processed id."""
 675         id = self._end(self.run_midx)
 676         self.outbytes = self.count = 0
 677         return id
 678
 679     def _require_objcache(self):
 680         if self.objcache is None and self.objcache_maker:
 681             self.objcache = self.objcache_maker()
 682         if self.objcache is None:
 683             raise GitError(
 684                     "PackWriter not opened or can't check exists w/o objcache")
 685
 686     def exists(self, id, want_source=False):
 687         """Return non-empty if an object is found in the object cache."""
 688         self._require_objcache()
 689         return self.objcache.exists(id, want_source=want_source)
 690
 691     def just_write(self, sha, type, content):
 692         """Write an object to the pack file, bypassing the objcache.  Fails if
 693         sha exists()."""
 694         self._write(sha, type, content)
 695
 696     def maybe_write(self, type, content):
 697         """Write an object to the pack file if not present and return its id."""
 698         sha = calc_hash(type, content)
 699         if not self.exists(sha):
 700             self.just_write(sha, type, content)
 701             self._require_objcache()
 702             self.objcache.add(sha)
 703         return sha
 704
 705     def new_blob(self, blob):
 706         """Create a blob object in the pack with the supplied content."""
 707         return self.maybe_write('blob', blob)
 708
 709     def new_tree(self, shalist):
 710         """Create a tree object in the pack."""
 711         content = tree_encode(shalist)
 712         return self.maybe_write('tree', content)
 713
 714     def new_commit(self, tree, parent,
 715                    author, adate_sec, adate_tz,
 716                    committer, cdate_sec, cdate_tz,
 717                    msg):
 718         """Create a commit object in the pack.  The date_sec values must be
 719         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 720         if adate_tz:
 721             adate_str = _git_date_str(adate_sec, adate_tz)
 722         else:
 723             adate_str = _local_git_date_str(adate_sec)
 724         if cdate_tz:
 725             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 726         else:
 727             cdate_str = _local_git_date_str(cdate_sec)
 728         l = []
 729         if tree: l.append('tree %s' % tree.encode('hex'))
 730         if parent: l.append('parent %s' % parent.encode('hex'))
 731         if author: l.append('author %s %s' % (author, adate_str))
 732         if committer: l.append('committer %s %s' % (committer, cdate_str))
 733         l.append('')
 734         l.append(msg)
 735         return self.maybe_write('commit', '\n'.join(l))
 736
 737     def abort(self):
 738         """Remove the pack file from disk."""
 739         f = self.file
 740         if f:
 741             pfd = self.parentfd
 742             self.file = None
 743             self.parentfd = None
 744             self.idx = None
 745             try:
 746                 try:
 747                     os.unlink(self.filename + '.pack')
 748                 finally:
 749                     f.close()
 750             finally:
 751                 if pfd is not None:
 752                     os.close(pfd)
 753
 754     def _end(self, run_midx=True):
 755         f = self.file
 756         if not f: return None
 757         self.file = None
 758         try:
 759             self.objcache = None
 760             idx = self.idx
 761             self.idx = None
 762
 763             # update object count
 764             f.seek(8)
 765             cp = struct.pack('!i', self.count)
 766             assert(len(cp) == 4)
 767             f.write(cp)
 768
 769             # calculate the pack sha1sum
 770             f.seek(0)
 771             sum = Sha1()
 772             for b in chunkyreader(f):
 773                 sum.update(b)
 774             packbin = sum.digest()
 775             f.write(packbin)
 776             fdatasync(f.fileno())
 777         finally:
 778             f.close()
 779
 780         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 781
 782         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 783         if os.path.exists(self.filename + '.map'):
 784             os.unlink(self.filename + '.map')
 785         os.rename(self.filename + '.pack', nameprefix + '.pack')
 786         os.rename(self.filename + '.idx', nameprefix + '.idx')
 787         try:
 788             os.fsync(self.parentfd)
 789         finally:
 790             os.close(self.parentfd)
 791
 792         if run_midx:
 793             auto_midx(repo('objects/pack'))
 794
 795         if self.on_pack_finish:
 796             self.on_pack_finish(nameprefix)
 797
 798         return nameprefix
 799
 800     def close(self, run_midx=True):
 801         """Close the pack file and move it to its definitive path."""
 802         return self._end(run_midx=run_midx)
 803
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a version 2 pack index to filename and return a name hash.

        idx is iterated as a sequence of sections, each of which is a
        sequence of entries; the table contents themselves are produced by
        _helpers.write_idx().  packbin is appended verbatim after those
        tables, followed by the SHA-1 of everything that precedes it in the
        index file.

        Returns the hex SHA-1 computed over the 20 * self.count bytes that
        follow the 8-byte header and the 256-entry fan-out table.
        """
        # Count the entries whose third field does not fit in 31 bits; each
        # one needs an extra 8-byte slot in the overflow table.
        # NOTE(review): entry[2] is presumably the pack offset -- confirm.
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            # Size the file up front so it can be mapped and filled in place.
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, idx, self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        # Second pass: reopen in append mode to add the trailers and to
        # compute the checksums by reading back what was just written.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            # Header and fan-out table only feed the whole-file checksum.
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            # The next 20 * count bytes feed both checksums; their digest
            # alone becomes the returned name.
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            # Everything else up to EOF (including packbin) is covered by
            # the whole-file checksum, which is then appended.
            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()
 849
 850
 851 def _gitenv(repo_dir = None):
 852     if not repo_dir:
 853         repo_dir = repo()
 854     def env():
 855         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 856     return env
 857
 858
 859 def list_refs(refnames=None, repo_dir=None,
 860               limit_to_heads=False, limit_to_tags=False):
 861     """Yield (refname, hash) tuples for all repository refs unless
 862     refnames are specified.  In that case, only include tuples for
 863     those refs.  The limits restrict the result items to refs/heads or
 864     refs/tags.  If both limits are specified, items from both sources
 865     will be included.
 866
 867     """
 868     argv = ['git', 'show-ref']
 869     if limit_to_heads:
 870         argv.append('--heads')
 871     if limit_to_tags:
 872         argv.append('--tags')
 873     argv.append('--')
 874     if refnames:
 875         argv += refnames
 876     p = subprocess.Popen(argv,
 877                          preexec_fn = _gitenv(repo_dir),
 878                          stdout = subprocess.PIPE)
 879     out = p.stdout.read().strip()
 880     rv = p.wait()  # not fatal
 881     if rv:
 882         assert(not out)
 883     if out:
 884         for d in out.split('\n'):
 885             (sha, name) = d.split(' ', 1)
 886             yield (name, sha.decode('hex'))
 887
 888
 889 def read_ref(refname, repo_dir = None):
 890     """Get the commit id of the most recent commit made on a given ref."""
 891     refs = list_refs(refnames=[refname], repo_dir=repo_dir, limit_to_heads=True)
 892     l = tuple(islice(refs, 2))
 893     if l:
 894         assert(len(l) == 1)
 895         return l[0][1]
 896     else:
 897         return None
 898
 899
 900 def rev_list(ref, count=None, repo_dir=None):
 901     """Generate a list of reachable commits in reverse chronological order.
 902
 903     This generator walks through commits, from child to parent, that are
 904     reachable via the specified ref and yields a series of tuples of the form
 905     (date,hash).
 906
 907     If count is a non-zero integer, limit the number of commits to "count"
 908     objects.
 909     """
 910     assert(not ref.startswith('-'))
 911     opts = []
 912     if count:
 913         opts += ['-n', str(atoi(count))]
 914     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 915     p = subprocess.Popen(argv,
 916                          preexec_fn = _gitenv(repo_dir),
 917                          stdout = subprocess.PIPE)
 918     commit = None
 919     for row in p.stdout:
 920         s = row.strip()
 921         if s.startswith('commit '):
 922             commit = s[7:].decode('hex')
 923         else:
 924             date = int(s)
 925             yield (date, commit)
 926     rv = p.wait()  # not fatal
 927     if rv:
 928         raise GitError, 'git rev-list returned error %d' % rv
 929
 930
 931 def get_commit_dates(refs, repo_dir=None):
 932     """Get the dates for the specified commit refs.  For now, every unique
 933        string in refs must resolve to a different commit or this
 934        function will fail."""
 935     result = []
 936     for ref in refs:
 937         commit = get_commit_items(ref, cp(repo_dir))
 938         result.append(commit.author_sec)
 939     return result
 940
 941
 942 def rev_parse(committish, repo_dir=None):
 943     """Resolve the full hash for 'committish', if it exists.
 944
 945     Should be roughly equivalent to 'git rev-parse'.
 946
 947     Returns the hex value of the hash if it is found, None if 'committish' does
 948     not correspond to anything.
 949     """
 950     head = read_ref(committish, repo_dir=repo_dir)
 951     if head:
 952         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 953         return head
 954
 955     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
 956
 957     if len(committish) == 40:
 958         try:
 959             hash = committish.decode('hex')
 960         except TypeError:
 961             return None
 962
 963         if pL.exists(hash):
 964             return hash
 965
 966     return None
 967
 968
 969 def update_ref(refname, newval, oldval, repo_dir=None):
 970     """Update a repository reference."""
 971     if not oldval:
 972         oldval = ''
 973     assert(refname.startswith('refs/heads/') \
 974            or refname.startswith('refs/tags/'))
 975     p = subprocess.Popen(['git', 'update-ref', refname,
 976                           newval.encode('hex'), oldval.encode('hex')],
 977                          preexec_fn = _gitenv(repo_dir))
 978     _git_wait('git update-ref', p)
 979
 980
 981 def delete_ref(refname, oldvalue=None):
 982     """Delete a repository reference (see git update-ref(1))."""
 983     assert(refname.startswith('refs/'))
 984     oldvalue = [] if not oldvalue else [oldvalue]
 985     p = subprocess.Popen(['git', 'update-ref', '-d', refname] + oldvalue,
 986                          preexec_fn = _gitenv())
 987     _git_wait('git update-ref', p)
 988
 989
 990 def guess_repo(path=None):
 991     """Set the path value in the global variable "repodir".
 992     This makes bup look for an existing bup repository, but not fail if a
 993     repository doesn't exist. Usually, if you are interacting with a bup
 994     repository, you would not be calling this function but using
 995     check_repo_or_die().
 996     """
 997     global repodir
 998     if path:
 999         repodir = path
1000     if not repodir:
1001         repodir = os.environ.get('BUP_DIR')
1002         if not repodir:
1003             repodir = os.path.expanduser('~/.bup')
1004
1005
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    The location is settled by guess_repo(path).  Raises GitError if the
    parent directory is missing, if the target exists but is not a
    directory, or if any of the git commands fails.
    """
    guess_repo(path)
    repo_path = repo()  # appends a / to the path, hence dirname() twice
    parent = os.path.dirname(os.path.dirname(repo_path))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(repo_path) \
       and not os.path.isdir(os.path.join(repo_path, '.')):
        raise GitError('"%s" exists but is not a directory\n' % repo_path)
    steps = (
        ('git init', ['git', '--bare', 'init']),
        # Force the index version configuration in order to ensure bup
        # works regardless of the version of the installed Git binary.
        ('git config', ['git', 'config', 'pack.indexVersion', '2']),
        # Enable the reflog
        ('git config', ['git', 'config', 'core.logAllRefUpdates', 'true']),
    )
    for label, argv in steps:
        proc = subprocess.Popen(argv, stdout=sys.stderr,
                                preexec_fn = _gitenv())
        _git_wait(label, proc)
1027
1028
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not.

    Returns normally when <repo>/objects/pack is a directory.  Otherwise
    an error is logged and the process exits: status 15 when neither that
    directory nor the repository path itself can be stat'ed, status 14 in
    every other case.
    """
    guess_repo(path)
    top = repo()
    pack_st = stat_if_exists(top + '/objects/pack')
    if pack_st and stat.S_ISDIR(pack_st.st_mode):
        return
    if not pack_st and not stat_if_exists(top):
        log('error: repository %r does not exist (see "bup help init")\n'
            % top)
        sys.exit(15)
    log('error: %r is not a repository\n' % top)
    sys.exit(14)
1044
1045
_ver = None
def _ver_key(parts):
    """Return the leading all-digit components of parts as a tuple of ints.

    Used only for ordering version tuples numerically; conversion stops at
    the first component that is not purely digits.
    """
    key = []
    for part in parts:
        if not part.isdigit():
            break
        key.append(int(part))
    return tuple(key)


def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')

    The result of 'git --version' is cached in the module-level _ver after
    the first call.  Raises GitError if git fails, if its output is not
    recognized, or if the version is older than 1.5.3.1.
    """
    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        # The dot is escaped so the capture cannot run across a space into
        # trailing text such as "(Apple Git-130)".
        m = re.match(r'git version (\S+\.\S+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    # Compare numerically: as strings, '10' would sort before '5'.
    if _ver_key(_ver) < _ver_key(needed):
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver
1071
1072
class _AbortableIter:
    """Iterator wrapper that fires a callback when iteration is cut short.

    'it' is the wrapped iterator and 'onabort' an optional callable.  The
    callback runs at most once: when the wrapped iterator raises anything
    other than StopIteration, when abort() is called explicitly, or when
    the wrapper is finalized before the iterator was exhausted.
    """
    def __init__(self, it, onabort = None):
        self.done = None
        self.onabort = onabort
        self.it = it

    def __iter__(self):
        return self

    def next(self):
        """Return the next item, tracking normal and abnormal termination."""
        try:
            item = self.it.next()
        except StopIteration:
            # Normal exhaustion: nothing left to abort.
            self.done = True
            raise
        except:
            # Any other failure counts as an abort; the error still
            # propagates to the caller.
            self.abort()
            raise
        return item

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if self.done:
            return
        self.done = True
        callback = self.onabort
        if callback:
            callback()

    def __del__(self):
        self.abort()
1101
1102
class MissingObject(KeyError):
    """KeyError raised for an object id that the repository lacks.

    The id passed to the constructor is kept on the instance as .id, and
    its hex encoding appears in the exception message.
    """
    def __init__(self, id):
        message = 'object %r is missing' % id.encode('hex')
        KeyError.__init__(self, message)
        self.id = id
1107
1108
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    self.get is bound at construction time: to _fast_get (one persistent
    'git cat-file --batch' process) when ver() reports at least 1.5.6,
    otherwise to _slow_get (separate git invocations per object), with a
    one-time warning logged.  Both getters are generators that yield the
    object's type first and then its content in chunks.
    """
    def __init__(self, repo_dir = None):
        global _ver_warned
        self.repo_dir = repo_dir
        # Defined on both paths so _abort()/restart() never hit a missing
        # attribute.
        self.p = self.inprogress = None
        wanted = ('1','5','6')
        if ver() < wanted:
            if not _ver_warned:
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            self.get = self._fast_get

    def _abort(self):
        """Close the pipes of the batch process, if any, and forget it."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        """Drop any current batch process and start a fresh one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        """Yield the type and then the data of object id via --batch.

        Raises MissingObject when git reports the object as missing and
        GitError when the response header is malformed.  Only one request
        may be in flight at a time; the previous generator must have been
        fully consumed.
        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        # The id is written to git's stdin as a line, so it must not
        # contain line breaks or look like an option.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise MissingObject(id.decode('hex'))
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (_, type, size) = spl

        it = _AbortableIter(chunkyreader(self.p.stdout, int(size)),
                           onabort = self._abort)
        try:
            yield type
            for blob in it:
                yield blob
            # The object data is followed by a bare newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type and then the data of object id, one git per step.

        Both git invocations run against self.repo_dir.  Raises GitError
        (via _git_wait) if either of them fails.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # Query the type against this pipe's own repository; _git_capture()
        # always targets the default one.
        argv = ['git', 'cat-file', '-t', id]
        p = subprocess.Popen(argv,
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        type = p.stdout.read().strip()
        _git_wait(repr(argv), p)
        yield type

        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield blob content reachable from the object that 'it' streams.

        'it' is a getter result: its first item is the object type.  Blobs
        are passed through, trees are expanded entry by entry, and commits
        are followed to their tree.  Any other type raises GitError.
        """
        type = it.next()
        if type == 'blob':
            for blob in it:
                yield blob
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif type == 'commit':
            # The first line of the commit names its tree.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % type)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1224
1225
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating it on first use.

    Pipes are cached in the module-level _cp dictionary under the absolute
    repository path.  Without repo_dir the global repodir is used, or
    repo() when that is unset.
    """
    global _cp, repodir
    key = os.path.abspath(repo_dir or repodir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1239
1240
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Tag names are given without their 'refs/tags/' prefix, and the hashes
    are in the form list_refs() yields them.
    """
    prefix = 'refs/tags/'
    by_hash = {}
    for refname, sha in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(refname.startswith(prefix))
        # more than one tag can point at the same hash
        by_hash.setdefault(sha, []).append(refname[len(prefix):])
    return by_hash
1251
1252
WalkItem = namedtuple('WalkItem', 'id type mode path chunk_path data')
# path holds the mangled path components.  When an item is a fragment
# of a chunked file, chunk_path holds the chunked subtree path leading
# to that fragment, i.e. ['', '2d3115e', ...]; the top-level item of a
# chunked file has a chunk_path of [''].  So some chunk subtree of the
# file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1265
1266
def walk_object(cat_pipe, id,
                stop_at=None,
                include_data=None):
    """Yield everything reachable from id via cat_pipe as a WalkItem,
    stopping whenever stop_at(id) returns true.  Throw MissingObject
    if a hash encountered is missing from the repository, and don't
    read or return blob content in the data field unless include_data
    is set.

    Objects are fetched with cat_pipe.get(id).  Only blobs, commits and
    trees are accepted; any other object type raises Exception.  For a
    commit, the parents and the tree are queued; for a tree, every entry
    is queued under its hex-encoded id.
    """
    # Maintain the pending stack on the heap to avoid stack overflow
    # Each entry is (id, path, chunk_path, mode); the starting object has
    # empty paths and no mode.
    pending = [(id, [], [], None)]
    while len(pending):
        id, parent_path, chunk_path, mode = pending.pop()
        if stop_at and stop_at(id):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(id=id, type='blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        # The first item from the getter is the object type; the rest is
        # the object's content.
        item_it = cat_pipe.get(id)
        type = item_it.next()
        if type not in ('blob', 'commit', 'tree'):
            raise Exception('unexpected repository object type %r' % type)

        # FIXME: set the mode based on the type when the mode is None
        if type == 'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees are always read in full, because their
            # content is needed below to find what they reference.
            data = ''.join(item_it)

        yield WalkItem(id=id, type=type,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if type == 'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            # Pushed last, so the commit's tree is popped before any of
            # the parents queued above.
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif type == 'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    # Already inside a chunked file: the path stays fixed
                    # and only the chunk path grows.
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Entering a chunked file: start its chunk path.
                        sub_chunk_path = ['']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,
                                mode))