1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
6 from bup.helpers import *
7 from bup import _helpers, path, midx, bloom, xstat
9 max_pack_size = 1000*1000*1000 # larger packs will slow down pruning
10 max_pack_objects = 200*1000 # cache memory usage is about 83 bytes per object
11 SEEK_END=2 # os.SEEK_END is not defined in python 2.4
15 home_repodir = os.path.expanduser('~/.bup')
18 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
19 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
# Path of the active repository; None until check_repo_or_die()/guess_repo().
repodir = None


class GitError(Exception):
    """Base exception for errors raised by this module."""
    pass


def repo(sub = ''):
    """Get the path to the git repository or one of its subdirectories.

    Raises GitError if no repository has been selected yet.
    """
    global repodir
    if not repodir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):
        repodir = gd

    return os.path.join(repodir, sub)
def shorten_hash(s):
    """Abbreviate any 40-char lowercase hex hashes in s to 7 chars + '*'."""
    return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
                  r'\1\2*\3', s)


def repo_rel(path):
    """Make path relative to the current repo (or its index-cache) for
    display purposes, and shorten any embedded hashes."""
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(''))
    if not fullrepo.endswith('/'):
        fullrepo += '/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith('index-cache/'):
        path = path[len('index-cache/'):]
    return shorten_hash(path)
def all_packdirs():
    """Return the directories that may contain pack files: the repo's own
    objects/pack directory plus any remote index-cache directories.

    NOTE(review): the def/return lines were elided from this excerpt;
    reconstructed to match the visible body.
    """
    paths = [repo('objects/pack')]
    paths += glob.glob(repo('index-cache/*/.'))
    return paths
def auto_midx(objdir):
    """Regenerate the .midx and bloom files for objdir, best-effort.

    Failures are recorded via add_error() instead of raised, so the caller
    keeps going even if the subcommands fail.
    """
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 1
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 1
    if rv:
        add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # A regular file stored as a (non-regular) chunked git object.
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        # Avoid clashing with names that already look mangled.
        return name + '.bupl'
    else:
        return name
(BUP_NORMAL, BUP_CHUNKED) = (0, 1)


def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one
    of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion.

    NOTE(review): the tail of this function was elided from the excerpt;
    reconstructed assuming Sha1 (from bup.helpers, star-imported above).
    """
    header = '%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
130 def _shalist_sort_key(ent):
131 (mode, name, id) = ent
132 assert(mode+0 == mode)
133 if stat.S_ISDIR(mode):
139 def tree_encode(shalist):
140 """Generate a git tree object from (mode,name,hash) tuples."""
141 shalist = sorted(shalist, key = _shalist_sort_key)
143 for (mode,name,bin) in shalist:
145 assert(mode+0 == mode)
147 assert(len(bin) == 20)
148 s = '%o %s\0%s' % (mode,name,bin)
149 assert(s[0] != '0') # 0-padded octal is not acceptable in a git tree
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        # Each entry is '<octal mode> <name>\0' followed by a 20-byte sha.
        z = buf[ofs:].find('\0')
        assert(z > 0)
        spl = buf[ofs:ofs+z].split(' ', 1)
        assert(len(spl) == 2)
        (mode, name) = spl
        sha = buf[ofs+z+1:ofs+z+1+20]
        ofs += z + 1 + 20
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content):
    """Yield the pack encoding of an object: a variable-length size/type
    header followed by the zlib-deflated content."""
    szout = ''
    sz = len(content)
    # First byte: high bit = continuation, 3 type bits, low 4 size bits.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f  # subsequent bytes carry 7 size bits each
        sz >>= 7
    z = zlib.compressobj(1)
    yield szout
    yield z.compress(content)
    yield z.flush()
def _encode_looseobj(type, content):
    """Yield the zlib-compressed loose-object encoding of content:
    a '<type> <size>\\0' header followed by the raw content.

    NOTE(review): the final flush was elided from this excerpt; without it
    the compressed stream would be truncated.
    """
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()
def _decode_looseobj(buf):
    """Decode a zlib-compressed loose object; return (type, content)."""
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find('\0')
    assert(i > 0)
    l = s[:i].split(' ')  # header is '<type> <size>'
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))  # declared size must match actual content
    return (type, content)
def _decode_packobj(buf):
    """Decode a pack-encoded object; return (type, content)."""
    assert(buf)
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    # High bit set means another size byte follows (7 bits per byte).
    while c & 0x80:
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
# ---------------------------------------------------------------------------
# PackIdx base-class methods.  NOTE(review): the 'class PackIdx' header and
# several interior lines are elided from this excerpt; the code below is
# kept byte-identical to the dump.
# ---------------------------------------------------------------------------
228 def find_offset(self, hash):
229 """Get the offset of an object inside the index file."""
230 idx = self._idx_from_hash(hash)
# (elided: None check) -- convert the table index to a pack-file offset.
232 return self._ofs_from_idx(idx)
235 def exists(self, hash, want_source=False):
236 """Return nonempty if the object exists in this index."""
237 if hash and (self._idx_from_hash(hash) != None):
# Truthy result: the idx basename when want_source, otherwise True.
238 return want_source and os.path.basename(self.name) or True
# __len__ (def line elided): object count is the last fanout entry.
242 return int(self.fanout[255])
244 def _idx_from_hash(self, hash):
245 global _total_searches, _total_steps
247 assert(len(hash) == 20)
# The fanout table narrows the binary search to hashes that share the
# first byte (b1, assigned on an elided line).
249 start = self.fanout[b1-1] # range -1..254
250 end = self.fanout[b1] # range 0..255
252 _total_steps += 1 # lookup table is a step
# Binary-search midpoint (py2 integer division via '/').
255 mid = start + (end-start)/2
256 v = self._idx_to_hash(mid)
266 class PackIdxV1(PackIdx):
267 """Object representation of a Git pack index (version 1) file."""
268 def __init__(self, filename, f):
# (lines elided) -- mmaps the idx file; the first 1024 bytes are the
# 256-entry big-endian fanout table.
270 self.idxnames = [self.name]
271 self.map = mmap_read(f)
272 self.fanout = list(struct.unpack('!256I',
273 str(buffer(self.map, 0, 256*4))))
274 self.fanout.append(0) # entry "-1"
275 nsha = self.fanout[255]
# V1 layout: 24-byte records = 4-byte offset followed by 20-byte sha.
277 self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
279 def _ofs_from_idx(self, idx):
280 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
282 def _idx_to_hash(self, idx):
283 return str(self.shatable[idx*24+4 : idx*24+24])
# __iter__ (def line elided): yields each 20-byte sha in table order.
286 for i in xrange(self.fanout[255]):
287 yield buffer(self.map, 256*4 + 24*i + 4, 20)
290 class PackIdxV2(PackIdx):
291 """Object representation of a Git pack index (version 2) file."""
292 def __init__(self, filename, f):
294 self.idxnames = [self.name]
295 self.map = mmap_read(f)
# Magic '\377tOc' plus big-endian version number 2.
296 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
297 self.fanout = list(struct.unpack('!256I',
298 str(buffer(self.map, 8, 256*4))))
299 self.fanout.append(0) # entry "-1"
300 nsha = self.fanout[255]
301 self.sha_ofs = 8 + 256*4
# V2 layout after the fanout: sha table (20*n), crc table (4*n),
# 32-bit offset table (4*n), then the 64-bit offset table.
302 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
303 self.ofstable = buffer(self.map,
304 self.sha_ofs + nsha*20 + nsha*4,
306 self.ofs64table = buffer(self.map,
307 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
309 def _ofs_from_idx(self, idx):
310 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
# Top bit set: the real 64-bit offset lives in ofs64table (elided 'if').
312 idx64 = ofs & 0x7fffffff
313 ofs = struct.unpack('!Q',
314 str(buffer(self.ofs64table, idx64*8, 8)))[0]
317 def _idx_to_hash(self, idx):
318 return str(self.shatable[idx*20:(idx+1)*20])
# __iter__ (def line elided): yields each 20-byte sha in table order.
321 for i in xrange(self.fanout[255]):
322 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
# PackIdxList methods.  NOTE(review): the class header and several lines
# are elided from this excerpt; code kept byte-identical.
327 def __init__(self, dir):
329 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
334 self.do_bloom = False
# __del__ (elided): decrements the module-global instance count.
341 assert(_mpi_count == 0)
# __iter__ (elided def line): merged iteration over all contained packs.
344 return iter(idxmerge(self.packs))
# __len__ (elided def line): total object count across all packs.
347 return sum(len(pack) for pack in self.packs)
349 def exists(self, hash, want_source=False):
350 """Return nonempty if the object exists in the index files."""
351 global _total_searches
353 if hash in self.also:
# A bloom-filter miss is definitive; a hit still needs a pack lookup.
355 if self.do_bloom and self.bloom:
356 if self.bloom.exists(hash):
357 self.do_bloom = False
359 _total_searches -= 1 # was counted by bloom
361 for i in xrange(len(self.packs)):
363 _total_searches -= 1 # will be incremented by sub-pack
364 ix = p.exists(hash, want_source=want_source)
366 # reorder so most recently used packs are searched first
367 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
372 def refresh(self, skip_midx = False):
373 """Refresh the index list.
374 This method verifies if .midx files were superseded (e.g. all of its
375 contents are in another, bigger .midx file) and removes the superseded
378 If skip_midx is True, all work on .midx files will be skipped and .midx
379 files will be removed from the list.
381 The module-global variable 'ignore_midx' can force this function to
382 always act as if skip_midx was True.
384 self.bloom = None # Always reopen the bloom as it may have been replaced
385 self.do_bloom = False
386 skip_midx = skip_midx or ignore_midx
# Keep already-open indexes (minus midxes when skipping) to avoid reopening.
387 d = dict((p.name, p) for p in self.packs
388 if not skip_midx or not isinstance(p, midx.PackMidx))
389 if os.path.exists(self.dir):
392 for ix in self.packs:
393 if isinstance(ix, midx.PackMidx):
394 for name in ix.idxnames:
395 d[os.path.join(self.dir, name)] = ix
396 for full in glob.glob(os.path.join(self.dir,'*.midx')):
398 mx = midx.PackMidx(full)
399 (mxd, mxf) = os.path.split(mx.name)
# Warn about midx files that reference missing component indexes.
401 for n in mx.idxnames:
402 if not os.path.exists(os.path.join(mxd, n)):
403 log(('warning: index %s missing\n' +
404 ' used by %s\n') % (n, mxf))
# Prefer the largest (then most recently modified) midx files first.
411 midxl.sort(key=lambda ix:
412 (-len(ix), -xstat.stat(ix.name).st_mtime))
415 for sub in ix.idxnames:
416 found = d.get(os.path.join(self.dir, sub))
417 if not found or isinstance(found, PackIdx):
418 # doesn't exist, or exists but not in a midx
423 for name in ix.idxnames:
424 d[os.path.join(self.dir, name)] = ix
425 elif not ix.force_keep:
426 debug1('midx: removing redundant: %s\n'
427 % os.path.basename(ix.name))
429 for full in glob.glob(os.path.join(self.dir,'*.idx')):
437 bfull = os.path.join(self.dir, 'bup.bloom')
438 if self.bloom is None and os.path.exists(bfull):
439 self.bloom = bloom.ShaBloom(bfull)
440 self.packs = list(set(d.values()))
# Biggest packs first: objects are most likely found with fewest searches.
441 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
442 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
446 debug1('PackIdxList: using %d index%s.\n'
447 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
# add() (def line elided):
450 """Insert an additional object in the list."""
def open_idx(filename):
    """Open a pack index (.idx v1/v2) or midx file and return an object
    representation of it.

    Raises GitError for unrecognized headers or filenames.
    """
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            # v1 idx files have no magic; they start with a fanout table
            # whose first entry sorts below the v2 magic bytes.
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        # Progress while merging.
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # Final status line, suppressed when final_progress is False.
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default objcache factory for PackWriter: the repo's own pack indexes."""
    return PackIdxList(repo('objects/pack'))
# PackWriter.  NOTE(review): the class header and many interior lines are
# elided from this excerpt; code kept byte-identical.
491 """Writes Git objects inside a pack file."""
492 def __init__(self, objcache_maker=_make_objcache):
498 self.objcache_maker = objcache_maker
# _open (def line elided): lazily creates the temporary .pack file.
506 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
507 self.file = os.fdopen(fd, 'w+b')
508 assert(name.endswith('.pack'))
509 self.filename = name[:-5]
# Pack header: 'PACK', version 2, object count 0 (patched later in _end()).
510 self.file.write('PACK\0\0\0\2\0\0\0\0')
511 self.idx = list(list() for i in xrange(256))
513 def _raw_write(self, datalist, sha):
516 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
517 # the file never has a *partial* blob. So let's make sure it's
518 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
519 # to our hashsplit algorithm.) f.write() does its own buffering,
520 # but that's okay because we'll flush it in _end().
521 oneblob = ''.join(datalist)
# py2 three-argument raise: rethrow as GitError with the same traceback.
525 raise GitError, e, sys.exc_info()[2]
527 crc = zlib.crc32(oneblob) & 0xffffffff
528 self._update_idx(sha, crc, nw)
533 def _update_idx(self, sha, crc, size):
# Bucket by the sha's first byte, mirroring the idx fanout layout.
536 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
538 def _write(self, sha, type, content):
542 sha = calc_hash(type, content)
543 size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
# Roll over to a new pack once the size or object-count limit is hit.
544 if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
548 def breakpoint(self):
549 """Clear byte and object counts and return the last processed id."""
551 self.outbytes = self.count = 0
554 def _require_objcache(self):
# Lazily build the objcache; without one, exists() cannot be answered.
555 if self.objcache is None and self.objcache_maker:
556 self.objcache = self.objcache_maker()
557 if self.objcache is None:
559 "PackWriter not opened or can't check exists w/o objcache")
561 def exists(self, id, want_source=False):
562 """Return non-empty if an object is found in the object cache."""
563 self._require_objcache()
564 return self.objcache.exists(id, want_source=want_source)
566 def maybe_write(self, type, content):
567 """Write an object to the pack file if not present and return its id."""
568 sha = calc_hash(type, content)
569 if not self.exists(sha):
570 self._write(sha, type, content)
571 self._require_objcache()
572 self.objcache.add(sha)
575 def new_blob(self, blob):
576 """Create a blob object in the pack with the supplied content."""
577 return self.maybe_write('blob', blob)
579 def new_tree(self, shalist):
580 """Create a tree object in the pack."""
581 content = tree_encode(shalist)
582 return self.maybe_write('tree', content)
584 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
# Assemble the commit object line by line; every field is optional here.
586 if tree: l.append('tree %s' % tree.encode('hex'))
587 if parent: l.append('parent %s' % parent.encode('hex'))
588 if author: l.append('author %s %s' % (author, _git_date(adate)))
589 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
592 return self.maybe_write('commit', '\n'.join(l))
594 def new_commit(self, parent, tree, date, msg):
595 """Create a commit object in the pack."""
596 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
597 commit = self._new_commit(tree, parent,
598 userline, date, userline, date,
# abort() (def line elided): discards the temporary pack file.
603 """Remove the pack file from disk."""
609 os.unlink(self.filename + '.pack')
611 def _end(self, run_midx=True):
613 if not f: return None
619 # update object count
# Seek back into the header and patch in the real object count.
621 cp = struct.pack('!i', self.count)
625 # calculate the pack sha1sum
628 for b in chunkyreader(f):
630 packbin = sum.digest()
# Write the matching .idx, then rename both files to their sha-derived
# final names under objects/pack/.
634 obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
636 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
637 if os.path.exists(self.filename + '.map'):
638 os.unlink(self.filename + '.map')
639 os.rename(self.filename + '.pack', nameprefix + '.pack')
640 os.rename(self.filename + '.idx', nameprefix + '.idx')
643 auto_midx(repo('objects/pack'))
646 def close(self, run_midx=True):
647 """Close the pack file and move it to its definitive path."""
648 return self._end(run_midx=run_midx)
650 def _write_pack_idx_v2(self, filename, idx, packbin):
651 idx_f = open(filename, 'w+b')
652 idx_f.write('\377tOc\0\0\0\2')
# Reserve space up to the 64-bit-offset section, fill via mmap + helper.
654 ofs64_ofs = 8 + 4*256 + 28*self.count
655 idx_f.truncate(ofs64_ofs)
657 idx_map = mmap_readwrite(idx_f, close=False)
658 idx_f.seek(0, SEEK_END)
659 count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
660 assert(count == self.count)
# The pack is named after the sha1 of its sorted object list.
666 b = idx_f.read(8 + 4*256)
669 obj_list_sum = Sha1()
670 for b in chunkyreader(idx_f, 20*self.count):
672 obj_list_sum.update(b)
673 namebase = obj_list_sum.hexdigest()
# Trailing checksum of the idx file itself.
675 for b in chunkyreader(idx_f):
677 idx_f.write(idx_sum.digest())
# _git_date (def line elided): format a timestamp as git's 'epoch tz'.
684 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
# _gitenv (def line elided): point child git processes at our repository.
688 os.environ['GIT_DIR'] = os.path.abspath(repo())
691 def list_refs(refname = None):
692 """Generate a list of tuples in the form (refname,hash).
693 If a ref name is specified, list only this particular ref.
695 argv = ['git', 'show-ref', '--']
698 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
699 out = p.stdout.read().strip()
700 rv = p.wait() # not fatal
# Each output line is '<hex sha> <refname>'.
704 for d in out.split('\n'):
705 (sha, name) = d.split(' ', 1)
# py2: hex-decode the sha into its 20-byte binary form.
706 yield (name, sha.decode('hex'))
709 def read_ref(refname):
710 """Get the commit id of the most recent commit made on a given ref."""
711 l = list(list_refs(refname))
719 def rev_list(ref, count=None):
720 """Generate a list of reachable commits in reverse chronological order.
722 This generator walks through commits, from child to parent, that are
723 reachable via the specified ref and yields a series of tuples of the form
726 If count is a non-zero integer, limit the number of commits to "count"
# Refuse refs that git would parse as command-line options.
729 assert(not ref.startswith('-'))
732 opts += ['-n', str(atoi(count))]
733 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
734 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
# Parse alternating 'commit <hex>' / '<timestamp>' output lines.
738 if s.startswith('commit '):
739 commit = s[7:].decode('hex')
743 rv = p.wait() # not fatal
745 raise GitError, 'git rev-list returned error %d' % rv
748 def rev_get_date(ref):
749 """Get the date of the latest commit on the specified ref."""
750 for (date, commit) in rev_list(ref, count=1):
752 raise GitError, 'no such commit %r' % ref
755 def rev_parse(committish):
756 """Resolve the full hash for 'committish', if it exists.
758 Should be roughly equivalent to 'git rev-parse'.
760 Returns the hex value of the hash if it is found, None if 'committish' does
761 not correspond to anything.
# First try as a ref name, then fall back to the pack indexes below.
763 head = read_ref(committish)
765 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
768 pL = PackIdxList(repo('objects/pack'))
# A 40-char argument may itself be a full hex object id.
770 if len(committish) == 40:
772 hash = committish.decode('hex')
782 def update_ref(refname, newval, oldval):
783 """Change the commit pointed to by a branch."""
# Only branch heads may be updated through this helper.
786 assert(refname.startswith('refs/heads/'))
787 p = subprocess.Popen(['git', 'update-ref', refname,
788 newval.encode('hex'), oldval.encode('hex')],
789 preexec_fn = _gitenv)
790 _git_wait('git update-ref', p)
793 def guess_repo(path=None):
794 """Set the path value in the global variable "repodir".
795 This makes bup look for an existing bup repository, but not fail if a
796 repository doesn't exist. Usually, if you are interacting with a bup
797 repository, you would not be calling this function but using
# Fall back to $BUP_DIR, then ~/.bup, when no explicit path is given.
804 repodir = os.environ.get('BUP_DIR')
806 repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError when the parent directory is missing or the target
    exists but is not a directory.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # Fixed: the original message used '%d' on a string path, which
        # would raise TypeError instead of reporting the real problem.
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
828 def check_repo_or_die(path=None):
829 """Make sure a bup repository exists, and abort if not.
830 If the path to a particular repository was not specified, this function
831 initializes the default repository automatically.
# A stat-able objects/pack/ directory is the litmus test for a valid repo.
835 os.stat(repo('objects/pack/.'))
837 if e.errno == errno.ENOENT:
838 if repodir != home_repodir:
839 log('error: %r is not a bup/git repository\n' % repo())
844 log('error: %s\n' % e)
# ver() (def line elided below):
850 """Get Git's version and ensure a usable version is installed.
852 The returned version is formatted as an ordered tuple with each position
853 representing a digit in the version tag. For example, the following tuple
854 would represent version 1.6.6.9:
860 p = subprocess.Popen(['git', '--version'],
861 stdout=subprocess.PIPE)
862 gvs = p.stdout.read()
863 _git_wait('git --version', p)
864 m = re.match(r'git version (\S+.\S+)', gvs)
866 raise GitError('git --version weird output: %r' % gvs)
# NOTE(review): components stay strings, so the (elided) comparison with
# 'needed' is lexicographic per component -- confirm for versions >= 10.
867 _ver = tuple(m.group(1).split('.'))
868 needed = ('1','5', '3', '1')
870 raise GitError('git version %s or higher is required; you have %s'
871 % ('.'.join(needed), '.'.join(_ver)))
875 def _git_wait(cmd, p):
# (elided: rv = p.wait()) -- raise GitError if the subprocess failed.
878 raise GitError('%s returned %d' % (cmd, rv))
881 def _git_capture(argv):
# Run a git command and return its stdout, raising on nonzero exit.
882 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
884 _git_wait(repr(argv), p)
888 class _AbortableIter:
# Wraps an iterator so consumption can be cleanly aborted part-way,
# invoking an optional callback (CatPipe uses this to reset cat-file).
889 def __init__(self, it, onabort = None):
891 self.onabort = onabort
# next() (def line elided): delegate to the wrapped iterator.
899 return self.it.next()
900 except StopIteration, e:
908 """Abort iteration and call the abortion callback, if needed."""
# CatPipe.  NOTE(review): the class header and many lines are elided from
# this excerpt; code kept byte-identical.
920 """Link to 'git cat-file' that is used to retrieve blob data."""
# Old git versions lack 'cat-file --batch'; fall back to one-shot calls.
923 wanted = ('1','5','6')
926 log('warning: git version < %s; bup will be slow.\n'
929 self.get = self._slow_get
931 self.p = self.inprogress = None
932 self.get = self._fast_get
# _abort/_restart (def lines elided): tear down and relaunch the child.
936 self.p.stdout.close()
939 self.inprogress = None
943 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
944 stdin=subprocess.PIPE,
945 stdout=subprocess.PIPE,
948 preexec_fn = _gitenv)
950 def _fast_get(self, id):
951 if not self.p or self.p.poll() != None:
954 assert(self.p.poll() == None)
956 log('_fast_get: opening %r while %r is open'
957 % (id, self.inprogress))
# Only one object may be streamed from the subprocess at a time.
958 assert(not self.inprogress)
# Reject ids that would corrupt the line-based protocol.
959 assert(id.find('\n') < 0)
960 assert(id.find('\r') < 0)
961 assert(not id.startswith('-'))
963 self.p.stdin.write('%s\n' % id)
965 hdr = self.p.stdout.readline()
966 if hdr.endswith(' missing\n'):
967 self.inprogress = None
968 raise KeyError('blob %r is missing' % id)
# Batch header format: '<sha> <type> <size>'.
970 if len(spl) != 3 or len(spl[0]) != 40:
971 raise GitError('expected blob, got %r' % spl)
972 (hex, type, size) = spl
974 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
975 onabort = self._abort)
# --batch output terminates each object with a newline.
980 assert(self.p.stdout.readline() == '\n')
981 self.inprogress = None
986 def _slow_get(self, id):
987 assert(id.find('\n') < 0)
988 assert(id.find('\r') < 0)
# Two subprocess round-trips per object: one for type, one for content.
990 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
993 p = subprocess.Popen(['git', 'cat-file', type, id],
994 stdout=subprocess.PIPE,
995 preexec_fn = _gitenv)
996 for blob in chunkyreader(p.stdout):
998 _git_wait('git cat-file', p)
1000 def _join(self, it):
# (elided: read type and first chunk) -- dispatch on the object type,
# yielding raw blob data for blobs and recursing for trees/commits.
1005 elif type == 'tree':
1006 treefile = ''.join(it)
1007 for (mode, name, sha) in tree_decode(treefile):
1008 for blob in self.join(sha.encode('hex')):
1010 elif type == 'commit':
# A commit's first line is 'tree <hex>'; descend into that tree.
1011 treeline = ''.join(it).split('\n')[0]
1012 assert(treeline.startswith('tree '))
1013 for blob in self.join(treeline[5:]):
1016 raise GitError('invalid object type %r: expected blob/tree/commit'
# join() (def line elided):
1020 """Generate a list of the content of all blobs that can be reached
1021 from an object. The hash given in 'id' must point to a blob, a tree
1022 or a commit. The content of all blobs that can be seen from trees or
1023 commits will be added to the list.
1026 for d in self._join(self.get(id)):
1028 except StopIteration:
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for (n,c) in list_refs():
        if n.startswith('refs/tags/'):
            name = n[10:]  # strip the 'refs/tags/' prefix
            if not c in tags:
                tags[c] = []
            tags[c].append(name)  # more than one tag can point at 'c'
    return tags