lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path, midx, bloom
   8
# Thresholds checked by PackWriter._write(): once either is reached the
# current pack is finished and a new one is started.
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
SEEK_END=2  # os.SEEK_END is not defined in python 2.4

# When nonzero, PackWriter._write() logs a '>' for every object written.
verbose = 0
# When nonzero, PackIdxList.refresh() always behaves as if skip_midx was True.
ignore_midx = 0
# Default repository location; check_repo_or_die() only auto-initializes a
# missing repository when 'repodir' equals this path.
home_repodir = os.path.expanduser('~/.bup')
# Repository currently in use; set by guess_repo() and adjusted by repo().
repodir = None

# Numeric object type codes used in pack object headers, and their inverse.
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Counters updated by the index lookup code (PackIdx._idx_from_hash() and
# PackIdxList.exists()).
_total_searches = 0
_total_steps = 0
  23
  24
  25 class GitError(Exception):
  26     pass
  27
  28
  29 def repo(sub = ''):
  30     """Get the path to the git repository or one of its subdirectories."""
  31     global repodir
  32     if not repodir:
  33         raise GitError('You should call check_repo_or_die()')
  34
  35     # If there's a .git subdirectory, then the actual repo is in there.
  36     gd = os.path.join(repodir, '.git')
  37     if os.path.exists(gd):
  38         repodir = gd
  39
  40     return os.path.join(repodir, sub)
  41
  42
  43 def shorten_hash(s):
  44     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
  45                   r'\1\2*\3', s)
  46
  47
  48 def repo_rel(path):
  49     full = os.path.abspath(path)
  50     fullrepo = os.path.abspath(repo(''))
  51     if not fullrepo.endswith('/'):
  52         fullrepo += '/'
  53     if full.startswith(fullrepo):
  54         path = full[len(fullrepo):]
  55     if path.startswith('index-cache/'):
  56         path = path[len('index-cache/'):]
  57     return shorten_hash(path)
  58
  59
  60 def all_packdirs():
  61     paths = [repo('objects/pack')]
  62     paths += glob.glob(repo('index-cache/*/.'))
  63     return paths
  64
  65
  66 def auto_midx(objdir):
  67     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
  68     try:
  69         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  70     except OSError, e:
  71         # make sure 'args' gets printed to help with debugging
  72         add_error('%r: exception: %s' % (args, e))
  73         raise
  74     if rv:
  75         add_error('%r: returned %d' % (args, rv))
  76
  77     args = [path.exe(), 'bloom', '--dir', objdir]
  78     try:
  79         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  80     except OSError, e:
  81         # make sure 'args' gets printed to help with debugging
  82         add_error('%r: exception: %s' % (args, e))
  83         raise
  84     if rv:
  85         add_error('%r: returned %d' % (args, rv))
  86
  87
  88 def mangle_name(name, mode, gitmode):
  89     """Mangle a file name to present an abstract name for segmented files.
  90     Mangled file names will have the ".bup" extension added to them. If a
  91     file's name already ends with ".bup", a ".bupl" extension is added to
  92     disambiguate normal files from semgmented ones.
  93     """
  94     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
  95         return name + '.bup'
  96     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
  97         return name + '.bupl'
  98     else:
  99         return name
 100
 101
 102 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 103 def demangle_name(name):
 104     """Remove name mangling from a file name, if necessary.
 105
 106     The return value is a tuple (demangled_filename,mode), where mode is one of
 107     the following:
 108
 109     * BUP_NORMAL  : files that should be read as-is from the repository
 110     * BUP_CHUNKED : files that were chunked and need to be assembled
 111
 112     For more information on the name mangling algorythm, see mangle_name()
 113     """
 114     if name.endswith('.bupl'):
 115         return (name[:-5], BUP_NORMAL)
 116     elif name.endswith('.bup'):
 117         return (name[:-4], BUP_CHUNKED)
 118     else:
 119         return (name, BUP_NORMAL)
 120
 121
 122 def calc_hash(type, content):
 123     """Calculate some content's hash in the Git fashion."""
 124     header = '%s %d\0' % (type, len(content))
 125     sum = Sha1(header)
 126     sum.update(content)
 127     return sum.digest()
 128
 129
 130 def _shalist_sort_key(ent):
 131     (mode, name, id) = ent
 132     assert(mode+0 == mode)
 133     if stat.S_ISDIR(mode):
 134         return name + '/'
 135     else:
 136         return name
 137
 138
 139 def tree_encode(shalist):
 140     """Generate a git tree object from (mode,name,hash) tuples."""
 141     shalist = sorted(shalist, key = _shalist_sort_key)
 142     l = []
 143     for (mode,name,bin) in shalist:
 144         assert(mode)
 145         assert(mode+0 == mode)
 146         assert(name)
 147         assert(len(bin) == 20)
 148         s = '%o %s\0%s' % (mode,name,bin)
 149         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 150         l.append(s)
 151     return ''.join(l)
 152
 153
 154 def tree_decode(buf):
 155     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 156     ofs = 0
 157     while ofs < len(buf):
 158         z = buf[ofs:].find('\0')
 159         assert(z > 0)
 160         spl = buf[ofs:ofs+z].split(' ', 1)
 161         assert(len(spl) == 2)
 162         mode,name = spl
 163         sha = buf[ofs+z+1:ofs+z+1+20]
 164         ofs += z+1+20
 165         yield (int(mode, 8), name, sha)
 166
 167
 168 def _encode_packobj(type, content, compression_level=1):
 169     szout = ''
 170     sz = len(content)
 171     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 172     sz >>= 4
 173     while 1:
 174         if sz: szbits |= 0x80
 175         szout += chr(szbits)
 176         if not sz:
 177             break
 178         szbits = sz & 0x7f
 179         sz >>= 7
 180     z = zlib.compressobj(compression_level)
 181     yield szout
 182     yield z.compress(content)
 183     yield z.flush()
 184
 185
 186 def _encode_looseobj(type, content, compression_level=1):
 187     z = zlib.compressobj(compression_level)
 188     yield z.compress('%s %d\0' % (type, len(content)))
 189     yield z.compress(content)
 190     yield z.flush()
 191
 192
 193 def _decode_looseobj(buf):
 194     assert(buf);
 195     s = zlib.decompress(buf)
 196     i = s.find('\0')
 197     assert(i > 0)
 198     l = s[:i].split(' ')
 199     type = l[0]
 200     sz = int(l[1])
 201     content = s[i+1:]
 202     assert(type in _typemap)
 203     assert(sz == len(content))
 204     return (type, content)
 205
 206
 207 def _decode_packobj(buf):
 208     assert(buf)
 209     c = ord(buf[0])
 210     type = _typermap[(c & 0x70) >> 4]
 211     sz = c & 0x0f
 212     shift = 4
 213     i = 0
 214     while c & 0x80:
 215         i += 1
 216         c = ord(buf[i])
 217         sz |= (c & 0x7f) << shift
 218         shift += 7
 219         if not (c & 0x80):
 220             break
 221     return (type, zlib.decompress(buf[i+1:]))
 222
 223
 224 class PackIdx:
 225     def __init__(self):
 226         assert(0)
 227
 228     def find_offset(self, hash):
 229         """Get the offset of an object inside the index file."""
 230         idx = self._idx_from_hash(hash)
 231         if idx != None:
 232             return self._ofs_from_idx(idx)
 233         return None
 234
 235     def exists(self, hash, want_source=False):
 236         """Return nonempty if the object exists in this index."""
 237         if hash and (self._idx_from_hash(hash) != None):
 238             return want_source and os.path.basename(self.name) or True
 239         return None
 240
 241     def __len__(self):
 242         return int(self.fanout[255])
 243
 244     def _idx_from_hash(self, hash):
 245         global _total_searches, _total_steps
 246         _total_searches += 1
 247         assert(len(hash) == 20)
 248         b1 = ord(hash[0])
 249         start = self.fanout[b1-1] # range -1..254
 250         end = self.fanout[b1] # range 0..255
 251         want = str(hash)
 252         _total_steps += 1  # lookup table is a step
 253         while start < end:
 254             _total_steps += 1
 255             mid = start + (end-start)/2
 256             v = self._idx_to_hash(mid)
 257             if v < want:
 258                 start = mid+1
 259             elif v > want:
 260                 end = mid
 261             else: # got it!
 262                 return mid
 263         return None
 264
 265
 266 class PackIdxV1(PackIdx):
 267     """Object representation of a Git pack index (version 1) file."""
 268     def __init__(self, filename, f):
 269         self.name = filename
 270         self.idxnames = [self.name]
 271         self.map = mmap_read(f)
 272         self.fanout = list(struct.unpack('!256I',
 273                                          str(buffer(self.map, 0, 256*4))))
 274         self.fanout.append(0)  # entry "-1"
 275         nsha = self.fanout[255]
 276         self.sha_ofs = 256*4
 277         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 278
 279     def _ofs_from_idx(self, idx):
 280         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 281
 282     def _idx_to_hash(self, idx):
 283         return str(self.shatable[idx*24+4 : idx*24+24])
 284
 285     def __iter__(self):
 286         for i in xrange(self.fanout[255]):
 287             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 288
 289
 290 class PackIdxV2(PackIdx):
 291     """Object representation of a Git pack index (version 2) file."""
 292     def __init__(self, filename, f):
 293         self.name = filename
 294         self.idxnames = [self.name]
 295         self.map = mmap_read(f)
 296         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 297         self.fanout = list(struct.unpack('!256I',
 298                                          str(buffer(self.map, 8, 256*4))))
 299         self.fanout.append(0)  # entry "-1"
 300         nsha = self.fanout[255]
 301         self.sha_ofs = 8 + 256*4
 302         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 303         self.ofstable = buffer(self.map,
 304                                self.sha_ofs + nsha*20 + nsha*4,
 305                                nsha*4)
 306         self.ofs64table = buffer(self.map,
 307                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 308
 309     def _ofs_from_idx(self, idx):
 310         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 311         if ofs & 0x80000000:
 312             idx64 = ofs & 0x7fffffff
 313             ofs = struct.unpack('!Q',
 314                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 315         return ofs
 316
 317     def _idx_to_hash(self, idx):
 318         return str(self.shatable[idx*20:(idx+1)*20])
 319
 320     def __iter__(self):
 321         for i in xrange(self.fanout[255]):
 322             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 323
 324
# Number of live PackIdxList instances; the constructor asserts it is zero.
_mpi_count = 0
class PackIdxList:
    """The set of index objects (.idx and .midx) found in one directory.

    Lookups go through exists(), which also consults the hashes registered
    with add() and, when one is usable, the directory's bloom filter.
    """
    def __init__(self, dir):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()      # extra hashes registered through add()
        self.packs = []        # index objects; rebuilt by refresh()
        self.do_bloom = False  # whether exists() consults the bloom first
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        # total object count over all the indexes in use
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # bloom hit: skip the bloom until a lookup misses again
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # not found in any index: go back to asking the bloom first
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps file names to the index object covering them, starting from
        # the objects that are already open.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # every .idx named by an already-open midx is covered by it
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            # a midx that names a missing .idx is deleted
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # consider the biggest, then the most recent, midx files first
                midxl.sort(key=lambda ix:
                           (-len(ix), -os.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        unlink(ix.name)
            # open every .idx that is not covered by something in d yet
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError, e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # several names may map to the same object; keep each one once,
            # largest first
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # the bloom is only used when it is valid and holds at least as
            # many entries as the indexes do
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 452
 453
 454 def open_idx(filename):
 455     if filename.endswith('.idx'):
 456         f = open(filename, 'rb')
 457         header = f.read(8)
 458         if header[0:4] == '\377tOc':
 459             version = struct.unpack('!I', header[4:8])[0]
 460             if version == 2:
 461                 return PackIdxV2(filename, f)
 462             else:
 463                 raise GitError('%s: expected idx file version 2, got %d'
 464                                % (filename, version))
 465         elif len(header) == 8 and header[0:4] < '\377tOc':
 466             return PackIdxV1(filename, f)
 467         else:
 468             raise GitError('%s: unrecognized idx file header' % filename)
 469     elif filename.endswith('.midx'):
 470         return midx.PackMidx(filename)
 471     else:
 472         raise GitError('idx filenames must end with .idx or .midx')
 473
 474
 475 def idxmerge(idxlist, final_progress=True):
 476     """Generate a list of all the objects reachable in a PackIdxList."""
 477     def pfunc(count, total):
 478         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 479                   % (count*100.0/total, count, total))
 480     def pfinal(count, total):
 481         if final_progress:
 482             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 483                      % (100, total, total))
 484     return merge_iter(idxlist, 10024, pfunc, pfinal)
 485
 486
 487 def _make_objcache():
 488     return PackIdxList(repo('objects/pack'))
 489
class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1):
        """Set up an idle writer; no file is created until the first write.

        'objcache_maker' is called on demand to build the object used to
        check whether an object already exists; 'compression_level' is
        passed on to _encode_packobj().
        """
        self.count = 0        # objects written to the current pack
        self.outbytes = 0     # bytes written to the current pack
        self.filename = None  # temporary pack path, minus the '.pack' suffix
        self.file = None
        self.idx = None       # 256 lists of (sha, crc, offset), by first byte
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level

    def __del__(self):
        self.close()

    def _open(self):
        """Create the temporary pack file if none is open yet."""
        if not self.file:
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            # header with an object count of zero; _end() fills in the count
            self.file.write('PACK\0\0\0\2\0\0\0\0')
            self.idx = list(list() for i in xrange(256))

    def _raw_write(self, datalist, sha):
        """Append the joined 'datalist' to the pack and index it under 'sha'.

        Returns (bytes written, crc32 of the data).  An IOError from the
        write is re-raised as GitError with the original traceback.
        """
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError, e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        """Record (sha, crc, offset) for the object that was just written."""
        assert(sha)
        if self.idx:
            # the object ends at the current position, so it starts 'size'
            # bytes before it
            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        """Write an object unconditionally and return its hash.

        The hash is computed when 'sha' is empty.  The current pack is
        finished once it reaches max_pack_size or max_pack_objects.
        """
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Finish the current pack and clear the byte and object counts.

        Returns whatever _end() returns: the path prefix of the finished
        pack, or None if no pack file was open.
        """
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        """Make sure self.objcache is set, building it when possible.

        Raises GitError if there is still no object cache afterwards.
        """
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            # _write() may have finished the pack, which drops the object
            # cache; make sure there is one before registering the hash
            self._require_objcache()
            self.objcache.add(sha)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write('tree', content)

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        """Build a commit object from its parts, write it, return its id.

        'tree' and 'parent' are binary hashes; header lines are only
        emitted for the parts that are non-empty.
        """
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')  # blank line between the headers and the message
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        # the current user is used as both author and committer
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
                                  msg)
        return commit

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            self.idx = None
            self.file = None
            f.close()
            os.unlink(self.filename + '.pack')

    def _end(self, run_midx=True):
        """Finish the current pack file and move it into objects/pack.

        Writes the object count and the trailing checksum, writes the
        matching .idx file, renames both files and optionally runs
        auto_midx().  Returns the path prefix of the new pack (without
        extension), or None if no pack file was open.
        """
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None
        idx = self.idx
        self.idx = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        for b in chunkyreader(f):
            sum.update(b)
        packbin = sum.digest()
        f.write(packbin)
        f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)

        # the final name is derived from the hash of the object list
        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        if run_midx:
            auto_midx(repo('objects/pack'))
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a version 2 index for the pack to 'filename'.

        'idx' is the per-first-byte list of (sha, crc, offset) entries and
        'packbin' the pack checksum, which is stored in the index.  Returns
        the hex SHA-1 of the hash table; _end() uses it to name the pack.
        """
        idx_f = open(filename, 'w+b')
        idx_f.write('\377tOc\0\0\0\2')

        # header (8) + fanout (4*256) + 28 bytes per object: presumably the
        # 20-byte sha, 4-byte crc and 4-byte offset -- see PackIdxV2
        ofs64_ofs = 8 + 4*256 + 28*self.count
        idx_f.truncate(ofs64_ofs)
        idx_f.seek(0)
        idx_map = mmap_readwrite(idx_f, close=False)
        idx_f.seek(0, SEEK_END)
        count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
        assert(count == self.count)
        idx_map.close()
        idx_f.write(packbin)

        # idx_sum covers the whole file; obj_list_sum only the hash table
        idx_f.seek(0)
        idx_sum = Sha1()
        b = idx_f.read(8 + 4*256)
        idx_sum.update(b)

        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            idx_sum.update(b)
            obj_list_sum.update(b)
        namebase = obj_list_sum.hexdigest()

        for b in chunkyreader(idx_f):
            idx_sum.update(b)
        idx_f.write(idx_sum.digest())
        idx_f.close()

        return namebase
 684
 685
 686 def _git_date(date):
 687     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 688
 689
 690 def _gitenv():
 691     os.environ['GIT_DIR'] = os.path.abspath(repo())
 692
 693
 694 def list_refs(refname = None):
 695     """Generate a list of tuples in the form (refname,hash).
 696     If a ref name is specified, list only this particular ref.
 697     """
 698     argv = ['git', 'show-ref', '--']
 699     if refname:
 700         argv += [refname]
 701     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 702     out = p.stdout.read().strip()
 703     rv = p.wait()  # not fatal
 704     if rv:
 705         assert(not out)
 706     if out:
 707         for d in out.split('\n'):
 708             (sha, name) = d.split(' ', 1)
 709             yield (name, sha.decode('hex'))
 710
 711
 712 def read_ref(refname):
 713     """Get the commit id of the most recent commit made on a given ref."""
 714     l = list(list_refs(refname))
 715     if l:
 716         assert(len(l) == 1)
 717         return l[0][1]
 718     else:
 719         return None
 720
 721
 722 def rev_list(ref, count=None):
 723     """Generate a list of reachable commits in reverse chronological order.
 724
 725     This generator walks through commits, from child to parent, that are
 726     reachable via the specified ref and yields a series of tuples of the form
 727     (date,hash).
 728
 729     If count is a non-zero integer, limit the number of commits to "count"
 730     objects.
 731     """
 732     assert(not ref.startswith('-'))
 733     opts = []
 734     if count:
 735         opts += ['-n', str(atoi(count))]
 736     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 737     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 738     commit = None
 739     for row in p.stdout:
 740         s = row.strip()
 741         if s.startswith('commit '):
 742             commit = s[7:].decode('hex')
 743         else:
 744             date = int(s)
 745             yield (date, commit)
 746     rv = p.wait()  # not fatal
 747     if rv:
 748         raise GitError, 'git rev-list returned error %d' % rv
 749
 750
 751 def rev_get_date(ref):
 752     """Get the date of the latest commit on the specified ref."""
 753     for (date, commit) in rev_list(ref, count=1):
 754         return date
 755     raise GitError, 'no such commit %r' % ref
 756
 757
 758 def rev_parse(committish):
 759     """Resolve the full hash for 'committish', if it exists.
 760
 761     Should be roughly equivalent to 'git rev-parse'.
 762
 763     Returns the hex value of the hash if it is found, None if 'committish' does
 764     not correspond to anything.
 765     """
 766     head = read_ref(committish)
 767     if head:
 768         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 769         return head
 770
 771     pL = PackIdxList(repo('objects/pack'))
 772
 773     if len(committish) == 40:
 774         try:
 775             hash = committish.decode('hex')
 776         except TypeError:
 777             return None
 778
 779         if pL.exists(hash):
 780             return hash
 781
 782     return None
 783
 784
 785 def update_ref(refname, newval, oldval):
 786     """Change the commit pointed to by a branch."""
 787     if not oldval:
 788         oldval = ''
 789     assert(refname.startswith('refs/heads/'))
 790     p = subprocess.Popen(['git', 'update-ref', refname,
 791                           newval.encode('hex'), oldval.encode('hex')],
 792                          preexec_fn = _gitenv)
 793     _git_wait('git update-ref', p)
 794
 795
 796 def guess_repo(path=None):
 797     """Set the path value in the global variable "repodir".
 798     This makes bup look for an existing bup repository, but not fail if a
 799     repository doesn't exist. Usually, if you are interacting with a bup
 800     repository, you would not be calling this function but using
 801     check_repo_or_die().
 802     """
 803     global repodir
 804     if path:
 805         repodir = path
 806     if not repodir:
 807         repodir = os.environ.get('BUP_DIR')
 808         if not repodir:
 809             repodir = os.path.expanduser('~/.bup')
 810
 811
 812 def init_repo(path=None):
 813     """Create the Git bare repository for bup in a given path."""
 814     guess_repo(path)
 815     d = repo()  # appends a / to the path
 816     parent = os.path.dirname(os.path.dirname(d))
 817     if parent and not os.path.exists(parent):
 818         raise GitError('parent directory "%s" does not exist\n' % parent)
 819     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 820         raise GitError('"%d" exists but is not a directory\n' % d)
 821     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 822                          preexec_fn = _gitenv)
 823     _git_wait('git init', p)
 824     # Force the index version configuration in order to ensure bup works
 825     # regardless of the version of the installed Git binary.
 826     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 827                          stdout=sys.stderr, preexec_fn = _gitenv)
 828     _git_wait('git config', p)
 829
 830
 831 def check_repo_or_die(path=None):
 832     """Make sure a bup repository exists, and abort if not.
 833     If the path to a particular repository was not specified, this function
 834     initializes the default repository automatically.
 835     """
 836     guess_repo(path)
 837     try:
 838         os.stat(repo('objects/pack/.'))
 839     except OSError, e:
 840         if e.errno == errno.ENOENT:
 841             if repodir != home_repodir:
 842                 log('error: %r is not a bup/git repository\n' % repo())
 843                 sys.exit(15)
 844             else:
 845                 init_repo()
 846         else:
 847             log('error: %s\n' % e)
 848             sys.exit(14)
 849
 850
 851 _ver = None
 852 def ver():
 853     """Get Git's version and ensure a usable version is installed.
 854
 855     The returned version is formatted as an ordered tuple with each position
 856     representing a digit in the version tag. For example, the following tuple
 857     would represent version 1.6.6.9:
 858
 859         ('1', '6', '6', '9')
 860     """
 861     global _ver
 862     if not _ver:
 863         p = subprocess.Popen(['git', '--version'],
 864                              stdout=subprocess.PIPE)
 865         gvs = p.stdout.read()
 866         _git_wait('git --version', p)
 867         m = re.match(r'git version (\S+.\S+)', gvs)
 868         if not m:
 869             raise GitError('git --version weird output: %r' % gvs)
 870         _ver = tuple(m.group(1).split('.'))
 871     needed = ('1','5', '3', '1')
 872     if _ver < needed:
 873         raise GitError('git version %s or higher is required; you have %s'
 874                        % ('.'.join(needed), '.'.join(_ver)))
 875     return _ver
 876
 877
 878 def _git_wait(cmd, p):
 879     rv = p.wait()
 880     if rv != 0:
 881         raise GitError('%s returned %d' % (cmd, rv))
 882
 883
 884 def _git_capture(argv):
 885     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 886     r = p.stdout.read()
 887     _git_wait(repr(argv), p)
 888     return r
 889
 890
 891 class _AbortableIter:
 892     def __init__(self, it, onabort = None):
 893         self.it = it
 894         self.onabort = onabort
 895         self.done = None
 896
 897     def __iter__(self):
 898         return self
 899
 900     def next(self):
 901         try:
 902             return self.it.next()
 903         except StopIteration, e:
 904             self.done = True
 905             raise
 906         except:
 907             self.abort()
 908             raise
 909
 910     def abort(self):
 911         """Abort iteration and call the abortion callback, if needed."""
 912         if not self.done:
 913             self.done = True
 914             if self.onabort:
 915                 self.onabort()
 916
 917     def __del__(self):
 918         self.abort()
 919
 920
 921 _ver_warned = 0
 922 class CatPipe:
 923     """Link to 'git cat-file' that is used to retrieve blob data."""
 924     def __init__(self):
 925         global _ver_warned
 926         wanted = ('1','5','6')
 927         if ver() < wanted:
 928             if not _ver_warned:
 929                 log('warning: git version < %s; bup will be slow.\n'
 930                     % '.'.join(wanted))
 931                 _ver_warned = 1
 932             self.get = self._slow_get
 933         else:
 934             self.p = self.inprogress = None
 935             self.get = self._fast_get
 936
 937     def _abort(self):
 938         if self.p:
 939             self.p.stdout.close()
 940             self.p.stdin.close()
 941         self.p = None
 942         self.inprogress = None
 943
 944     def _restart(self):
 945         self._abort()
 946         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
 947                                   stdin=subprocess.PIPE,
 948                                   stdout=subprocess.PIPE,
 949                                   close_fds = True,
 950                                   bufsize = 4096,
 951                                   preexec_fn = _gitenv)
 952
 953     def _fast_get(self, id):
 954         if not self.p or self.p.poll() != None:
 955             self._restart()
 956         assert(self.p)
 957         assert(self.p.poll() == None)
 958         if self.inprogress:
 959             log('_fast_get: opening %r while %r is open'
 960                 % (id, self.inprogress))
 961         assert(not self.inprogress)
 962         assert(id.find('\n') < 0)
 963         assert(id.find('\r') < 0)
 964         assert(not id.startswith('-'))
 965         self.inprogress = id
 966         self.p.stdin.write('%s\n' % id)
 967         self.p.stdin.flush()
 968         hdr = self.p.stdout.readline()
 969         if hdr.endswith(' missing\n'):
 970             self.inprogress = None
 971             raise KeyError('blob %r is missing' % id)
 972         spl = hdr.split(' ')
 973         if len(spl) != 3 or len(spl[0]) != 40:
 974             raise GitError('expected blob, got %r' % spl)
 975         (hex, type, size) = spl
 976
 977         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
 978                            onabort = self._abort)
 979         try:
 980             yield type
 981             for blob in it:
 982                 yield blob
 983             assert(self.p.stdout.readline() == '\n')
 984             self.inprogress = None
 985         except Exception, e:
 986             it.abort()
 987             raise
 988
 989     def _slow_get(self, id):
 990         assert(id.find('\n') < 0)
 991         assert(id.find('\r') < 0)
 992         assert(id[0] != '-')
 993         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
 994         yield type
 995
 996         p = subprocess.Popen(['git', 'cat-file', type, id],
 997                              stdout=subprocess.PIPE,
 998                              preexec_fn = _gitenv)
 999         for blob in chunkyreader(p.stdout):
1000             yield blob
1001         _git_wait('git cat-file', p)
1002
1003     def _join(self, it):
1004         type = it.next()
1005         if type == 'blob':
1006             for blob in it:
1007                 yield blob
1008         elif type == 'tree':
1009             treefile = ''.join(it)
1010             for (mode, name, sha) in tree_decode(treefile):
1011                 for blob in self.join(sha.encode('hex')):
1012                     yield blob
1013         elif type == 'commit':
1014             treeline = ''.join(it).split('\n')[0]
1015             assert(treeline.startswith('tree '))
1016             for blob in self.join(treeline[5:]):
1017                 yield blob
1018         else:
1019             raise GitError('invalid object type %r: expected blob/tree/commit'
1020                            % type)
1021
1022     def join(self, id):
1023         """Generate a list of the content of all blobs that can be reached
1024         from an object.  The hash given in 'id' must point to a blob, a tree
1025         or a commit. The content of all blobs that can be seen from trees or
1026         commits will be added to the list.
1027         """
1028         try:
1029             for d in self._join(self.get(id)):
1030                 yield d
1031         except StopIteration:
1032             log('booger!\n')
1033
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Only refs under 'refs/tags/' are considered; the prefix is stripped from
    the names.  Several tags may point at the same hash, so each value is a
    list.
    """
    prefix = 'refs/tags/'
    by_hash = {}
    for (refname, sha) in list_refs():
        if not refname.startswith(prefix):
            continue
        by_hash.setdefault(sha, []).append(refname[len(prefix):])
    return by_hash