lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path, midx, bloom
   8
   9 SEEK_END=2  # os.SEEK_END is not defined in python 2.4
  10
  11 verbose = 0
  12 ignore_midx = 0
  13 home_repodir = os.path.expanduser('~/.bup')
  14 repodir = None
  15
  16 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  17 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  18
  19 _total_searches = 0
  20 _total_steps = 0
  21
  22
  23 class GitError(Exception):
  24     pass
  25
  26
  27 def repo(sub = ''):
  28     """Get the path to the git repository or one of its subdirectories."""
  29     global repodir
  30     if not repodir:
  31         raise GitError('You should call check_repo_or_die()')
  32
  33     # If there's a .git subdirectory, then the actual repo is in there.
  34     gd = os.path.join(repodir, '.git')
  35     if os.path.exists(gd):
  36         repodir = gd
  37
  38     return os.path.join(repodir, sub)
  39
  40
  41 def shorten_hash(s):
  42     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
  43                   r'\1\2*\3', s)
  44
  45
  46 def repo_rel(path):
  47     full = os.path.abspath(path)
  48     fullrepo = os.path.abspath(repo(''))
  49     if not fullrepo.endswith('/'):
  50         fullrepo += '/'
  51     if full.startswith(fullrepo):
  52         path = full[len(fullrepo):]
  53     if path.startswith('index-cache/'):
  54         path = path[len('index-cache/'):]
  55     return shorten_hash(path)
  56
  57
  58 def all_packdirs():
  59     paths = [repo('objects/pack')]
  60     paths += glob.glob(repo('index-cache/*/.'))
  61     return paths
  62
  63
  64 def auto_midx(objdir):
  65     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
  66     try:
  67         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  68     except OSError, e:
  69         # make sure 'args' gets printed to help with debugging
  70         add_error('%r: exception: %s' % (args, e))
  71         raise
  72     if rv:
  73         add_error('%r: returned %d' % (args, rv))
  74
  75     args = [path.exe(), 'bloom', '--dir', objdir]
  76     try:
  77         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  78     except OSError, e:
  79         # make sure 'args' gets printed to help with debugging
  80         add_error('%r: exception: %s' % (args, e))
  81         raise
  82     if rv:
  83         add_error('%r: returned %d' % (args, rv))
  84
  85
  86 def mangle_name(name, mode, gitmode):
  87     """Mangle a file name to present an abstract name for segmented files.
  88     Mangled file names will have the ".bup" extension added to them. If a
  89     file's name already ends with ".bup", a ".bupl" extension is added to
  90     disambiguate normal files from semgmented ones.
  91     """
  92     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
  93         return name + '.bup'
  94     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
  95         return name + '.bupl'
  96     else:
  97         return name
  98
  99
 100 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 101 def demangle_name(name):
 102     """Remove name mangling from a file name, if necessary.
 103
 104     The return value is a tuple (demangled_filename,mode), where mode is one of
 105     the following:
 106
 107     * BUP_NORMAL  : files that should be read as-is from the repository
 108     * BUP_CHUNKED : files that were chunked and need to be assembled
 109
 110     For more information on the name mangling algorythm, see mangle_name()
 111     """
 112     if name.endswith('.bupl'):
 113         return (name[:-5], BUP_NORMAL)
 114     elif name.endswith('.bup'):
 115         return (name[:-4], BUP_CHUNKED)
 116     else:
 117         return (name, BUP_NORMAL)
 118
 119
 120 def _encode_packobj(type, content):
 121     szout = ''
 122     sz = len(content)
 123     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 124     sz >>= 4
 125     while 1:
 126         if sz: szbits |= 0x80
 127         szout += chr(szbits)
 128         if not sz:
 129             break
 130         szbits = sz & 0x7f
 131         sz >>= 7
 132     z = zlib.compressobj(1)
 133     yield szout
 134     yield z.compress(content)
 135     yield z.flush()
 136
 137
 138 def _encode_looseobj(type, content):
 139     z = zlib.compressobj(1)
 140     yield z.compress('%s %d\0' % (type, len(content)))
 141     yield z.compress(content)
 142     yield z.flush()
 143
 144
 145 def _decode_looseobj(buf):
 146     assert(buf);
 147     s = zlib.decompress(buf)
 148     i = s.find('\0')
 149     assert(i > 0)
 150     l = s[:i].split(' ')
 151     type = l[0]
 152     sz = int(l[1])
 153     content = s[i+1:]
 154     assert(type in _typemap)
 155     assert(sz == len(content))
 156     return (type, content)
 157
 158
 159 def _decode_packobj(buf):
 160     assert(buf)
 161     c = ord(buf[0])
 162     type = _typermap[(c & 0x70) >> 4]
 163     sz = c & 0x0f
 164     shift = 4
 165     i = 0
 166     while c & 0x80:
 167         i += 1
 168         c = ord(buf[i])
 169         sz |= (c & 0x7f) << shift
 170         shift += 7
 171         if not (c & 0x80):
 172             break
 173     return (type, zlib.decompress(buf[i+1:]))
 174
 175
 176 class PackIdx:
 177     def __init__(self):
 178         assert(0)
 179
 180     def find_offset(self, hash):
 181         """Get the offset of an object inside the index file."""
 182         idx = self._idx_from_hash(hash)
 183         if idx != None:
 184             return self._ofs_from_idx(idx)
 185         return None
 186
 187     def exists(self, hash, want_source=False):
 188         """Return nonempty if the object exists in this index."""
 189         if hash and (self._idx_from_hash(hash) != None):
 190             return want_source and os.path.basename(self.name) or True
 191         return None
 192
 193     def __len__(self):
 194         return int(self.fanout[255])
 195
 196     def _idx_from_hash(self, hash):
 197         global _total_searches, _total_steps
 198         _total_searches += 1
 199         assert(len(hash) == 20)
 200         b1 = ord(hash[0])
 201         start = self.fanout[b1-1] # range -1..254
 202         end = self.fanout[b1] # range 0..255
 203         want = str(hash)
 204         _total_steps += 1  # lookup table is a step
 205         while start < end:
 206             _total_steps += 1
 207             mid = start + (end-start)/2
 208             v = self._idx_to_hash(mid)
 209             if v < want:
 210                 start = mid+1
 211             elif v > want:
 212                 end = mid
 213             else: # got it!
 214                 return mid
 215         return None
 216
 217
 218 class PackIdxV1(PackIdx):
 219     """Object representation of a Git pack index (version 1) file."""
 220     def __init__(self, filename, f):
 221         self.name = filename
 222         self.idxnames = [self.name]
 223         self.map = mmap_read(f)
 224         self.fanout = list(struct.unpack('!256I',
 225                                          str(buffer(self.map, 0, 256*4))))
 226         self.fanout.append(0)  # entry "-1"
 227         nsha = self.fanout[255]
 228         self.sha_ofs = 256*4
 229         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 230
 231     def _ofs_from_idx(self, idx):
 232         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 233
 234     def _idx_to_hash(self, idx):
 235         return str(self.shatable[idx*24+4 : idx*24+24])
 236
 237     def __iter__(self):
 238         for i in xrange(self.fanout[255]):
 239             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 240
 241
 242 class PackIdxV2(PackIdx):
 243     """Object representation of a Git pack index (version 2) file."""
 244     def __init__(self, filename, f):
 245         self.name = filename
 246         self.idxnames = [self.name]
 247         self.map = mmap_read(f)
 248         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 249         self.fanout = list(struct.unpack('!256I',
 250                                          str(buffer(self.map, 8, 256*4))))
 251         self.fanout.append(0)  # entry "-1"
 252         nsha = self.fanout[255]
 253         self.sha_ofs = 8 + 256*4
 254         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 255         self.ofstable = buffer(self.map,
 256                                self.sha_ofs + nsha*20 + nsha*4,
 257                                nsha*4)
 258         self.ofs64table = buffer(self.map,
 259                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 260
 261     def _ofs_from_idx(self, idx):
 262         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 263         if ofs & 0x80000000:
 264             idx64 = ofs & 0x7fffffff
 265             ofs = struct.unpack('!Q',
 266                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 267         return ofs
 268
 269     def _idx_to_hash(self, idx):
 270         return str(self.shatable[idx*20:(idx+1)*20])
 271
 272     def __iter__(self):
 273         for i in xrange(self.fanout[255]):
 274             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 275
 276
# Number of live PackIdxList instances; the class asserts there is at most one.
_mpi_count = 0
class PackIdxList:
    """Searchable collection of all the index files found in one directory.

    Holds the opened .idx/.midx objects in 'self.packs', an optional bloom
    filter in 'self.bloom', and a set of extra hashes ('self.also') added
    with add().
    """
    def __init__(self, dir):
        """Open the indexes in 'dir'; only one instance may exist at a time."""
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the merged contents of all packs (see idxmerge())."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all packs."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Returns True for hashes registered with add(), otherwise whatever the
        matching pack's exists() returns, or None when nothing matches.
        """
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # bloom says "maybe": search the packs, and stop consulting
                # the bloom until a full search misses again (see below)
                self.do_bloom = False
            else:
                # a negative bloom answer is trusted; skip the pack search
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # complete miss: re-enable the bloom check for the next lookup
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a full path to the index object that covers it; start from
        # the packs already open (minus the midxes when they are skipped)
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # every idx named by an already-open midx is covered by it
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # open the midx files not known yet; a midx that refers to a
                # missing idx file is deleted instead of being used
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # consider the largest midx first
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        # everything in it is already covered by another midx
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        unlink(ix.name)
            # open the .idx files that nothing in d covers yet; an index that
            # cannot be opened is reported and skipped
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError, e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # several paths can map to the same index object; keep each once,
            # largest first
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # only use the bloom if it is valid and at least as long as the
            # combined packs
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 403
 404
 405 def calc_hash(type, content):
 406     """Calculate some content's hash in the Git fashion."""
 407     header = '%s %d\0' % (type, len(content))
 408     sum = Sha1(header)
 409     sum.update(content)
 410     return sum.digest()
 411
 412
 413 def _shalist_sort_key(ent):
 414     (mode, name, id) = ent
 415     if stat.S_ISDIR(int(mode, 8)):
 416         return name + '/'
 417     else:
 418         return name
 419
 420
 421 def open_idx(filename):
 422     if filename.endswith('.idx'):
 423         f = open(filename, 'rb')
 424         header = f.read(8)
 425         if header[0:4] == '\377tOc':
 426             version = struct.unpack('!I', header[4:8])[0]
 427             if version == 2:
 428                 return PackIdxV2(filename, f)
 429             else:
 430                 raise GitError('%s: expected idx file version 2, got %d'
 431                                % (filename, version))
 432         elif len(header) == 8 and header[0:4] < '\377tOc':
 433             return PackIdxV1(filename, f)
 434         else:
 435             raise GitError('%s: unrecognized idx file header' % filename)
 436     elif filename.endswith('.midx'):
 437         return midx.PackMidx(filename)
 438     else:
 439         raise GitError('idx filenames must end with .idx or .midx')
 440
 441
 442 def idxmerge(idxlist, final_progress=True):
 443     """Generate a list of all the objects reachable in a PackIdxList."""
 444     def pfunc(count, total):
 445         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 446                   % (count*100.0/total, count, total))
 447     def pfinal(count, total):
 448         if final_progress:
 449             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 450                      % (100, total, total))
 451     return merge_iter(idxlist, 10024, pfunc, pfinal)
 452
 453
 454 def _make_objcache():
 455     return PackIdxList(repo('objects/pack'))
 456
class PackWriter:
    """Writes Git objects inside a pack file.

    The pack is built in a temporary file under the repo's 'objects'
    directory and moved to 'objects/pack' together with its .idx by _end().
    """
    def __init__(self, objcache_maker=_make_objcache):
        """Set up an idle writer; nothing is created on disk yet.

        'objcache_maker' is called on demand to build the object cache that
        exists() and maybe_write() consult.
        """
        self.count = 0        # objects written to the current pack
        self.outbytes = 0     # bytes written to the current pack
        self.filename = None  # temporary path without the '.pack' suffix
        self.file = None      # open temporary pack file, if any
        self.idx = None       # 256 lists of (sha, crc, offset), by first byte
        self.objcache_maker = objcache_maker
        self.objcache = None

    def __del__(self):
        self.close()

    def _open(self):
        """Create the temporary pack file, unless one is already open."""
        if not self.file:
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            # 'PACK', version 2, and a zero object count that _end() patches
            self.file.write('PACK\0\0\0\2\0\0\0\0')
            self.idx = list(list() for i in xrange(256))

    def _raw_write(self, datalist, sha):
        """Append the joined 'datalist' to the pack and index it under 'sha'.

        Returns (bytes_written, crc32).  An IOError from the write is
        re-raised as GitError with the original traceback.
        """
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError, e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        """Record (sha, crc, start offset) for an entry that was just written."""
        assert(sha)
        if self.idx:
            # the entry starts 'size' bytes before the current file position
            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        """Encode and write one object; return its sha.

        The sha is computed with calc_hash() when the caller passes none.
        """
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
        return sha

    def breakpoint(self):
        """Finish the current pack and clear the byte and object counts.

        Returns whatever _end() returns: the path prefix of the finished
        pack, or None when no pack was open.
        """
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        """Create the object cache if needed; raise GitError if unavailable."""
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        self._require_objcache()
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            # remember it, so that writing the same content again is skipped
            self.objcache.add(sha)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack.

        'shalist' holds (mode, name, 20-byte hash) entries; they are sorted
        with _shalist_sort_key before being written.
        """
        shalist = sorted(shalist, key = _shalist_sort_key)
        l = []
        for (mode,name,bin) in shalist:
            assert(mode)
            assert(mode != '0')
            assert(mode[0] != '0')
            assert(name)
            assert(len(bin) == 20)
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        """Write a commit object built from the given fields; return its id.

        Header lines whose value is empty are omitted.  'tree' and 'parent'
        are binary hashes and are written in hex.
        """
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')  # blank line between the headers and the message
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack.

        The current user is used as both author and committer, and 'date'
        as both dates.
        """
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
                                  msg)
        return commit

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            self.idx = None
            self.file = None
            f.close()
            os.unlink(self.filename + '.pack')

    def _end(self, run_midx=True):
        """Finish the open pack, write its index and move both into place.

        Returns the final path prefix (without extension) under
        'objects/pack', or None when no pack was open.  auto_midx() is run
        afterwards unless 'run_midx' is false.
        """
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None
        idx = self.idx
        self.idx = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        for b in chunkyreader(f):
            sum.update(b)
        packbin = sum.digest()
        # the whole file was just read, so this appends the checksum
        f.write(packbin)
        f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)

        # the final name is derived from the hash of the index's sha table
        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        if run_midx:
            auto_midx(repo('objects/pack'))
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write the index for the finished pack to 'filename'.

        'idx' is the per-first-byte entry list built by _update_idx() and
        'packbin' the pack's checksum.  Returns the hex Sha1 of the 20-byte
        hash table; _end() uses it to name the pack.
        """
        idx_f = open(filename, 'w+b')
        idx_f.write('\377tOc\0\0\0\2')

        # pre-size the file: 8-byte header, 256 4-byte fanout entries and
        # 28 bytes per object, then map it for _helpers.write_idx()
        ofs64_ofs = 8 + 4*256 + 28*self.count
        idx_f.truncate(ofs64_ofs)
        idx_f.seek(0)
        idx_map = mmap_readwrite(idx_f, close=False)
        # NOTE(review): presumably write_idx() fills the mapped tables and
        # appends any extra data through idx_f at the end -- confirm in
        # _helpers
        idx_f.seek(0, SEEK_END)
        count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
        assert(count == self.count)
        idx_map.close()
        idx_f.write(packbin)

        # checksum the whole index; the 20*count bytes that follow the
        # fanout table are also hashed on their own
        idx_f.seek(0)
        idx_sum = Sha1()
        b = idx_f.read(8 + 4*256)
        idx_sum.update(b)

        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            idx_sum.update(b)
            obj_list_sum.update(b)
        namebase = obj_list_sum.hexdigest()

        for b in chunkyreader(idx_f):
            idx_sum.update(b)
        idx_f.write(idx_sum.digest())
        idx_f.close()

        return namebase
 654
 655
 656 def _git_date(date):
 657     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 658
 659
 660 def _gitenv():
 661     os.environ['GIT_DIR'] = os.path.abspath(repo())
 662
 663
 664 def list_refs(refname = None):
 665     """Generate a list of tuples in the form (refname,hash).
 666     If a ref name is specified, list only this particular ref.
 667     """
 668     argv = ['git', 'show-ref', '--']
 669     if refname:
 670         argv += [refname]
 671     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 672     out = p.stdout.read().strip()
 673     rv = p.wait()  # not fatal
 674     if rv:
 675         assert(not out)
 676     if out:
 677         for d in out.split('\n'):
 678             (sha, name) = d.split(' ', 1)
 679             yield (name, sha.decode('hex'))
 680
 681
 682 def read_ref(refname):
 683     """Get the commit id of the most recent commit made on a given ref."""
 684     l = list(list_refs(refname))
 685     if l:
 686         assert(len(l) == 1)
 687         return l[0][1]
 688     else:
 689         return None
 690
 691
 692 def rev_list(ref, count=None):
 693     """Generate a list of reachable commits in reverse chronological order.
 694
 695     This generator walks through commits, from child to parent, that are
 696     reachable via the specified ref and yields a series of tuples of the form
 697     (date,hash).
 698
 699     If count is a non-zero integer, limit the number of commits to "count"
 700     objects.
 701     """
 702     assert(not ref.startswith('-'))
 703     opts = []
 704     if count:
 705         opts += ['-n', str(atoi(count))]
 706     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 707     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 708     commit = None
 709     for row in p.stdout:
 710         s = row.strip()
 711         if s.startswith('commit '):
 712             commit = s[7:].decode('hex')
 713         else:
 714             date = int(s)
 715             yield (date, commit)
 716     rv = p.wait()  # not fatal
 717     if rv:
 718         raise GitError, 'git rev-list returned error %d' % rv
 719
 720
 721 def rev_get_date(ref):
 722     """Get the date of the latest commit on the specified ref."""
 723     for (date, commit) in rev_list(ref, count=1):
 724         return date
 725     raise GitError, 'no such commit %r' % ref
 726
 727
 728 def rev_parse(committish):
 729     """Resolve the full hash for 'committish', if it exists.
 730
 731     Should be roughly equivalent to 'git rev-parse'.
 732
 733     Returns the hex value of the hash if it is found, None if 'committish' does
 734     not correspond to anything.
 735     """
 736     head = read_ref(committish)
 737     if head:
 738         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 739         return head
 740
 741     pL = PackIdxList(repo('objects/pack'))
 742
 743     if len(committish) == 40:
 744         try:
 745             hash = committish.decode('hex')
 746         except TypeError:
 747             return None
 748
 749         if pL.exists(hash):
 750             return hash
 751
 752     return None
 753
 754
 755 def update_ref(refname, newval, oldval):
 756     """Change the commit pointed to by a branch."""
 757     if not oldval:
 758         oldval = ''
 759     assert(refname.startswith('refs/heads/'))
 760     p = subprocess.Popen(['git', 'update-ref', refname,
 761                           newval.encode('hex'), oldval.encode('hex')],
 762                          preexec_fn = _gitenv)
 763     _git_wait('git update-ref', p)
 764
 765
 766 def guess_repo(path=None):
 767     """Set the path value in the global variable "repodir".
 768     This makes bup look for an existing bup repository, but not fail if a
 769     repository doesn't exist. Usually, if you are interacting with a bup
 770     repository, you would not be calling this function but using
 771     check_repo_or_die().
 772     """
 773     global repodir
 774     if path:
 775         repodir = path
 776     if not repodir:
 777         repodir = os.environ.get('BUP_DIR')
 778         if not repodir:
 779             repodir = os.path.expanduser('~/.bup')
 780
 781
 782 def init_repo(path=None):
 783     """Create the Git bare repository for bup in a given path."""
 784     guess_repo(path)
 785     d = repo()  # appends a / to the path
 786     parent = os.path.dirname(os.path.dirname(d))
 787     if parent and not os.path.exists(parent):
 788         raise GitError('parent directory "%s" does not exist\n' % parent)
 789     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 790         raise GitError('"%d" exists but is not a directory\n' % d)
 791     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 792                          preexec_fn = _gitenv)
 793     _git_wait('git init', p)
 794     # Force the index version configuration in order to ensure bup works
 795     # regardless of the version of the installed Git binary.
 796     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 797                          stdout=sys.stderr, preexec_fn = _gitenv)
 798     _git_wait('git config', p)
 799
 800
 801 def check_repo_or_die(path=None):
 802     """Make sure a bup repository exists, and abort if not.
 803     If the path to a particular repository was not specified, this function
 804     initializes the default repository automatically.
 805     """
 806     guess_repo(path)
 807     if not os.path.isdir(repo('objects/pack/.')):
 808         if repodir == home_repodir:
 809             init_repo()
 810         else:
 811             log('error: %r is not a bup/git repository\n' % repo())
 812             sys.exit(15)
 813
 814
 815 def treeparse(buf):
 816     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
 817     ofs = 0
 818     while ofs < len(buf):
 819         z = buf[ofs:].find('\0')
 820         assert(z > 0)
 821         spl = buf[ofs:ofs+z].split(' ', 1)
 822         assert(len(spl) == 2)
 823         sha = buf[ofs+z+1:ofs+z+1+20]
 824         ofs += z+1+20
 825         yield (spl[0], spl[1], sha)
 826
 827
 828 _ver = None
 829 def ver():
 830     """Get Git's version and ensure a usable version is installed.
 831
 832     The returned version is formatted as an ordered tuple with each position
 833     representing a digit in the version tag. For example, the following tuple
 834     would represent version 1.6.6.9:
 835
 836         ('1', '6', '6', '9')
 837     """
 838     global _ver
 839     if not _ver:
 840         p = subprocess.Popen(['git', '--version'],
 841                              stdout=subprocess.PIPE)
 842         gvs = p.stdout.read()
 843         _git_wait('git --version', p)
 844         m = re.match(r'git version (\S+.\S+)', gvs)
 845         if not m:
 846             raise GitError('git --version weird output: %r' % gvs)
 847         _ver = tuple(m.group(1).split('.'))
 848     needed = ('1','5', '3', '1')
 849     if _ver < needed:
 850         raise GitError('git version %s or higher is required; you have %s'
 851                        % ('.'.join(needed), '.'.join(_ver)))
 852     return _ver
 853
 854
 855 def _git_wait(cmd, p):
 856     rv = p.wait()
 857     if rv != 0:
 858         raise GitError('%s returned %d' % (cmd, rv))
 859
 860
 861 def _git_capture(argv):
 862     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 863     r = p.stdout.read()
 864     _git_wait(repr(argv), p)
 865     return r
 866
 867
 868 class _AbortableIter:
 869     def __init__(self, it, onabort = None):
 870         self.it = it
 871         self.onabort = onabort
 872         self.done = None
 873
 874     def __iter__(self):
 875         return self
 876
 877     def next(self):
 878         try:
 879             return self.it.next()
 880         except StopIteration, e:
 881             self.done = True
 882             raise
 883         except:
 884             self.abort()
 885             raise
 886
 887     def abort(self):
 888         """Abort iteration and call the abortion callback, if needed."""
 889         if not self.done:
 890             self.done = True
 891             if self.onabort:
 892                 self.onabort()
 893
 894     def __del__(self):
 895         self.abort()
 896
 897
 898 _ver_warned = 0
 899 class CatPipe:
 900     """Link to 'git cat-file' that is used to retrieve blob data."""
 901     def __init__(self):
 902         global _ver_warned
 903         wanted = ('1','5','6')
 904         if ver() < wanted:
 905             if not _ver_warned:
 906                 log('warning: git version < %s; bup will be slow.\n'
 907                     % '.'.join(wanted))
 908                 _ver_warned = 1
 909             self.get = self._slow_get
 910         else:
 911             self.p = self.inprogress = None
 912             self.get = self._fast_get
 913
 914     def _abort(self):
 915         if self.p:
 916             self.p.stdout.close()
 917             self.p.stdin.close()
 918         self.p = None
 919         self.inprogress = None
 920
 921     def _restart(self):
 922         self._abort()
 923         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
 924                                   stdin=subprocess.PIPE,
 925                                   stdout=subprocess.PIPE,
 926                                   close_fds = True,
 927                                   bufsize = 4096,
 928                                   preexec_fn = _gitenv)
 929
 930     def _fast_get(self, id):
 931         if not self.p or self.p.poll() != None:
 932             self._restart()
 933         assert(self.p)
 934         assert(self.p.poll() == None)
 935         if self.inprogress:
 936             log('_fast_get: opening %r while %r is open'
 937                 % (id, self.inprogress))
 938         assert(not self.inprogress)
 939         assert(id.find('\n') < 0)
 940         assert(id.find('\r') < 0)
 941         assert(not id.startswith('-'))
 942         self.inprogress = id
 943         self.p.stdin.write('%s\n' % id)
 944         self.p.stdin.flush()
 945         hdr = self.p.stdout.readline()
 946         if hdr.endswith(' missing\n'):
 947             self.inprogress = None
 948             raise KeyError('blob %r is missing' % id)
 949         spl = hdr.split(' ')
 950         if len(spl) != 3 or len(spl[0]) != 40:
 951             raise GitError('expected blob, got %r' % spl)
 952         (hex, type, size) = spl
 953
 954         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
 955                            onabort = self._abort)
 956         try:
 957             yield type
 958             for blob in it:
 959                 yield blob
 960             assert(self.p.stdout.readline() == '\n')
 961             self.inprogress = None
 962         except Exception, e:
 963             it.abort()
 964             raise
 965
 966     def _slow_get(self, id):
 967         assert(id.find('\n') < 0)
 968         assert(id.find('\r') < 0)
 969         assert(id[0] != '-')
 970         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
 971         yield type
 972
 973         p = subprocess.Popen(['git', 'cat-file', type, id],
 974                              stdout=subprocess.PIPE,
 975                              preexec_fn = _gitenv)
 976         for blob in chunkyreader(p.stdout):
 977             yield blob
 978         _git_wait('git cat-file', p)
 979
 980     def _join(self, it):
 981         type = it.next()
 982         if type == 'blob':
 983             for blob in it:
 984                 yield blob
 985         elif type == 'tree':
 986             treefile = ''.join(it)
 987             for (mode, name, sha) in treeparse(treefile):
 988                 for blob in self.join(sha.encode('hex')):
 989                     yield blob
 990         elif type == 'commit':
 991             treeline = ''.join(it).split('\n')[0]
 992             assert(treeline.startswith('tree '))
 993             for blob in self.join(treeline[5:]):
 994                 yield blob
 995         else:
 996             raise GitError('invalid object type %r: expected blob/tree/commit'
 997                            % type)
 998
 999     def join(self, id):
1000         """Generate a list of the content of all blobs that can be reached
1001         from an object.  The hash given in 'id' must point to a blob, a tree
1002         or a commit. The content of all blobs that can be seen from trees or
1003         commits will be added to the list.
1004         """
1005         try:
1006             for d in self._join(self.get(id)):
1007                 yield d
1008         except StopIteration:
1009             log('booger!\n')
1010
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Only refs under 'refs/tags/' are considered, and that prefix is stripped
    from the names.  Several tags pointing at the same hash share one list.
    """
    prefix = 'refs/tags/'
    by_hash = {}
    for (refname, sha) in list_refs():
        if not refname.startswith(prefix):
            continue
        # setdefault creates the list the first time a hash is seen.
        by_hash.setdefault(sha, []).append(refname[len(prefix):])
    return by_hash