1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, zlib, time, subprocess, struct, stat, re, tempfile
7 from bup.helpers import *
8 from bup import _helpers

MIDX_VERSION = 2  # .midx format version parsed below (assumed constant value)

repodir = None  # default repository path; set by check_repo_or_die()
home_repodir = os.path.expanduser('~/.bup')

_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

_total_searches = 0
_total_steps = 0

ignore_midx = 0  # when true, refresh() always acts as if skip_midx were set


class GitError(Exception):
    pass


def repo(sub = ''):
    """Get the path to the git repository or one of its subdirectories."""
    global repodir
    if not repodir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):
        repodir = gd

    return os.path.join(repodir, sub)


def auto_midx(objdir):
    main_exe = os.environ.get('BUP_MAIN_EXE') or sys.argv[0]
    args = [main_exe, 'midx', '--auto', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    if rv:
        add_error('%r: returned %d' % (args, rv))


def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
    else:
        return name


(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of
    the following:

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
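
# Example of the convention above (informal): a regular file that bup stored
# chunked (i.e. as a git tree) gets '.bup' appended, and demangling undoes it:
#
#   >>> mangle_name('report.txt', 0100644, 040000)
#   'report.txt.bup'
#   >>> demangle_name('report.txt.bup')
#   ('report.txt', BUP_CHUNKED)
#   >>> demangle_name('notes.bupl')   # a real file that was named 'notes.bup'
#   ('notes.bup', BUP_NORMAL)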


def _encode_packobj(type, content):
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    szout = ''
    while 1:
        if sz: szbits |= 0x80   # high bit: more size bytes follow
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(1)
    yield szout
    yield z.compress(content)
    yield z.flush()


def _encode_looseobj(type, content):
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()


def _decode_looseobj(buf):
    s = zlib.decompress(buf)
    # header is '<type> <size>\0' followed by the content
    (hdr, content) = s.split('\0', 1)
    (type, sz) = hdr.split(' ', 1)
    sz = int(sz)
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
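
# Loose objects use the same framing git itself uses: 'type SP size NUL
# content', zlib-deflated.  Round trip (informal):
#
#   >>> _decode_looseobj(''.join(_encode_looseobj('blob', 'abcd')))
#   ('blob', 'abcd')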


def _decode_packobj(buf):
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz, shift, i = c & 0x0f, 4, 0
    while c & 0x80:                 # high bit: more size bytes follow
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
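
# Worked example of the header layout consumed above, for a 100000-byte blob:
#
#   byte 0: 0xb0 = 1 011 0000  -> continue bit, type 3 ('blob'), size bits 0..3
#   byte 1: 0xea = 1 1101010   -> continue bit, size bits 4..10
#   byte 2: 0x30 = 0 0110000   -> last byte,    size bits 11..17
#
#   (0x30 << 11) | (0x6a << 4) | 0x0 == 100000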
141 """Object representation of a Git pack index file."""
142 def __init__(self, filename):
144 self.idxnames = [self.name]
145 self.map = mmap_read(open(filename))
146 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
147 self.fanout = list(struct.unpack('!256I',
148 str(buffer(self.map, 8, 256*4))))
149 self.fanout.append(0) # entry "-1"
150 nsha = self.fanout[255]
151 self.ofstable = buffer(self.map,
152 8 + 256*4 + nsha*20 + nsha*4,
154 self.ofs64table = buffer(self.map,
155 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
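
    # Layout of a v2 .idx, as mapped above: 8-byte header, 256-entry fanout
    # table (fanout[b] = number of shas whose first byte is <= b), nsha
    # 20-byte shas, nsha 4-byte CRCs, nsha 4-byte offsets, then a table of
    # 8-byte offsets for entries that don't fit in 31 bits.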

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        if ofs & 0x80000000:
            # high bit set: this is an index into the 64-bit offset table
            idx64 = ofs & 0x7fffffff
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        return ofs

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = ord(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        buf = buffer(self.map, 8 + 256*4, end*20)
        want = str(hash)
        _total_steps += 1 # lookup table is a step
        # plain binary search over the sorted sha table
        while start < end:
            _total_steps += 1
            mid = start + (end-start)/2
            v = str(buf[mid*20:(mid+1)*20])
            if v < want:
                start = mid+1
            elif v > want:
                end = mid
            else: # got it!
                return mid
        return None

    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx is not None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash):
        """Return nonempty if the object exists in this index."""
        if hash and self._idx_from_hash(hash) is not None:
            return True
        return None

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)

    def __len__(self):
        return int(self.fanout[255])


extract_bits = _helpers.extract_bits
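
# extract_bits(buf, nbits) (from the C helper) returns the first nbits of
# buf, read big-endian, as an unsigned int; PackMidx below uses it to turn
# the top bits of a sha into a fanout-table slot.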
210 """Wrapper which contains data from multiple index files.
211 Multiple index (.midx) files constitute a wrapper around index (.idx) files
212 and make it possible for bup to expand Git's indexing capabilities to vast
    def __init__(self, filename):
        self.name = filename
        self.force_keep = False
        assert(filename.endswith('.midx'))
        self.map = mmap_read(open(filename))
        if str(self.map[0:4]) != 'MIDX':
            log('Warning: skipping: invalid MIDX header in %r\n' % filename)
            self.force_keep = True
            return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < MIDX_VERSION:
            log('Warning: ignoring old-style (v%d) midx %r\n'
                % (ver, filename))
            self.force_keep = False  # old stuff is boring
            return self._init_failed()
        if ver > MIDX_VERSION:
            log('Warning: ignoring too-new (v%d) midx %r\n'
                % (ver, filename))
            self.force_keep = True  # new stuff is exciting
            return self._init_failed()

        self.bits = _helpers.firstword(self.map[8:12])
        self.entries = 2**self.bits
        self.fanout = buffer(self.map, 12, self.entries*4)
        shaofs = 12 + self.entries*4
        nsha = self._fanget(self.entries-1)
        self.shalist = buffer(self.map, shaofs, nsha*20)
        self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
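
    # .midx layout, as mapped above: 'MIDX', a 4-byte version, a 4-byte bit
    # count, a fanout table of 2**bits cumulative counts, the sorted 20-byte
    # shas, then the NUL-separated names of the .idx files it covers.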

    def _init_failed(self):
        self.bits = 0
        self.entries = 1
        self.fanout = buffer('\0\0\0\0')
        self.shalist = buffer('\0'*20)
        self.idxnames = []

    def _fanget(self, i):
        start = i*4
        s = self.fanout[start:start+4]
        return _helpers.firstword(s)

    def _get(self, i):
        return str(self.shalist[i*20:(i+1)*20])

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches, _total_steps
        _total_searches += 1
        want = str(hash)
        el = extract_bits(want, self.bits)
        if el:
            start = self._fanget(el-1)
            startv = el << (32-self.bits)
        else:
            start = 0
            startv = 0
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1  # lookup table is a step
        hashv = _helpers.firstword(hash)
        # Interpolation search: guess a position proportional to where the
        # wanted hash falls between the current bounds, instead of bisecting.
        while start < end:
            _total_steps += 1
            mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
            v = self._get(mid)
            if v < want:
                start = mid+1
                startv = _helpers.firstword(v)
            elif v > want:
                end = mid
                endv = _helpers.firstword(v)
            else: # got it!
                return True
        return None

    def __iter__(self):
        for i in xrange(self._fanget(self.entries-1)):
            yield buffer(self.shalist, i*20, 20)

    def __len__(self):
        return int(self._fanget(self.entries-1))
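
# A worked example of PackMidx.exists()'s interpolation step: with bits=4 and
# a wanted hash whose first word is 0x6f000000, el is 6, so the scan starts
# between startv=0x60000000 and endv=0x70000000, and the first guess lands
# about (0x6f000000-0x60000000)/0x10000000 = 15/16 of the way through that
# fanout slot, rather than in the middle as binary search would.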


_mpi_count = 0

class PackIdxList:
    def __init__(self, dir):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = {}
        self.packs = []
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1 # will be incremented by sub-pack
            if p.exists(hash):
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return p.name
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of their
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        skip_midx = skip_midx or ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                for ix in self.packs:
                    if isinstance(ix, PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for f in os.listdir(self.dir):
                    full = os.path.join(self.dir, f)
                    if f.endswith('.midx') and not d.get(full):
                        mx = PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = 0
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     '  used by %s\n') % (n, mxf))
                                broken += 1
                        if not broken:
                            midxl.append(mx)
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))  # biggest first
                for ix in midxl:
                    any = 0
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            d[ix.name] = ix
                            for name in ix.idxnames:
                                d[os.path.join(self.dir, name)] = ix
                            any += 1
                            break
                    if not any and not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        unlink(ix.name)
            for f in os.listdir(self.dir):
                full = os.path.join(self.dir, f)
                if f.endswith('.idx') and not d.get(full):
                    d[full] = PackIdx(full)
        self.packs = list(set(d.values()))
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
399 """Insert an additional object in the list."""
403 """Remove all additional objects from the list."""


def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = '%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
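
# Informal sanity check: this matches git's own object ids.  For example,
# calc_hash('blob', 'foo\n').encode('hex') is
# '257cc5642cb1a054f08cc83f2d943e56fd3ebe99', the same id that
# `echo foo | git hash-object --stdin` prints.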


def _shalist_sort_key(ent):
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):
        # git sorts tree entries as if directory names ended with '/'
        return name + '/'
    else:
        return name


def open_idx(filename):
    if filename.endswith('.idx'):
        return PackIdx(filename)
    elif filename.endswith('.midx'):
        return PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')


def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    total = sum(len(i) for i in idxlist)
    iters = (iter(i) for i in idxlist)
    # next() here is bup.helpers.next(), which returns None when exhausted
    heap = [(next(it), it) for it in iters]
    heapq.heapify(heap)
    count = 0
    last = None
    while heap:
        if (count % 10024) == 0:
            progress('Reading indexes: %.2f%% (%d/%d)\r'
                     % (count*100.0/total, count, total))
        (e, it) = heap[0]
        if e != last:
            yield e
            last = e
        count += 1
        e = next(it)
        if e:
            heapq.heapreplace(heap, (e, it))
        else:
            heapq.heappop(heap)
    if final_progress:
        log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
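
# idxmerge() is a k-way merge: each .idx/.midx iterates its shas in sorted
# order, so a heap of (next_sha, iterator) pairs yields the union in sorted
# order, while the `e != last` check above skips duplicates that appear in
# more than one index.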
459 """Writes Git objects insid a pack file."""
    def __init__(self, objcache_maker=None):
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.file = None
        self.objcache_maker = objcache_maker
        self.objcache = None

    def __del__(self):
        self.close()

    def _make_objcache(self):
        if self.objcache is None:
            if self.objcache_maker:
                self.objcache = self.objcache_maker()
            else:
                self.objcache = PackIdxList(repo('objects/pack'))

    def _open(self):
        if not self.file:
            self._make_objcache()
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            # pack header: 'PACK', version 2, object count (patched in _end)
            self.file.write('PACK\0\0\0\2\0\0\0\0')

    def _raw_write(self, datalist):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        f.write(oneblob)
        self.outbytes += len(oneblob)
        self.count += 1

    def _write(self, bin, type, content):
        self._raw_write(_encode_packobj(type, content))
        return bin

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def write(self, type, content):
        """Write an object in this pack file."""
        return self._write(calc_hash(type, content), type, content)

    def exists(self, id):
        """Return non-empty if an object is found in the object cache."""
        if not self.objcache:
            self._make_objcache()
        return self.objcache.exists(id)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        bin = calc_hash(type, content)
        if not self.exists(bin):
            self._write(bin, type, content)
            self.objcache.add(bin)
        return bin

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        shalist = sorted(shalist, key = _shalist_sort_key)
        l = []
        for (mode,name,bin) in shalist:
            assert(mode[0] != '0')   # git insists modes are not zero-padded
            assert(len(bin) == 20)
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))
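
    # Tree objects built above use git's raw entry encoding:
    # '<octal mode> <name>\0<20-byte binary sha>' per entry, concatenated in
    # git's sort order, e.g. '100644 hello.txt\0' + sha for a regular file
    # and '40000 lib\0' + sha for a subdirectory (note: no leading zero).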

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, msg):
        """Create a commit object in the pack."""
        now = time.time()
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, now, userline, now,
                                  msg)
        return commit
567 """Remove the pack file from disk."""
572 os.unlink(self.filename + '.pack')

    def _end(self):
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        while 1:
            b = f.read(65536)
            sum.update(b)
            if not b: break
        f.write(sum.digest())
        f.close()

        p = subprocess.Popen(['git', 'index-pack', '-v',
                              '--index-version=2',
                              self.filename + '.pack'],
                             preexec_fn = _gitenv,
                             stdout = subprocess.PIPE)
        out = p.stdout.read().strip()
        _git_wait('git index-pack', p)
        if not out:
            raise GitError('git index-pack produced no output')
        nameprefix = repo('objects/pack/%s' % out)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        auto_midx(repo('objects/pack'))
        return nameprefix
616 """Close the pack file and move it to its definitive path."""


def _git_date(date):
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
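
# _git_date() produces git's "seconds-since-epoch SP utc-offset" date format,
# e.g. _git_date(1262304000) -> '1262304000 -0800' on a host in US/Pacific.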


def _gitenv():
    os.environ['GIT_DIR'] = os.path.abspath(repo())


def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    if refname:
        argv += [refname]
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))


def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    else:
        return None


def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date,commit).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    assert(not ref.startswith('-'))
    opts = []
    if count:
        opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    commit = None
    for row in p.stdout:
        s = row.strip()
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        else:
            date = int(s)
            yield (date, commit)
    rv = p.wait() # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)
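
# The stream parsed above alternates 'commit <40 hex digits>' header lines
# with bare %ct timestamp lines, e.g.:
#
#   commit <40 hex digits>
#   1260476304
#
# so each header line sets `commit` and the following date line yields the pair.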


def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    for (date, commit) in rev_list(ref, count=1):
        return date
    raise GitError('no such commit %r' % ref)


def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    if not oldval:
        oldval = ''
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)


def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = os.environ.get('BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser('~/.bup')


def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)


def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    guess_repo(path)
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            init_repo()
        else:
            log('error: %r is not a bup/git repository\n' % repo())
            sys.exit(15)
750 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
752 while ofs < len(buf):
753 z = buf[ofs:].find('\0')
755 spl = buf[ofs:ofs+z].split(' ', 1)
756 assert(len(spl) == 2)
757 sha = buf[ofs+z+1:ofs+z+1+20]
759 yield (spl[0], spl[1], sha)
764 """Get Git's version and ensure a usable version is installed.
766 The returned version is formatted as an ordered tuple with each position
767 representing a digit in the version tag. For example, the following tuple
768 would represent version 1.6.6.9:
774 p = subprocess.Popen(['git', '--version'],
775 stdout=subprocess.PIPE)
776 gvs = p.stdout.read()
777 _git_wait('git --version', p)
778 m = re.match(r'git version (\S+.\S+)', gvs)
780 raise GitError('git --version weird output: %r' % gvs)
781 _ver = tuple(m.group(1).split('.'))
782 needed = ('1','5', '3', '1')
784 raise GitError('git version %s or higher is required; you have %s'
785 % ('.'.join(needed), '.'.join(_ver)))


def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%s returned %d' % (cmd, rv))


def _git_capture(argv):
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r


class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def next(self):
        try:
            return self.it.next()
        except StopIteration:
            self.done = True
            raise
        except:
            self.abort()
            raise

    def abort(self):
        """Abort iteration and call the on-abort callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()
834 """Link to 'git cat-file' that is used to retrieve blob data."""
837 wanted = ('1','5','6')
840 log('warning: git version < %s; bup will be slow.\n'
843 self.get = self._slow_get
845 self.p = self.inprogress = None
846 self.get = self._fast_get
850 self.p.stdout.close()
853 self.inprogress = None
857 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
858 stdin=subprocess.PIPE,
859 stdout=subprocess.PIPE,
861 preexec_fn = _gitenv)
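
    # The 'git cat-file --batch' protocol consumed by _fast_get() below:
    # write one object name per line to stdin; for each, stdout answers
    # '<sha> <type> <size>\n', then <size> bytes of content, then '\n'
    # (or '<name> missing\n' if the object doesn't exist).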

    def _fast_get(self, id):
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        assert(self.p.poll() is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open'
                % (id, self.inprogress))
        assert(not self.inprogress)
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        try:
            yield type
            for blob in it:
                yield blob
            assert(self.p.stdout.readline() == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        yield type

        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        type = it.next()
        if type == 'blob':
            for blob in it:
                yield blob
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif type == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % type)
931 """Generate a list of the content of all blobs that can be reached
932 from an object. The hash given in 'id' must point to a blob, a tree
933 or a commit. The content of all blobs that can be seen from trees or
934 commits will be added to the list.
937 for d in self._join(self.get(id)):
939 except StopIteration: