1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, zlib, time, subprocess, struct, stat, re, tempfile
7 from bup.helpers import *
13 home_repodir = os.path.expanduser('~/.bup')
16 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
17 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
class GitError(Exception):
    """Base exception for all errors raised by this git interaction library."""
28 """Get the path to the git repository or one of its subdirectories."""
31 raise GitError('You should call check_repo_or_die()')
33 # If there's a .git subdirectory, then the actual repo is in there.
34 gd = os.path.join(repodir, '.git')
35 if os.path.exists(gd):
38 return os.path.join(repodir, sub)
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    # A real regular file stored as a non-regular git object was chunked.
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # (return of name + '.bup' elided in this view)
    # A name that already looks mangled gets '.bupl' to disambiguate.
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
(BUP_NORMAL, BUP_CHUNKED) = (0,1)  # result modes for demangle_name()

def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    # anything else was never mangled
    return (name, BUP_NORMAL)
def _encode_packobj(type, content):
    # Yield the pack-format encoding of one object: a varint size/type
    # header followed by the zlib-compressed payload.
    # (size computation and header-emission loop elided in this view)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    # compression level 1: favor speed over ratio
    z = zlib.compressobj(1)
    yield z.compress(content)
    # NOTE(review): a trailing `yield z.flush()` appears to be elided here.
def _encode_looseobj(type, content):
    # Loose objects are "<type> <len>\0<payload>", zlib-compressed.
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    # NOTE(review): a final `yield z.flush()` is elided in this view; without
    # it the zlib stream would be truncated.
def _decode_looseobj(buf):
    """Decompress a loose object and return its (type, content) pair."""
    s = zlib.decompress(buf)
    # (parsing of the "<type> <size>\0" header into type/sz/content is
    # elided in this view)
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    # Decode a pack-format object: bits 4-6 of the first byte give the type,
    # the low bits plus continuation bytes give the size (little-endian varint).
    type = _typermap[(c & 0x70) >> 4]
    # (varint size-decoding loop partially elided in this view; c, i and
    # shift come from the elided lines)
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
132 """Object representation of a Git pack index file."""
133 def __init__(self, filename):
135 self.map = mmap_read(open(filename))
136 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
137 self.fanout = list(struct.unpack('!256I',
138 str(buffer(self.map, 8, 256*4))))
139 self.fanout.append(0) # entry "-1"
140 nsha = self.fanout[255]
141 self.ofstable = buffer(self.map,
142 8 + 256*4 + nsha*20 + nsha*4,
144 self.ofs64table = buffer(self.map,
145 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
def _ofs_from_idx(self, idx):
    """Return the pack-file offset of the object at table index *idx*."""
    ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
    # (the high-bit test selecting the 64-bit path is elided in this view)
    idx64 = ofs & 0x7fffffff
    # BUG?: '!I' unpacks only 4 bytes but the slice is 8 bytes wide; a
    # 64-bit offset entry needs '!Q' here, otherwise struct.unpack raises.
    ofs = struct.unpack('!I',
                        str(buffer(self.ofs64table, idx64*8, 8)))[0]
    # (final `return ofs` elided in this view)
def _idx_from_hash(self, hash):
    """Binary-search the sha table; return the entry index, or None."""
    global _total_searches, _total_steps
    # (search-counter increment elided in this view)
    assert(len(hash) == 20)
    # b1 (first byte of the hash, computed on an elided line) picks the
    # fanout bucket that brackets the candidate range
    start = self.fanout[b1-1] # range -1..254
    end = self.fanout[b1] # range 0..255
    buf = buffer(self.map, 8 + 256*4, end*20)
    _total_steps += 1 # lookup table is a step
    # binary search within [start, end)
    mid = start + (end-start)/2
    v = str(buf[mid*20:(mid+1)*20])
    # (comparison, range narrowing and return are elided in this view)
def find_offset(self, hash):
    """Get the offset of an object inside the index file."""
    idx = self._idx_from_hash(hash)
    # (the not-found check is elided in this view)
    return self._ofs_from_idx(idx)
def exists(self, hash):
    """Return nonempty (True) if the object exists in this index, else None."""
    # A falsy hash can never match; preserve the caller-visible None result.
    if not hash:
        return None
    # Found entries report True; misses report None (never False).
    if self._idx_from_hash(hash) != None:
        return True
    return None
# Body fragments of PackIdx.__iter__ and __len__ (def lines elided).
# __iter__: yield each 20-byte sha straight out of the mmap'd table.
for i in xrange(self.fanout[255]):
    yield buffer(self.map, 8 + 256*4 + 20*i, 20)
# __len__: the last fanout entry is the total object count.
return int(self.fanout[255])
def extract_bits(buf, nbits):
    """Take the first 'nbits' bits from 'buf' and return them as an integer."""
    mask = (1<<nbits) - 1
    # read the first 4 bytes big-endian, then keep only the top nbits
    v = struct.unpack('!I', buf[0:4])[0]
    v = (v >> (32-nbits)) & mask
    # (final `return v` elided in this view)
205 """Wrapper which contains data from multiple index files.
206 Multiple index (.midx) files constitute a wrapper around index (.idx) files
207 and make it possible for bup to expand Git's indexing capabilities to vast
210 def __init__(self, filename):
212 self.force_keep = False
213 assert(filename.endswith('.midx'))
214 self.map = mmap_read(open(filename))
215 if str(self.map[0:4]) != 'MIDX':
216 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
217 self.force_keep = True
218 return self._init_failed()
219 ver = struct.unpack('!I', self.map[4:8])[0]
220 if ver < MIDX_VERSION:
221 log('Warning: ignoring old-style (v%d) midx %r\n'
223 self.force_keep = False # old stuff is boring
224 return self._init_failed()
225 if ver > MIDX_VERSION:
226 log('Warning: ignoring too-new (v%d) midx %r\n'
228 self.force_keep = True # new stuff is exciting
229 return self._init_failed()
231 self.bits = struct.unpack('!I', self.map[8:12])[0]
232 self.entries = 2**self.bits
233 self.fanout = buffer(self.map, 12, self.entries*4)
234 shaofs = 12 + self.entries*4
235 nsha = self._fanget(self.entries-1)
236 self.shalist = buffer(self.map, shaofs, nsha*20)
237 self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
def _init_failed(self):
    # Reset to a harmless empty state after a bad/old/too-new header.
    # (bits/entries resets elided in this view)
    self.fanout = buffer('\0\0\0\0')
    self.shalist = buffer('\0'*20)
    # (idxnames reset elided in this view)

def _fanget(self, i):
    """Return fanout entry *i* as an integer."""
    # (computation of `start` from i, presumably i*4, is elided)
    s = self.fanout[start:start+4]
    return struct.unpack('!I', s)[0]
def exists(self, hash):
    """Return nonempty if the object exists in the index files."""
    global _total_searches, _total_steps
    # (search-counter increment and `want` setup elided in this view)
    el = extract_bits(want, self.bits)
    # the fanout bucket brackets the candidate range in the sha table
    start = self._fanget(el-1)
    # (el == 0 edge handling elided)
    end = self._fanget(el)
    _total_steps += 1 # lookup table is a step
    # binary search within [start, end)
    mid = start + (end-start)/2
    v = str(self.shalist[mid*20:(mid+1)*20])
    # (comparison, range narrowing and return elided in this view)

# Body fragments of PackMidx.__iter__ and __len__ (def lines elided).
for i in xrange(self._fanget(self.entries-1)):
    yield buffer(self.shalist, i*20, 20)
# __len__: total sha count from the last fanout entry.
return int(self._fanget(self.entries-1))
def __init__(self, dir):
    # (packs/also initialization and refresh() call elided in this view)
    assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
# __del__ fragment (def line elided):
assert(_mpi_count == 0)
# __iter__ fragment: merge-iterate every contained index in sha order.
return iter(idxmerge(self.packs))
# __len__ fragment: total object count across all packs.
return sum(len(pack) for pack in self.packs)
def exists(self, hash):
    """Return nonempty if the object exists in the index files."""
    global _total_searches
    # (search-counter increment elided in this view)
    if hash in self.also:
        # (return elided)
    for i in range(len(self.packs)):
        # (p = self.packs[i] elided)
        _total_searches -= 1 # will be incremented by sub-pack
        # (per-pack hit test elided)
        # reorder so most recently used packs are searched first
        self.packs = [p] + self.packs[:i] + self.packs[i+1:]
        # (hit return / miss fall-through elided in this view)
def refresh(self, skip_midx = False):
    """Refresh the index list.
    This method verifies if .midx files were superseded (e.g. all of its
    contents are in another, bigger .midx file) and removes the superseded
    files.

    If skip_midx is True, all work on .midx files will be skipped and .midx
    files will be removed from the list.

    The module-global variable 'ignore_midx' can force this function to
    always act as if skip_midx was True.
    """
    skip_midx = skip_midx or ignore_midx
    # keep already-loaded indexes, except midx files when skipping them
    d = dict((p.name, p) for p in self.packs
             if not skip_midx or not isinstance(p, PackMidx))
    if os.path.exists(self.dir):
        # (midx scan setup, e.g. midxl = [], elided in this view)
        # remember which .idx files are already covered by a loaded midx
        for ix in self.packs:
            if isinstance(ix, PackMidx):
                for name in ix.idxnames:
                    d[os.path.join(self.dir, name)] = ix
        for f in os.listdir(self.dir):
            full = os.path.join(self.dir, f)
            if f.endswith('.midx') and not d.get(full):
                # (mx = PackMidx(full) elided)
                (mxd, mxf) = os.path.split(mx.name)
                # (broken-midx bookkeeping elided)
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n' +
                             ' used by %s\n') % (n, mxf))
                # (collection of usable midx files elided)
        # prefer midx files covering the most objects
        midxl.sort(lambda x,y: -cmp(len(x),len(y)))
        # (per-midx adoption loop header elided)
        for sub in ix.idxnames:
            found = d.get(os.path.join(self.dir, sub))
            if not found or isinstance(found, PackIdx):
                # doesn't exist, or exists but not in a midx
                # (adoption bookkeeping elided)
        for name in ix.idxnames:
            d[os.path.join(self.dir, name)] = ix
        # NOTE(review): `any` shadows the builtin; set by an elided line above
        if not any and not ix.force_keep:
            log('midx: removing redundant: %s\n'
                % os.path.basename(ix.name))
            # (os.unlink of the redundant midx elided)
        # finally pick up plain .idx files not covered by any midx
        for f in os.listdir(self.dir):
            full = os.path.join(self.dir, f)
            if f.endswith('.idx') and not d.get(full):
                # (ix = PackIdx(full); d[full] = ix elided)
    self.packs = list(set(d.values()))
    log('PackIdxList: using %d index%s.\n'
        % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
381 """Insert an additional object in the list."""
385 """Remove all additional objects from the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # git hashes "<type> <len>\0" followed by the content
    header = '%s %d\0' % (type, len(content))
    # (sha1 update over header+content and digest return elided in this view)
def _shalist_sort_key(ent):
    # Sort key for tree entries: git sorts as if directories had a
    # trailing '/'. *ent* is a (mode, name, id) tuple; mode is octal text.
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):
        # (directory key, presumably name + '/', elided in this view)
def idxmerge(idxlist):
    """Generate a list of all the objects reachable in a PackIdxList."""
    total = sum(len(i) for i in idxlist)
    iters = (iter(i) for i in idxlist)
    # seed a heap with the first sha from each index for an ordered merge
    heap = [(next(it), it) for it in iters]
    # (heapify call and the merge-loop header are elided in this view)
    if (count % 10024) == 0:
        progress('Reading indexes: %.2f%% (%d/%d)\r'
                 % (count*100.0/total, count, total))
    # (dedup/yield of the smallest sha and iterator advance elided)
    heapq.heapreplace(heap, (e, it))
    # (exhausted-iterator handling elided)
    log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
431 """Writes Git objects insid a pack file."""
432 def __init__(self, objcache_maker=None):
437 self.objcache_maker = objcache_maker
def _make_objcache(self):
    # Lazily build the existence cache consulted by exists()/maybe_write().
    if self.objcache == None:
        if self.objcache_maker:
            self.objcache = self.objcache_maker()
        # (an `else:` line is elided here; the default cache follows)
            self.objcache = PackIdxList(repo('objects/pack'))
    # Body fragment of PackWriter._open() (def line elided in this view).
    self._make_objcache()
    # create the pack as a temp file inside the repository's objects dir
    (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
    self.file = os.fdopen(fd, 'w+b')
    assert(name.endswith('.pack'))
    self.filename = name[:-5]
    # pack header: magic, version 2, zero object count (patched in _end())
    self.file.write('PACK\0\0\0\2\0\0\0\0')
def _raw_write(self, datalist):
    # (self._open() call and local file handle binding elided in this view)
    # in case we get interrupted (eg. KeyboardInterrupt), it's best if
    # the file never has a *partial* blob. So let's make sure it's
    # all-or-nothing. (The blob shouldn't be very big anyway, thanks
    # to our hashsplit algorithm.) f.write() does its own buffering,
    # but that's okay because we'll flush it in _end().
    oneblob = ''.join(datalist)
    # (the single f.write(oneblob) call is elided)
    self.outbytes += len(oneblob)
    # (object-count increment elided)
def _write(self, bin, type, content):
    # (verbosity logging elided in this view)
    self._raw_write(_encode_packobj(type, content))
    # (presumably `return bin` is elided)

def breakpoint(self):
    """Clear byte and object counts and return the last processed id."""
    # (pack finalization / reopen elided in this view)
    self.outbytes = self.count = 0
    # (return of the last id elided)
def write(self, type, content):
    """Hash *content* as a git object of *type* and write it to the pack."""
    sha = calc_hash(type, content)
    return self._write(sha, type, content)
def exists(self, id):
    """Return non-empty if *id* is already known to the object cache."""
    cache = self.objcache
    if not cache:
        # No usable cache yet -- build the default one before looking up.
        self._make_objcache()
        cache = self.objcache
    return cache.exists(id)
def maybe_write(self, type, content):
    """Write an object to the pack file if not present and return its id."""
    bin = calc_hash(type, content)
    if not self.exists(bin):
        self._write(bin, type, content)
        # remember it so a second maybe_write() of the same content skips
        self.objcache.add(bin)
    # (final `return bin` elided in this view)
def new_blob(self, blob):
    """Add *blob* to the pack (if not already present); return its id."""
    obj_type = 'blob'
    return self.maybe_write(obj_type, blob)
def new_tree(self, shalist):
    """Create a tree object in the pack."""
    # git requires tree entries in its own sort order
    shalist = sorted(shalist, key = _shalist_sort_key)
    # (accumulator list creation elided in this view)
    for (mode,name,bin) in shalist:
        # (mode normalization elided)
        # git tree modes must not be zero-padded
        assert(mode[0] != '0')
        # (name validation elided)
        assert(len(bin) == 20)
        # entry layout: "<mode> <name>\0<20-byte sha>"
        l.append('%s %s\0%s' % (mode,name,bin))
    return self.maybe_write('tree', ''.join(l))
def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
    # Build the commit object text line by line; each field is optional.
    # (accumulator list creation elided in this view)
    if tree: l.append('tree %s' % tree.encode('hex'))
    if parent: l.append('parent %s' % parent.encode('hex'))
    if author: l.append('author %s %s' % (author, _git_date(adate)))
    if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
    # (blank separator line and message append elided)
    return self.maybe_write('commit', '\n'.join(l))
def new_commit(self, parent, tree, msg):
    """Create a commit object in the pack."""
    # (timestamp capture, presumably `now`, elided in this view)
    userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
    commit = self._new_commit(tree, parent,
                              userline, now, userline, now,
    # (message argument and final return elided in this view)
539 """Remove the pack file from disk."""
544 os.unlink(self.filename + '.pack')
548 if not f: return None
552 # update object count
554 cp = struct.pack('!i', self.count)
558 # calculate the pack sha1sum
565 f.write(sum.digest())
569 p = subprocess.Popen(['git', 'index-pack', '-v',
571 self.filename + '.pack'],
572 preexec_fn = _gitenv,
573 stdout = subprocess.PIPE)
574 out = p.stdout.read().strip()
575 _git_wait('git index-pack', p)
577 raise GitError('git index-pack produced no output')
578 nameprefix = repo('objects/pack/%s' % out)
579 if os.path.exists(self.filename + '.map'):
580 os.unlink(self.filename + '.map')
581 os.rename(self.filename + '.pack', nameprefix + '.pack')
582 os.rename(self.filename + '.idx', nameprefix + '.idx')
586 """Close the pack file and move it to its definitive path."""
# Body fragment of _git_date(date): "<epoch> <tz-offset>" as git expects.
return '%d %s' % (date, time.strftime('%z', time.localtime(date)))

# Body fragment of _gitenv(): point git at our repository before exec.
os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    # (optional refname argument append elided in this view)
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    # (sanity checks on rv/out elided)
    for d in out.split('\n'):
        # show-ref output is "<sha> <refname>" per line
        (sha, name) = d.split(' ', 1)
        yield (name, sha.decode('hex'))
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    # (match-count handling and hash extraction/return elided in this view)
def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date, commit).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    # the trailing '--' below relies on ref not looking like an option
    assert(not ref.startswith('-'))
    # (opts initialization and count check elided in this view)
    opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    # (output line loop header elided)
    if s.startswith('commit '):
        commit = s[7:].decode('hex')
        # (date parsing from the %ct line and the yield elided)
    rv = p.wait() # not fatal
    # (nonzero-exit check elided)
    raise GitError, 'git rev-list returned error %d' % rv
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    for (date, commit) in rev_list(ref, count=1):
        # (return of date elided in this view)
    raise GitError, 'no such commit %r' % ref
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    # (oldval sanity checking elided in this view)
    # only branch heads may be updated this way
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # (global declaration and explicit-path handling elided in this view)
    # fall back to $BUP_DIR, then the default ~/.bup
    repodir = os.environ.get('BUP_DIR')
    # (unset-variable check elided)
    repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the target path exists but is not a directory.
    """
    guess_repo(path)
    d = repo()  # repodir was just set by guess_repo()
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # fix: was '"%d"' -- %d can't format a string path and would raise
        # TypeError instead of the intended GitError message
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    # (guess_repo(path) call elided in this view)
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            # (auto-initialization of the default repository elided)
        # (else branch header elided:)
            log('error: %r is not a bup/git repository\n' % repo())
            # (process exit elided)
720 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
722 while ofs < len(buf):
723 z = buf[ofs:].find('\0')
725 spl = buf[ofs:ofs+z].split(' ', 1)
726 assert(len(spl) == 2)
727 sha = buf[ofs+z+1:ofs+z+1+20]
729 yield (spl[0], spl[1], sha)
734 """Get Git's version and ensure a usable version is installed.
736 The returned version is formatted as an ordered tuple with each position
737 representing a digit in the version tag. For example, the following tuple
738 would represent version 1.6.6.9:
744 p = subprocess.Popen(['git', '--version'],
745 stdout=subprocess.PIPE)
746 gvs = p.stdout.read()
747 _git_wait('git --version', p)
748 m = re.match(r'git version (\S+.\S+)', gvs)
750 raise GitError('git --version weird output: %r' % gvs)
751 _ver = tuple(m.group(1).split('.'))
752 needed = ('1','5', '3', '1')
754 raise GitError('git version %s or higher is required; you have %s'
755 % ('.'.join(needed), '.'.join(_ver)))
def _git_wait(cmd, p):
    # Wait for subprocess *p*; raise GitError if it exited nonzero.
    # (rv = p.wait() and the nonzero-exit check elided in this view)
    raise GitError('%s returned %d' % (cmd, rv))

def _git_capture(argv):
    # Run a git command and return its entire stdout as one string.
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    # (reading of p.stdout elided)
    _git_wait(repr(argv), p)
    # (return of the captured output elided)
class _AbortableIter:
    # Iterator wrapper that can be cleanly abandoned mid-stream via abort().
    def __init__(self, it, onabort = None):
        # (self.it assignment elided in this view)
        self.onabort = onabort
        # (done-flag initialization elided)

    # next() fragment (surrounding try/def lines elided):
        return self.it.next()
    except StopIteration, e:
        # (done bookkeeping and re-raise elided)

    # abort() docstring fragment:
    """Abort iteration and call the abortion callback, if needed."""
804 """Link to 'git cat-file' that is used to retrieve blob data."""
807 wanted = ('1','5','6')
810 log('warning: git version < %s; bup will be slow.\n'
813 self.get = self._slow_get
815 self.p = self.inprogress = None
816 self.get = self._fast_get
820 self.p.stdout.close()
823 self.inprogress = None
827 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
828 stdin=subprocess.PIPE,
829 stdout=subprocess.PIPE,
831 preexec_fn = _gitenv)
def _fast_get(self, id):
    # restart the cat-file helper if it has died or never started
    if not self.p or self.p.poll() != None:
        # (restart call elided in this view)
    assert(self.p.poll() == None)
    # (in-progress guard elided)
    log('_fast_get: opening %r while %r is open'
        % (id, self.inprogress))
    assert(not self.inprogress)
    # ids go to the helper on a single line; forbid embedded newlines
    assert(id.find('\n') < 0)
    assert(id.find('\r') < 0)
    # (inprogress bookkeeping elided)
    self.p.stdin.write('%s\n' % id)
    hdr = self.p.stdout.readline()
    if hdr.endswith(' missing\n'):
        raise KeyError('blob %r is missing' % id)
    # (splitting of the "<sha> <type> <size>" header into spl elided)
    if len(spl) != 3 or len(spl[0]) != 40:
        raise GitError('expected blob, got %r' % spl)
    (hex, type, size) = spl
    # (yield of the object type elided)
    it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                        onabort = self._abort)
    # (chunk-yielding loop elided)
    assert(self.p.stdout.readline() == '\n')  # eat the trailing blank line
    self.inprogress = None
    # (exception cleanup / abort path elided in this view)
def _slow_get(self, id):
    # One 'git cat-file' subprocess per object: correct but slow.
    assert(id.find('\n') < 0)
    assert(id.find('\r') < 0)
    # (additional id validation elided in this view)
    type = _git_capture(['git', 'cat-file', '-t', id]).strip()
    # (yield of the type elided)
    p = subprocess.Popen(['git', 'cat-file', type, id],
                         stdout=subprocess.PIPE,
                         preexec_fn = _gitenv)
    for blob in chunkyreader(p.stdout):
        # (yield of each blob chunk elided)
    _git_wait('git cat-file', p)
# Fragment of CatPipe._join(it) (def line and type-dispatch header elided).
# tree branch: recurse into every entry of the tree
treefile = ''.join(it)
for (mode, name, sha) in treeparse(treefile):
    for blob in self.join(sha.encode('hex')):
        # (yield of blob elided in this view)
elif type == 'commit':
    # a commit's first line names its tree; recurse into that
    treeline = ''.join(it).split('\n')[0]
    assert(treeline.startswith('tree '))
    for blob in self.join(treeline[5:]):
        # (yield of blob elided)
# (else branch header elided:)
raise GitError('invalid object type %r: expected blob/tree/commit'
# (format arguments elided)

# Fragment of CatPipe.join(id):
"""Generate a list of the content of all blobs that can be reached
from an object. The hash given in 'id' must point to a blob, a tree
or a commit. The content of all blobs that can be seen from trees or
commits will be added to the list.
"""
# (try header elided)
for d in self._join(self.get(id)):
    # (yield of d elided)
except StopIteration:
    # (cleanup/logging elided in this view)