1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
import os, zlib, time, subprocess, struct, stat, re, tempfile
import sys, heapq

from bup.helpers import *
# Default bup repository location (used when no explicit path is given).
home_repodir = os.path.expanduser('~/.bup')

# Numeric object-type codes used in git packfile headers, and the reverse map.
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
class GitError(Exception):
    """Generic error raised by this git interaction layer."""
def repo(sub = ''):
    """Get the path to the git repository or one of its subdirectories.

    Raises GitError if the module-global 'repodir' has not been set yet
    (i.e. check_repo_or_die() was never called).
    """
    # NOTE(review): the function header and the repodir guard were elided
    # in the reviewed listing and have been restored here.
    global repodir
    if not repodir:
        raise GitError('You should call check_repo_or_die()')
    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):
        repodir = gd
    return os.path.join(repodir, sub)
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.

    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # The file is a regular file but is stored as something else
        # (i.e. chunked into a subtree): mark it so readers reassemble it.
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        # The plain name would be ambiguous with a mangled one (it ends in
        # '.bup', or '.bup' plus one character), so tag it as literal.
        return name + '.bupl'
    else:
        return name
(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is
    one of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        # Explicitly marked "literal": strip the marker, read as-is.
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        # Marked as segmented: the underlying object is a tree of chunks.
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
def _encode_packobj(type, content):
    # First header byte: low 4 bits of the object size in the low nibble,
    # the packfile type code (_typemap) in bits 4-6.
    # NOTE(review): the size-variable setup and the varint continuation
    # loop around this line are elided in this listing -- confirm upstream.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    # Compress the payload with zlib level 1 (speed over ratio).
    z = zlib.compressobj(1)
    yield z.compress(content)
def _encode_looseobj(type, content):
    """Yield the zlib-compressed chunks of a git loose object.

    A loose object is '<type> <size>\\0' followed by the raw content, all
    inside one zlib stream.
    """
    z = zlib.compressobj(1)  # level 1: favour speed over ratio
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    # Flush the compressor; without this the zlib stream is truncated and
    # the object cannot be fully decompressed.
    yield z.flush()
def _decode_looseobj(buf):
    # A loose object is one zlib stream: '<type> <size>\0' + content.
    s = zlib.decompress(buf)
    # NOTE(review): the lines splitting s into type, sz and content are
    # elided in this listing.
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)

def _decode_packobj(buf):
    # High nibble of the first header byte carries the packfile type code.
    # NOTE(review): the varint header loop (c, i, sz, shift) is elided here.
    type = _typermap[(c & 0x70) >> 4]
    sz |= (c & 0x7f) << shift
    # Everything after the header bytes is the zlib-compressed payload.
    return (type, zlib.decompress(buf[i+1:]))
    """Object representation of a Git pack index file."""

    def __init__(self, filename):
        # Map the whole .idx file; all lookups read straight from the map.
        self.map = mmap_read(open(filename))
        # Magic for a version-2 pack index: '\377tOc' followed by version 2.
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        # 256-entry fanout table: fanout[b] = count of shas whose first
        # byte is <= b; an extra 0 is appended so fanout[-1] works.
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]  # total number of objects in this index
        # 4-byte offset table starts after header, fanout, shas and CRCs.
        # NOTE(review): the length argument of this buffer() call appears
        # to be elided in this listing -- confirm upstream.
        self.ofstable = buffer(self.map,
                               8 + 256*4 + nsha*20 + nsha*4,
        # 8-byte offsets (for packs > 2GB) follow the 4-byte offset table.
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
145 def _ofs_from_idx(self, idx):
146 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
148 idx64 = ofs & 0x7fffffff
149 ofs = struct.unpack('!I',
150 str(buffer(self.ofs64table, idx64*8, 8)))[0]
    def _idx_from_hash(self, hash):
        # Binary search for a 20-byte binary sha in the sorted sha table,
        # bounded by the fanout entries for the sha's first byte.
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        # NOTE(review): the assignment of b1 (presumably the first byte of
        # hash) and the surrounding search loop are elided in this listing.
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        buf = buffer(self.map, 8 + 256*4, end*20)
        _total_steps += 1 # lookup table is a step
        mid = start + (end-start)/2
        v = str(buf[mid*20:(mid+1)*20])
175 def find_offset(self, hash):
176 """Get the offset of an object inside the index file."""
177 idx = self._idx_from_hash(hash)
179 return self._ofs_from_idx(idx)
182 def exists(self, hash):
183 """Return nonempty if the object exists in this index."""
184 return hash and (self._idx_from_hash(hash) != None) and True or None
187 for i in xrange(self.fanout[255]):
188 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
191 return int(self.fanout[255])
def extract_bits(buf, nbits):
    """Take the first 'nbits' bits from 'buf' and return them as an integer."""
    mask = (1<<nbits) - 1
    # Read the first 4 bytes big-endian and keep only the top nbits.
    v = struct.unpack('!I', buf[0:4])[0]
    v = (v >> (32-nbits)) & mask
    # BUGFIX: the computed value was never returned in the reviewed
    # listing (the function fell off the end, returning None).
    return v
    """Wrapper which contains data from multiple index files.
    Multiple index (.midx) files constitute a wrapper around index (.idx) files
    and make it possible for bup to expand Git's indexing capabilities to vast

    def __init__(self, filename):
        assert(filename.endswith('.midx'))
        self.map = mmap_read(open(filename))
        # Old-style (version 1) midx files are ignored, not parsed.
        if str(self.map[0:8]) == 'MIDX\0\0\0\1':
            log('Warning: ignoring old-style midx %r\n' % filename)
            # NOTE(review): degenerate-setup lines (bits/entries and the
            # early return) around here are elided in this listing.
            self.fanout = buffer('\0\0\0\0')
            self.shalist = buffer('\0'*20)
        # Version-2 midx layout: magic, bit count, fanout, shas, idx names.
        assert(str(self.map[0:8]) == 'MIDX\0\0\0\2')
        self.bits = struct.unpack('!I', self.map[8:12])[0]
        self.entries = 2**self.bits  # fanout has 2**bits entries
        self.fanout = buffer(self.map, 12, self.entries*4)
        shaofs = 12 + self.entries*4
        nsha = self._fanget(self.entries-1)  # total number of shas
        self.shalist = buffer(self.map, shaofs, nsha*20)
        # Names of the .idx files this midx covers, NUL-separated trailer.
        self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
229 def _fanget(self, i):
231 s = self.fanout[start:start+4]
232 return struct.unpack('!I', s)[0]
235 return str(self.shalist[i*20:(i+1)*20])
237 def _num(self, hash):
238 return struct.unpack('!I', hash[:4])[0]
    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        # Interpolation search: the top self.bits bits of the sha pick a
        # fanout bucket, then the probe position is estimated linearly
        # between the bucket's numeric bounds.
        global _total_searches, _total_steps
        # NOTE(review): the setup of 'want', the search loop, and the
        # final comparisons are elided in this listing -- confirm upstream.
        el = extract_bits(want, self.bits)
        start = self._fanget(el-1)
        startv = el << (32-self.bits)
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1 # lookup table is a step
        hashv = self._num(hash)
        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
        #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
        mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
        #print '  %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
        #print '    %08x' % self._num(v)
        startv = self._num(v)
275 for i in xrange(self._fanget(self.entries-1)):
276 yield buffer(self.shalist, i*20, 20)
279 return int(self._fanget(self.entries-1))
    def __init__(self, dir):
        # NOTE(review): most of the constructor is elided in this listing;
        # _mpi_count is presumably a module global tracking live instances.
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it

    # NOTE(review): apparently the tail of __del__ -- header elided.
        assert(_mpi_count == 0)

    # NOTE(review): apparently the body of __iter__ -- header elided.
        return iter(idxmerge(self.packs))

    # NOTE(review): apparently the body of __len__ -- header elided.
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        # NOTE(review): the search-counter increment, the per-pack
        # membership test and the returns are elided in this listing.
        if hash in self.also:
        for i in range(len(self.packs)):
                _total_searches -= 1 # will be incremented by sub-pack
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        # NOTE(review): several lines of this method are elided in this
        # listing (midx loading, the midxl accumulation, pack loading).
        skip_midx = skip_midx or ignore_midx
        # Start from the packs we already have, keyed by file name; drop
        # midx entries entirely when midx handling is disabled.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
                # Also map every .idx name a known midx covers to that midx.
                for ix in self.packs:
                    if isinstance(ix, PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Load any .midx file on disk we don't already know about.
                for f in os.listdir(self.dir):
                    full = os.path.join(self.dir, f)
                    if f.endswith('.midx') and not d.get(full):
                        (mxd, mxf) = os.path.split(mx.name)
                        # Warn if a midx refers to an .idx that is gone.
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     '  used by %s\n') % (n, mxf))
                # Prefer bigger midx files (they cover more indexes).
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            for name in ix.idxnames:
                                d[os.path.join(self.dir, name)] = ix
                            log('midx: removing redundant: %s\n'
                                % os.path.basename(ix.name))
                # Finally pick up any plain .idx not covered by a midx.
                for f in os.listdir(self.dir):
                    full = os.path.join(self.dir, f)
                    if f.endswith('.idx') and not d.get(full):
        # Deduplicate (several names may map to the same midx object).
        self.packs = list(set(d.values()))
        log('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
    # NOTE(review): only the docstrings of two methods (apparently add()
    # and zap_also(), managing self.also) survive in this listing.
        """Insert an additional object in the list."""

        """Remove all additional objects from the list."""


def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes '<type> <size>\0' followed by the raw content.
    header = '%s %d\0' % (type, len(content))
    # NOTE(review): the actual hashing of header+content and the return
    # are elided in this listing.
396 def _shalist_sort_key(ent):
397 (mode, name, id) = ent
398 if stat.S_ISDIR(int(mode, 8)):
def idxmerge(idxlist):
    """Generate a list of all the objects reachable in a PackIdxList."""
    # k-way merge of the (sorted) per-index iterators via a heap.
    total = sum(len(i) for i in idxlist)
    iters = (iter(i) for i in idxlist)
    heap = [(next(it), it) for it in iters]
    # NOTE(review): the heapify call, the merge loop and the yield are
    # elided in this listing; only progress reporting survives.
    if (count % 10024) == 0:
        progress('Reading indexes: %.2f%% (%d/%d)\r'
                 % (count*100.0/total, count, total))
    heapq.heapreplace(heap, (e, it))
    log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
    """Writes Git objects inside a pack file."""

    def __init__(self, objcache_maker=None):
        # NOTE(review): initialization of the counters and file handle is
        # elided in this listing.
        # Callable used to build the object-existence cache lazily (see
        # _make_objcache); None selects the default PackIdxList.
        self.objcache_maker = objcache_maker
442 def _make_objcache(self):
443 if self.objcache == None:
444 if self.objcache_maker:
445 self.objcache = self.objcache_maker()
447 self.objcache = PackIdxList(repo('objects/pack'))
        # NOTE(review): apparently the body of _open() -- header and the
        # 'if not self.file' guard are elided in this listing.
        self._make_objcache()
        # Create the pack as a temp file next to the objects directory so
        # the final rename is atomic.
        (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
        self.file = os.fdopen(fd, 'w+b')
        assert(name.endswith('.pack'))
        self.filename = name[:-5]
        # Pack header: 'PACK', version 2, object count 0 (patched later).
        self.file.write('PACK\0\0\0\2\0\0\0\0')

    def _raw_write(self, datalist):
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        # NOTE(review): the actual file write is elided in this listing.
        self.outbytes += len(oneblob)

    def _write(self, bin, type, content):
        # NOTE(review): verbose logging / counter lines are elided here.
        self._raw_write(_encode_packobj(type, content))

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        # NOTE(review): the _end()/reopen sequence is elided in this listing.
        self.outbytes = self.count = 0
483 def write(self, type, content):
484 """Write an object in this pack file."""
485 return self._write(calc_hash(type, content), type, content)
487 def exists(self, id):
488 """Return non-empty if an object is found in the object cache."""
489 if not self.objcache:
490 self._make_objcache()
491 return self.objcache.exists(id)
493 def maybe_write(self, type, content):
494 """Write an object to the pack file if not present and return its id."""
495 bin = calc_hash(type, content)
496 if not self.exists(bin):
497 self._write(bin, type, content)
498 self.objcache.add(bin)
    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        # Blobs have no internal structure; store the raw bytes directly.
        return self.maybe_write('blob', blob)
    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        # Entries must be in git's tree order (see _shalist_sort_key).
        shalist = sorted(shalist, key = _shalist_sort_key)
        # NOTE(review): the accumulator initialization and some sanity
        # asserts are elided in this listing.
        for (mode,name,bin) in shalist:
            assert(mode[0] != '0')
            assert(len(bin) == 20)
            # Tree entry format: '<mode> <name>\0' + 20-byte binary sha.
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        # Assemble the commit object line by line; every field is optional.
        # NOTE(review): the accumulator initialization and the trailing
        # blank-line/message appends are elided in this listing.
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, msg):
        """Create a commit object in the pack."""
        # Current user acts as both author and committer.
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, now, userline, now,

    # NOTE(review): apparently abort() -- header and cleanup guards elided.
        """Remove the pack file from disk."""
        os.unlink(self.filename + '.pack')

    # NOTE(review): apparently the body of _end() -- header elided, as are
    # the flush/seek lines between the surviving statements below.
        if not f: return None

        # update object count
        cp = struct.pack('!i', self.count)

        # calculate the pack sha1sum
        f.write(sum.digest())

        # Let git build the .idx and give the pack its final hash name.
        p = subprocess.Popen(['git', 'index-pack', '-v',
                              self.filename + '.pack'],
                             preexec_fn = _gitenv,
                             stdout = subprocess.PIPE)
        out = p.stdout.read().strip()
        _git_wait('git index-pack', p)
            raise GitError('git index-pack produced no output')
        nameprefix = repo('objects/pack/%s' % out)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

    # NOTE(review): close() docstring; header and body elided.
        """Close the pack file and move it to its definitive path."""

# NOTE(review): apparently _git_date(date) -- formats a unix timestamp in
# git's 'seconds tz' form; header elided.
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))

# NOTE(review): apparently _gitenv() -- points git at the bup repository;
# header elided.
    os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    # NOTE(review): the optional refname argument handling is elided here.
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    # NOTE(review): the rv/out guards are elided in this listing.
    # show-ref prints '<sha> <refname>' per line; yield binary shas.
    for d in out.split('\n'):
        (sha, name) = d.split(' ', 1)
        yield (name, sha.decode('hex'))


def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    # NOTE(review): the selection of the first result / None fallback is
    # elided in this listing.


def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form

    If count is a non-zero integer, limit the number of commits to "count"
    """
    # Refuse refs that would be parsed as git options.
    assert(not ref.startswith('-'))
    # NOTE(review): opts initialization and the output-parsing loop are
    # partially elided in this listing.
        opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
    rv = p.wait() # not fatal
        raise GitError, 'git rev-list returned error %d' % rv
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref.

    Raises GitError if the ref has no commits.
    """
    # BUGFIX: the loop body ('return date') was missing in the reviewed
    # listing; rev_list(count=1) yields at most one (date, commit) pair.
    for (date, commit) in rev_list(ref, count=1):
        return date
    # Call form instead of the legacy 'raise X, y' statement (same
    # semantics, and also valid Python 3 syntax).
    raise GitError('no such commit %r' % ref)
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    # NOTE(review): the oldval normalization lines are elided here.
    # Only branch heads may be updated through this helper.
    assert(refname.startswith('refs/heads/'))
    # git update-ref checks the old value atomically before writing.
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)


def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    """
    # NOTE(review): the explicit-path branch and the repodir guard are
    # elided in this listing; fallback order is BUP_DIR, then ~/.bup.
        repodir = os.environ.get('BUP_DIR')
            repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the target path exists but is not a directory.
    """
    # NOTE(review): the two setup lines below were elided in the reviewed
    # listing and have been restored (set repodir, then resolve it).
    guess_repo(path)
    d = repo()
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # BUGFIX: the format character was '%d' (integer), which raises
        # TypeError when formatting the string path; '%s' is intended.
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    # NOTE(review): the guess_repo call and the error-exit paths are
    # elided in this listing.
    if not os.path.isdir(repo('objects/pack/.')):
        # The default repository can be created on demand; anything else
        # is a hard error.
        if repodir == home_repodir:
            log('error: %r is not a bup/git repository\n' % repo())
def treeparse(buf):
    """Generate a list of (mode, name, hash) tuples of objects from 'buf'.

    'buf' is the raw content of a git tree object: a sequence of
    '<mode> <name>\\0' headers each followed by a 20-byte binary sha.
    """
    # BUGFIX: in the reviewed listing 'ofs' was never initialized or
    # advanced, making the loop unusable; both lines are restored.
    ofs = 0
    while ofs < len(buf):
        z = buf[ofs:].find('\0')
        assert(z > 0)
        spl = buf[ofs:ofs+z].split(' ', 1)
        assert(len(spl) == 2)
        sha = buf[ofs+z+1:ofs+z+1+20]
        ofs += z+1+20
        yield (spl[0], spl[1], sha)
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:
    """
    # NOTE(review): the function header (and the _ver caching guard, if
    # any) are elided in this listing.
    p = subprocess.Popen(['git', '--version'],
                         stdout=subprocess.PIPE)
    gvs = p.stdout.read()
    _git_wait('git --version', p)
    # NOTE(review): the '.' in this pattern is unescaped, so it matches
    # any character -- probably harmless here, but confirm intent.
    m = re.match(r'git version (\S+.\S+)', gvs)
        raise GitError('git --version weird output: %r' % gvs)
    # NOTE(review): these are tuples of *strings*, so the comparison below
    # (elided) is lexicographic per component; e.g. '10' < '5'. Confirm
    # this is acceptable for the version range being checked.
    _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
758 def _git_wait(cmd, p):
761 raise GitError('%s returned %d' % (cmd, rv))
def _git_capture(argv):
    # Run a git command inside the repo and capture its stdout.
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    # NOTE(review): the stdout read and the return of the captured output
    # are elided in this listing.
    _git_wait(repr(argv), p)
class _AbortableIter:
    # Wraps an iterator and invokes 'onabort' when iteration is abandoned
    # early, so the underlying stream can be cleaned up.
    def __init__(self, it, onabort = None):
        # NOTE(review): storage of 'it' and other init lines are elided.
        self.onabort = onabort

    # NOTE(review): apparently the body of next() -- header, the
    # exception re-dispatch and the abort-on-error handling are elided.
        return self.it.next()
    except StopIteration, e:

        """Abort iteration and call the abortion callback, if needed."""
    """Link to 'git cat-file' that is used to retrieve blob data."""
    # NOTE(review): the class header and __init__ header are elided in
    # this listing. Old git versions lack 'cat-file --batch', so a slower
    # one-process-per-object path is selected instead.
    wanted = ('1','5','6')
        log('warning: git version < %s; bup will be slow.\n'
        self.get = self._slow_get

        self.p = self.inprogress = None
        self.get = self._fast_get

    # NOTE(review): apparently _abort()/cleanup -- headers elided.
        self.p.stdout.close()
        self.inprogress = None

    # NOTE(review): apparently _restart() -- spawns the long-lived
    # 'cat-file --batch' process used by _fast_get; header elided.
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  preexec_fn = _gitenv)
    def _fast_get(self, id):
        # Restart the batch process if it died (or was never started).
        if not self.p or self.p.poll() != None:
        assert(self.p.poll() == None)
        # Only one object may be streamed from the batch pipe at a time.
            log('_fast_get: opening %r while %r is open'
                % (id, self.inprogress))
        assert(not self.inprogress)
        # Newlines/CRs in the id would corrupt the batch protocol.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        # NOTE(review): the inprogress bookkeeping and the header split
        # into 'spl' are elided in this listing.
        self.p.stdin.write('%s\n' % id)
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            raise KeyError('blob %r is missing' % id)
        # Batch header is '<sha> <type> <size>'.
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        # Stream exactly 'size' bytes; abort cleanly if abandoned early.
        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        # The batch protocol terminates each object with a newline.
        assert(self.p.stdout.readline() == '\n')
        self.inprogress = None
    def _slow_get(self, id):
        # One 'git cat-file' process per object: correct but slow; used
        # when git is too old for 'cat-file --batch'.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        # NOTE(review): the type yield and validation lines are elided.
        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
        _git_wait('git cat-file', p)

    # NOTE(review): apparently _join(it) -- recursively expands trees and
    # commits into their constituent blobs; header and the blob/type
    # dispatch lines are elided in this listing.
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
        elif type == 'commit':
            # A commit's first line is 'tree <hex sha>'; descend into it.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
        raise GitError('invalid object type %r: expected blob/tree/commit'

        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        # NOTE(review): the join() header and its error handling are
        # elided in this listing.
        for d in self._join(self.get(id)):
        except StopIteration: