lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, zlib, time, subprocess, struct, stat, re, tempfile
   6 import heapq
   7 from bup.helpers import *
   8
   9 verbose = 0
  10 ignore_midx = 0
  11 home_repodir = os.path.expanduser('~/.bup')
  12 repodir = None
  13
  14 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  15 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  16
  17 _total_searches = 0
  18 _total_steps = 0
  19
  20
  21 class GitError(Exception):
  22     pass
  23
  24
  25 def repo(sub = ''):
  26     """Get the path to the git repository or one of its subdirectories."""
  27     global repodir
  28     if not repodir:
  29         raise GitError('You should call check_repo_or_die()')
  30
  31     # If there's a .git subdirectory, then the actual repo is in there.
  32     gd = os.path.join(repodir, '.git')
  33     if os.path.exists(gd):
  34         repodir = gd
  35
  36     return os.path.join(repodir, sub)
  37
  38
  39 def mangle_name(name, mode, gitmode):
  40     """Mangle a file name to present an abstract name for segmented files.
  41     Mangled file names will have the ".bup" extension added to them. If a
  42     file's name already ends with ".bup", a ".bupl" extension is added to
  43     disambiguate normal files from semgmented ones.
  44     """
  45     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
  46         return name + '.bup'
  47     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
  48         return name + '.bupl'
  49     else:
  50         return name
  51
  52
  53 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
  54 def demangle_name(name):
  55     """Remove name mangling from a file name, if necessary.
  56
  57     The return value is a tuple (demangled_filename,mode), where mode is one of
  58     the following:
  59
  60     * BUP_NORMAL  : files that should be read as-is from the repository
  61     * BUP_CHUNKED : files that were chunked and need to be assembled
  62
  63     For more information on the name mangling algorythm, see mangle_name()
  64     """
  65     if name.endswith('.bupl'):
  66         return (name[:-5], BUP_NORMAL)
  67     elif name.endswith('.bup'):
  68         return (name[:-4], BUP_CHUNKED)
  69     else:
  70         return (name, BUP_NORMAL)
  71
  72
  73 def _encode_packobj(type, content):
  74     szout = ''
  75     sz = len(content)
  76     szbits = (sz & 0x0f) | (_typemap[type]<<4)
  77     sz >>= 4
  78     while 1:
  79         if sz: szbits |= 0x80
  80         szout += chr(szbits)
  81         if not sz:
  82             break
  83         szbits = sz & 0x7f
  84         sz >>= 7
  85     z = zlib.compressobj(1)
  86     yield szout
  87     yield z.compress(content)
  88     yield z.flush()
  89
  90
  91 def _encode_looseobj(type, content):
  92     z = zlib.compressobj(1)
  93     yield z.compress('%s %d\0' % (type, len(content)))
  94     yield z.compress(content)
  95     yield z.flush()
  96
  97
  98 def _decode_looseobj(buf):
  99     assert(buf);
 100     s = zlib.decompress(buf)
 101     i = s.find('\0')
 102     assert(i > 0)
 103     l = s[:i].split(' ')
 104     type = l[0]
 105     sz = int(l[1])
 106     content = s[i+1:]
 107     assert(type in _typemap)
 108     assert(sz == len(content))
 109     return (type, content)
 110
 111
 112 def _decode_packobj(buf):
 113     assert(buf)
 114     c = ord(buf[0])
 115     type = _typermap[(c & 0x70) >> 4]
 116     sz = c & 0x0f
 117     shift = 4
 118     i = 0
 119     while c & 0x80:
 120         i += 1
 121         c = ord(buf[i])
 122         sz |= (c & 0x7f) << shift
 123         shift += 7
 124         if not (c & 0x80):
 125             break
 126     return (type, zlib.decompress(buf[i+1:]))
 127
 128
 129 class PackIdx:
 130     """Object representation of a Git pack index file."""
 131     def __init__(self, filename):
 132         self.name = filename
 133         self.map = mmap_read(open(filename))
 134         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 135         self.fanout = list(struct.unpack('!256I',
 136                                          str(buffer(self.map, 8, 256*4))))
 137         self.fanout.append(0)  # entry "-1"
 138         nsha = self.fanout[255]
 139         self.ofstable = buffer(self.map,
 140                                8 + 256*4 + nsha*20 + nsha*4,
 141                                nsha*4)
 142         self.ofs64table = buffer(self.map,
 143                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 144
 145     def _ofs_from_idx(self, idx):
 146         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 147         if ofs & 0x80000000:
 148             idx64 = ofs & 0x7fffffff
 149             ofs = struct.unpack('!I',
 150                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 151         return ofs
 152
 153     def _idx_from_hash(self, hash):
 154         global _total_searches, _total_steps
 155         _total_searches += 1
 156         assert(len(hash) == 20)
 157         b1 = ord(hash[0])
 158         start = self.fanout[b1-1] # range -1..254
 159         end = self.fanout[b1] # range 0..255
 160         buf = buffer(self.map, 8 + 256*4, end*20)
 161         want = str(hash)
 162         _total_steps += 1  # lookup table is a step
 163         while start < end:
 164             _total_steps += 1
 165             mid = start + (end-start)/2
 166             v = str(buf[mid*20:(mid+1)*20])
 167             if v < want:
 168                 start = mid+1
 169             elif v > want:
 170                 end = mid
 171             else: # got it!
 172                 return mid
 173         return None
 174
 175     def find_offset(self, hash):
 176         """Get the offset of an object inside the index file."""
 177         idx = self._idx_from_hash(hash)
 178         if idx != None:
 179             return self._ofs_from_idx(idx)
 180         return None
 181
 182     def exists(self, hash):
 183         """Return nonempty if the object exists in this index."""
 184         return hash and (self._idx_from_hash(hash) != None) and True or None
 185
 186     def __iter__(self):
 187         for i in xrange(self.fanout[255]):
 188             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 189
 190     def __len__(self):
 191         return int(self.fanout[255])
 192
 193
 194 def extract_bits(buf, nbits):
 195     """Take the first 'nbits' bits from 'buf' and return them as an integer."""
 196     mask = (1<<nbits) - 1
 197     v = struct.unpack('!I', buf[0:4])[0]
 198     v = (v >> (32-nbits)) & mask
 199     return v
 200
 201
 202 class PackMidx:
 203     """Wrapper which contains data from multiple index files.
 204     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 205     and make it possible for bup to expand Git's indexing capabilities to vast
 206     amounts of files.
 207     """
 208     def __init__(self, filename):
 209         self.name = filename
 210         assert(filename.endswith('.midx'))
 211         self.map = mmap_read(open(filename))
 212         if str(self.map[0:8]) == 'MIDX\0\0\0\1':
 213             log('Warning: ignoring old-style midx %r\n' % filename)
 214             self.bits = 0
 215             self.entries = 1
 216             self.fanout = buffer('\0\0\0\0')
 217             self.shalist = buffer('\0'*20)
 218             self.idxnames = []
 219         else:
 220             assert(str(self.map[0:8]) == 'MIDX\0\0\0\2')
 221             self.bits = struct.unpack('!I', self.map[8:12])[0]
 222             self.entries = 2**self.bits
 223             self.fanout = buffer(self.map, 12, self.entries*4)
 224             shaofs = 12 + self.entries*4
 225             nsha = self._fanget(self.entries-1)
 226             self.shalist = buffer(self.map, shaofs, nsha*20)
 227             self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
 228
 229     def _fanget(self, i):
 230         start = i*4
 231         s = self.fanout[start:start+4]
 232         return struct.unpack('!I', s)[0]
 233
 234     def exists(self, hash):
 235         """Return nonempty if the object exists in the index files."""
 236         global _total_searches, _total_steps
 237         _total_searches += 1
 238         want = str(hash)
 239         el = extract_bits(want, self.bits)
 240         if el:
 241             start = self._fanget(el-1)
 242         else:
 243             start = 0
 244         end = self._fanget(el)
 245         _total_steps += 1   # lookup table is a step
 246         while start < end:
 247             _total_steps += 1
 248             mid = start + (end-start)/2
 249             v = str(self.shalist[mid*20:(mid+1)*20])
 250             if v < want:
 251                 start = mid+1
 252             elif v > want:
 253                 end = mid
 254             else: # got it!
 255                 return True
 256         return None
 257
 258     def __iter__(self):
 259         for i in xrange(self._fanget(self.entries-1)):
 260             yield buffer(self.shalist, i*20, 20)
 261
 262     def __len__(self):
 263         return int(self._fanget(self.entries-1))
 264
 265
 266 _mpi_count = 0
 267 class PackIdxList:
 268     def __init__(self, dir):
 269         global _mpi_count
 270         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 271         _mpi_count += 1
 272         self.dir = dir
 273         self.also = {}
 274         self.packs = []
 275         self.refresh()
 276
 277     def __del__(self):
 278         global _mpi_count
 279         _mpi_count -= 1
 280         assert(_mpi_count == 0)
 281
 282     def __iter__(self):
 283         return iter(idxmerge(self.packs))
 284
 285     def __len__(self):
 286         return sum(len(pack) for pack in self.packs)
 287
 288     def exists(self, hash):
 289         """Return nonempty if the object exists in the index files."""
 290         global _total_searches
 291         _total_searches += 1
 292         if hash in self.also:
 293             return True
 294         for i in range(len(self.packs)):
 295             p = self.packs[i]
 296             _total_searches -= 1  # will be incremented by sub-pack
 297             if p.exists(hash):
 298                 # reorder so most recently used packs are searched first
 299                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 300                 return p.name
 301         return None
 302
 303     def refresh(self, skip_midx = False):
 304         """Refresh the index list.
 305         This method verifies if .midx files were superseded (e.g. all of its
 306         contents are in another, bigger .midx file) and removes the superseded
 307         files.
 308
 309         If skip_midx is True, all work on .midx files will be skipped and .midx
 310         files will be removed from the list.
 311
 312         The module-global variable 'ignore_midx' can force this function to
 313         always act as if skip_midx was True.
 314         """
 315         skip_midx = skip_midx or ignore_midx
 316         d = dict((p.name, p) for p in self.packs
 317                  if not skip_midx or not isinstance(p, PackMidx))
 318         if os.path.exists(self.dir):
 319             if not skip_midx:
 320                 midxl = []
 321                 for ix in self.packs:
 322                     if isinstance(ix, PackMidx):
 323                         for name in ix.idxnames:
 324                             d[os.path.join(self.dir, name)] = ix
 325                 for f in os.listdir(self.dir):
 326                     full = os.path.join(self.dir, f)
 327                     if f.endswith('.midx') and not d.get(full):
 328                         mx = PackMidx(full)
 329                         (mxd, mxf) = os.path.split(mx.name)
 330                         broken = 0
 331                         for n in mx.idxnames:
 332                             if not os.path.exists(os.path.join(mxd, n)):
 333                                 log(('warning: index %s missing\n' +
 334                                     '  used by %s\n') % (n, mxf))
 335                                 broken += 1
 336                         if not broken:
 337                             midxl.append(mx)
 338                 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
 339                 for ix in midxl:
 340                     any = 0
 341                     for sub in ix.idxnames:
 342                         found = d.get(os.path.join(self.dir, sub))
 343                         if not found or isinstance(found, PackIdx):
 344                             # doesn't exist, or exists but not in a midx
 345                             d[ix.name] = ix
 346                             for name in ix.idxnames:
 347                                 d[os.path.join(self.dir, name)] = ix
 348                             any += 1
 349                             break
 350                     if not any:
 351                         log('midx: removing redundant: %s\n'
 352                             % os.path.basename(ix.name))
 353                         unlink(ix.name)
 354             for f in os.listdir(self.dir):
 355                 full = os.path.join(self.dir, f)
 356                 if f.endswith('.idx') and not d.get(full):
 357                     ix = PackIdx(full)
 358                     d[full] = ix
 359             self.packs = list(set(d.values()))
 360         log('PackIdxList: using %d index%s.\n'
 361             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 362
 363     def add(self, hash):
 364         """Insert an additional object in the list."""
 365         self.also[hash] = 1
 366
 367     def zap_also(self):
 368         """Remove all additional objects from the list."""
 369         self.also = {}
 370
 371
 372 def calc_hash(type, content):
 373     """Calculate some content's hash in the Git fashion."""
 374     header = '%s %d\0' % (type, len(content))
 375     sum = Sha1(header)
 376     sum.update(content)
 377     return sum.digest()
 378
 379
 380 def _shalist_sort_key(ent):
 381     (mode, name, id) = ent
 382     if stat.S_ISDIR(int(mode, 8)):
 383         return name + '/'
 384     else:
 385         return name
 386
 387
 388 def idxmerge(idxlist):
 389     """Generate a list of all the objects reachable in a PackIdxList."""
 390     total = sum(len(i) for i in idxlist)
 391     iters = (iter(i) for i in idxlist)
 392     heap = [(next(it), it) for it in iters]
 393     heapq.heapify(heap)
 394     count = 0
 395     last = None
 396     while heap:
 397         if (count % 10024) == 0:
 398             progress('Reading indexes: %.2f%% (%d/%d)\r'
 399                      % (count*100.0/total, count, total))
 400         (e, it) = heap[0]
 401         if e != last:
 402             yield e
 403             last = e
 404         count += 1
 405         e = next(it)
 406         if e:
 407             heapq.heapreplace(heap, (e, it))
 408         else:
 409             heapq.heappop(heap)
 410     log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
 411
 412
 413 class PackWriter:
 414     """Writes Git objects insid a pack file."""
 415     def __init__(self, objcache_maker=None):
 416         self.count = 0
 417         self.outbytes = 0
 418         self.filename = None
 419         self.file = None
 420         self.objcache_maker = objcache_maker
 421         self.objcache = None
 422
 423     def __del__(self):
 424         self.close()
 425
 426     def _make_objcache(self):
 427         if self.objcache == None:
 428             if self.objcache_maker:
 429                 self.objcache = self.objcache_maker()
 430             else:
 431                 self.objcache = PackIdxList(repo('objects/pack'))
 432
 433     def _open(self):
 434         if not self.file:
 435             self._make_objcache()
 436             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 437             self.file = os.fdopen(fd, 'w+b')
 438             assert(name.endswith('.pack'))
 439             self.filename = name[:-5]
 440             self.file.write('PACK\0\0\0\2\0\0\0\0')
 441
 442     def _raw_write(self, datalist):
 443         self._open()
 444         f = self.file
 445         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 446         # the file never has a *partial* blob.  So let's make sure it's
 447         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 448         # to our hashsplit algorithm.)  f.write() does its own buffering,
 449         # but that's okay because we'll flush it in _end().
 450         oneblob = ''.join(datalist)
 451         f.write(oneblob)
 452         self.outbytes += len(oneblob)
 453         self.count += 1
 454
 455     def _write(self, bin, type, content):
 456         if verbose:
 457             log('>')
 458         self._raw_write(_encode_packobj(type, content))
 459         return bin
 460
 461     def breakpoint(self):
 462         """Clear byte and object counts and return the last processed id."""
 463         id = self._end()
 464         self.outbytes = self.count = 0
 465         return id
 466
 467     def write(self, type, content):
 468         """Write an object in this pack file."""
 469         return self._write(calc_hash(type, content), type, content)
 470
 471     def exists(self, id):
 472         """Return non-empty if an object is found in the object cache."""
 473         if not self.objcache:
 474             self._make_objcache()
 475         return self.objcache.exists(id)
 476
 477     def maybe_write(self, type, content):
 478         """Write an object to the pack file if not present and return its id."""
 479         bin = calc_hash(type, content)
 480         if not self.exists(bin):
 481             self._write(bin, type, content)
 482             self.objcache.add(bin)
 483         return bin
 484
 485     def new_blob(self, blob):
 486         """Create a blob object in the pack with the supplied content."""
 487         return self.maybe_write('blob', blob)
 488
 489     def new_tree(self, shalist):
 490         """Create a tree object in the pack."""
 491         shalist = sorted(shalist, key = _shalist_sort_key)
 492         l = []
 493         for (mode,name,bin) in shalist:
 494             assert(mode)
 495             assert(mode != '0')
 496             assert(mode[0] != '0')
 497             assert(name)
 498             assert(len(bin) == 20)
 499             l.append('%s %s\0%s' % (mode,name,bin))
 500         return self.maybe_write('tree', ''.join(l))
 501
 502     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 503         l = []
 504         if tree: l.append('tree %s' % tree.encode('hex'))
 505         if parent: l.append('parent %s' % parent.encode('hex'))
 506         if author: l.append('author %s %s' % (author, _git_date(adate)))
 507         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 508         l.append('')
 509         l.append(msg)
 510         return self.maybe_write('commit', '\n'.join(l))
 511
 512     def new_commit(self, parent, tree, msg):
 513         """Create a commit object in the pack."""
 514         now = time.time()
 515         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 516         commit = self._new_commit(tree, parent,
 517                                   userline, now, userline, now,
 518                                   msg)
 519         return commit
 520
 521     def abort(self):
 522         """Remove the pack file from disk."""
 523         f = self.file
 524         if f:
 525             self.file = None
 526             f.close()
 527             os.unlink(self.filename + '.pack')
 528
 529     def _end(self):
 530         f = self.file
 531         if not f: return None
 532         self.file = None
 533         self.objcache = None
 534
 535         # update object count
 536         f.seek(8)
 537         cp = struct.pack('!i', self.count)
 538         assert(len(cp) == 4)
 539         f.write(cp)
 540
 541         # calculate the pack sha1sum
 542         f.seek(0)
 543         sum = Sha1()
 544         while 1:
 545             b = f.read(65536)
 546             sum.update(b)
 547             if not b: break
 548         f.write(sum.digest())
 549
 550         f.close()
 551
 552         p = subprocess.Popen(['git', 'index-pack', '-v',
 553                               '--index-version=2',
 554                               self.filename + '.pack'],
 555                              preexec_fn = _gitenv,
 556                              stdout = subprocess.PIPE)
 557         out = p.stdout.read().strip()
 558         _git_wait('git index-pack', p)
 559         if not out:
 560             raise GitError('git index-pack produced no output')
 561         nameprefix = repo('objects/pack/%s' % out)
 562         if os.path.exists(self.filename + '.map'):
 563             os.unlink(self.filename + '.map')
 564         os.rename(self.filename + '.pack', nameprefix + '.pack')
 565         os.rename(self.filename + '.idx', nameprefix + '.idx')
 566         return nameprefix
 567
 568     def close(self):
 569         """Close the pack file and move it to its definitive path."""
 570         return self._end()
 571
 572
 573 def _git_date(date):
 574     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 575
 576
 577 def _gitenv():
 578     os.environ['GIT_DIR'] = os.path.abspath(repo())
 579
 580
 581 def list_refs(refname = None):
 582     """Generate a list of tuples in the form (refname,hash).
 583     If a ref name is specified, list only this particular ref.
 584     """
 585     argv = ['git', 'show-ref', '--']
 586     if refname:
 587         argv += [refname]
 588     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 589     out = p.stdout.read().strip()
 590     rv = p.wait()  # not fatal
 591     if rv:
 592         assert(not out)
 593     if out:
 594         for d in out.split('\n'):
 595             (sha, name) = d.split(' ', 1)
 596             yield (name, sha.decode('hex'))
 597
 598
 599 def read_ref(refname):
 600     """Get the commit id of the most recent commit made on a given ref."""
 601     l = list(list_refs(refname))
 602     if l:
 603         assert(len(l) == 1)
 604         return l[0][1]
 605     else:
 606         return None
 607
 608
 609 def rev_list(ref, count=None):
 610     """Generate a list of reachable commits in reverse chronological order.
 611
 612     This generator walks through commits, from child to parent, that are
 613     reachable via the specified ref and yields a series of tuples of the form
 614     (date,hash).
 615
 616     If count is a non-zero integer, limit the number of commits to "count"
 617     objects.
 618     """
 619     assert(not ref.startswith('-'))
 620     opts = []
 621     if count:
 622         opts += ['-n', str(atoi(count))]
 623     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 624     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 625     commit = None
 626     for row in p.stdout:
 627         s = row.strip()
 628         if s.startswith('commit '):
 629             commit = s[7:].decode('hex')
 630         else:
 631             date = int(s)
 632             yield (date, commit)
 633     rv = p.wait()  # not fatal
 634     if rv:
 635         raise GitError, 'git rev-list returned error %d' % rv
 636
 637
 638 def rev_get_date(ref):
 639     """Get the date of the latest commit on the specified ref."""
 640     for (date, commit) in rev_list(ref, count=1):
 641         return date
 642     raise GitError, 'no such commit %r' % ref
 643
 644
 645 def update_ref(refname, newval, oldval):
 646     """Change the commit pointed to by a branch."""
 647     if not oldval:
 648         oldval = ''
 649     assert(refname.startswith('refs/heads/'))
 650     p = subprocess.Popen(['git', 'update-ref', refname,
 651                           newval.encode('hex'), oldval.encode('hex')],
 652                          preexec_fn = _gitenv)
 653     _git_wait('git update-ref', p)
 654
 655
 656 def guess_repo(path=None):
 657     """Set the path value in the global variable "repodir".
 658     This makes bup look for an existing bup repository, but not fail if a
 659     repository doesn't exist. Usually, if you are interacting with a bup
 660     repository, you would not be calling this function but using
 661     check_repo_or_die().
 662     """
 663     global repodir
 664     if path:
 665         repodir = path
 666     if not repodir:
 667         repodir = os.environ.get('BUP_DIR')
 668         if not repodir:
 669             repodir = os.path.expanduser('~/.bup')
 670
 671
 672 def init_repo(path=None):
 673     """Create the Git bare repository for bup in a given path."""
 674     guess_repo(path)
 675     d = repo()
 676     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 677         raise GitError('"%d" exists but is not a directory\n' % d)
 678     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 679                          preexec_fn = _gitenv)
 680     _git_wait('git init', p)
 681     # Force the index version configuration in order to ensure bup works
 682     # regardless of the version of the installed Git binary.
 683     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 684                          stdout=sys.stderr, preexec_fn = _gitenv)
 685     _git_wait('git config', p)
 686
 687
 688 def check_repo_or_die(path=None):
 689     """Make sure a bup repository exists, and abort if not.
 690     If the path to a particular repository was not specified, this function
 691     initializes the default repository automatically.
 692     """
 693     guess_repo(path)
 694     if not os.path.isdir(repo('objects/pack/.')):
 695         if repodir == home_repodir:
 696             init_repo()
 697         else:
 698             log('error: %r is not a bup/git repository\n' % repo())
 699             sys.exit(15)
 700
 701
 702 def treeparse(buf):
 703     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
 704     ofs = 0
 705     while ofs < len(buf):
 706         z = buf[ofs:].find('\0')
 707         assert(z > 0)
 708         spl = buf[ofs:ofs+z].split(' ', 1)
 709         assert(len(spl) == 2)
 710         sha = buf[ofs+z+1:ofs+z+1+20]
 711         ofs += z+1+20
 712         yield (spl[0], spl[1], sha)
 713
 714
 715 _ver = None
 716 def ver():
 717     """Get Git's version and ensure a usable version is installed.
 718
 719     The returned version is formatted as an ordered tuple with each position
 720     representing a digit in the version tag. For example, the following tuple
 721     would represent version 1.6.6.9:
 722
 723         ('1', '6', '6', '9')
 724     """
 725     global _ver
 726     if not _ver:
 727         p = subprocess.Popen(['git', '--version'],
 728                              stdout=subprocess.PIPE)
 729         gvs = p.stdout.read()
 730         _git_wait('git --version', p)
 731         m = re.match(r'git version (\S+.\S+)', gvs)
 732         if not m:
 733             raise GitError('git --version weird output: %r' % gvs)
 734         _ver = tuple(m.group(1).split('.'))
 735     needed = ('1','5', '3', '1')
 736     if _ver < needed:
 737         raise GitError('git version %s or higher is required; you have %s'
 738                        % ('.'.join(needed), '.'.join(_ver)))
 739     return _ver
 740
 741
 742 def _git_wait(cmd, p):
 743     rv = p.wait()
 744     if rv != 0:
 745         raise GitError('%s returned %d' % (cmd, rv))
 746
 747
 748 def _git_capture(argv):
 749     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 750     r = p.stdout.read()
 751     _git_wait(repr(argv), p)
 752     return r
 753
 754
 755 class _AbortableIter:
 756     def __init__(self, it, onabort = None):
 757         self.it = it
 758         self.onabort = onabort
 759         self.done = None
 760
 761     def __iter__(self):
 762         return self
 763
 764     def next(self):
 765         try:
 766             return self.it.next()
 767         except StopIteration, e:
 768             self.done = True
 769             raise
 770         except:
 771             self.abort()
 772             raise
 773
 774     def abort(self):
 775         """Abort iteration and call the abortion callback, if needed."""
 776         if not self.done:
 777             self.done = True
 778             if self.onabort:
 779                 self.onabort()
 780
 781     def __del__(self):
 782         self.abort()
 783
 784
 785 _ver_warned = 0
 786 class CatPipe:
 787     """Link to 'git cat-file' that is used to retrieve blob data."""
 788     def __init__(self):
 789         global _ver_warned
 790         wanted = ('1','5','6')
 791         if ver() < wanted:
 792             if not _ver_warned:
 793                 log('warning: git version < %s; bup will be slow.\n'
 794                     % '.'.join(wanted))
 795                 _ver_warned = 1
 796             self.get = self._slow_get
 797         else:
 798             self.p = self.inprogress = None
 799             self.get = self._fast_get
 800
 801     def _abort(self):
 802         if self.p:
 803             self.p.stdout.close()
 804             self.p.stdin.close()
 805         self.p = None
 806         self.inprogress = None
 807
 808     def _restart(self):
 809         self._abort()
 810         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
 811                                   stdin=subprocess.PIPE,
 812                                   stdout=subprocess.PIPE,
 813                                   close_fds = True,
 814                                   preexec_fn = _gitenv)
 815
 816     def _fast_get(self, id):
 817         if not self.p or self.p.poll() != None:
 818             self._restart()
 819         assert(self.p)
 820         assert(self.p.poll() == None)
 821         if self.inprogress:
 822             log('_fast_get: opening %r while %r is open'
 823                 % (id, self.inprogress))
 824         assert(not self.inprogress)
 825         assert(id.find('\n') < 0)
 826         assert(id.find('\r') < 0)
 827         assert(id[0] != '-')
 828         self.inprogress = id
 829         self.p.stdin.write('%s\n' % id)
 830         hdr = self.p.stdout.readline()
 831         if hdr.endswith(' missing\n'):
 832             raise KeyError('blob %r is missing' % id)
 833         spl = hdr.split(' ')
 834         if len(spl) != 3 or len(spl[0]) != 40:
 835             raise GitError('expected blob, got %r' % spl)
 836         (hex, type, size) = spl
 837
 838         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
 839                            onabort = self._abort)
 840         try:
 841             yield type
 842             for blob in it:
 843                 yield blob
 844             assert(self.p.stdout.readline() == '\n')
 845             self.inprogress = None
 846         except Exception, e:
 847             it.abort()
 848             raise
 849
 850     def _slow_get(self, id):
 851         assert(id.find('\n') < 0)
 852         assert(id.find('\r') < 0)
 853         assert(id[0] != '-')
 854         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
 855         yield type
 856
 857         p = subprocess.Popen(['git', 'cat-file', type, id],
 858                              stdout=subprocess.PIPE,
 859                              preexec_fn = _gitenv)
 860         for blob in chunkyreader(p.stdout):
 861             yield blob
 862         _git_wait('git cat-file', p)
 863
 864     def _join(self, it):
 865         type = it.next()
 866         if type == 'blob':
 867             for blob in it:
 868                 yield blob
 869         elif type == 'tree':
 870             treefile = ''.join(it)
 871             for (mode, name, sha) in treeparse(treefile):
 872                 for blob in self.join(sha.encode('hex')):
 873                     yield blob
 874         elif type == 'commit':
 875             treeline = ''.join(it).split('\n')[0]
 876             assert(treeline.startswith('tree '))
 877             for blob in self.join(treeline[5:]):
 878                 yield blob
 879         else:
 880             raise GitError('invalid object type %r: expected blob/tree/commit'
 881                            % type)
 882
 883     def join(self, id):
 884         """Generate a list of the content of all blobs that can be reached
 885         from an object.  The hash given in 'id' must point to a blob, a tree
 886         or a commit. The content of all blobs that can be seen from trees or
 887         commits will be added to the list.
 888         """
 889         try:
 890             for d in self._join(self.get(id)):
 891                 yield d
 892         except StopIteration:
 893             log('booger!\n')