lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, zlib, time, subprocess, struct, stat, re, tempfile, heapq
   6 from bup.helpers import *
   7 from bup import _helpers
   8
# The only .midx format version PackMidx accepts; files carrying an older or
# newer version number are ignored with a warning.
MIDX_VERSION = 2

# When true, PackWriter._write() logs a '>' for every object written.
verbose = 0
# When true, PackIdxList.refresh() always behaves as if skip_midx were True.
ignore_midx = 0
# Repository that check_repo_or_die() initializes automatically when missing.
home_repodir = os.path.expanduser('~/.bup')
# Path of the repository in use; set by guess_repo() and adjusted by repo().
repodir = None

# Object type name <-> numeric type code used in pack object headers.
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Statistics counters bumped by the index lookup methods.
_total_searches = 0
_total_steps = 0
  21
  22
  23 class GitError(Exception):
  24     pass
  25
  26
  27 def repo(sub = ''):
  28     """Get the path to the git repository or one of its subdirectories."""
  29     global repodir
  30     if not repodir:
  31         raise GitError('You should call check_repo_or_die()')
  32
  33     # If there's a .git subdirectory, then the actual repo is in there.
  34     gd = os.path.join(repodir, '.git')
  35     if os.path.exists(gd):
  36         repodir = gd
  37
  38     return os.path.join(repodir, sub)
  39
  40
  41 def auto_midx(objdir):
  42     main_exe = os.environ.get('BUP_MAIN_EXE') or sys.argv[0]
  43     args = [main_exe, 'midx', '--auto', '--dir', objdir]
  44     rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  45     if rv:
  46         add_error('%r: returned %d' % (args, rv))
  47
  48
  49 def mangle_name(name, mode, gitmode):
  50     """Mangle a file name to present an abstract name for segmented files.
  51     Mangled file names will have the ".bup" extension added to them. If a
  52     file's name already ends with ".bup", a ".bupl" extension is added to
  53     disambiguate normal files from semgmented ones.
  54     """
  55     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
  56         return name + '.bup'
  57     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
  58         return name + '.bupl'
  59     else:
  60         return name
  61
  62
  63 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
  64 def demangle_name(name):
  65     """Remove name mangling from a file name, if necessary.
  66
  67     The return value is a tuple (demangled_filename,mode), where mode is one of
  68     the following:
  69
  70     * BUP_NORMAL  : files that should be read as-is from the repository
  71     * BUP_CHUNKED : files that were chunked and need to be assembled
  72
  73     For more information on the name mangling algorythm, see mangle_name()
  74     """
  75     if name.endswith('.bupl'):
  76         return (name[:-5], BUP_NORMAL)
  77     elif name.endswith('.bup'):
  78         return (name[:-4], BUP_CHUNKED)
  79     else:
  80         return (name, BUP_NORMAL)
  81
  82
  83 def _encode_packobj(type, content):
  84     szout = ''
  85     sz = len(content)
  86     szbits = (sz & 0x0f) | (_typemap[type]<<4)
  87     sz >>= 4
  88     while 1:
  89         if sz: szbits |= 0x80
  90         szout += chr(szbits)
  91         if not sz:
  92             break
  93         szbits = sz & 0x7f
  94         sz >>= 7
  95     z = zlib.compressobj(1)
  96     yield szout
  97     yield z.compress(content)
  98     yield z.flush()
  99
 100
 101 def _encode_looseobj(type, content):
 102     z = zlib.compressobj(1)
 103     yield z.compress('%s %d\0' % (type, len(content)))
 104     yield z.compress(content)
 105     yield z.flush()
 106
 107
 108 def _decode_looseobj(buf):
 109     assert(buf);
 110     s = zlib.decompress(buf)
 111     i = s.find('\0')
 112     assert(i > 0)
 113     l = s[:i].split(' ')
 114     type = l[0]
 115     sz = int(l[1])
 116     content = s[i+1:]
 117     assert(type in _typemap)
 118     assert(sz == len(content))
 119     return (type, content)
 120
 121
 122 def _decode_packobj(buf):
 123     assert(buf)
 124     c = ord(buf[0])
 125     type = _typermap[(c & 0x70) >> 4]
 126     sz = c & 0x0f
 127     shift = 4
 128     i = 0
 129     while c & 0x80:
 130         i += 1
 131         c = ord(buf[i])
 132         sz |= (c & 0x7f) << shift
 133         shift += 7
 134         if not (c & 0x80):
 135             break
 136     return (type, zlib.decompress(buf[i+1:]))
 137
 138
 139 class PackIdx:
 140     def __init__(self):
 141         assert(0)
 142
 143     def find_offset(self, hash):
 144         """Get the offset of an object inside the index file."""
 145         idx = self._idx_from_hash(hash)
 146         if idx != None:
 147             return self._ofs_from_idx(idx)
 148         return None
 149
 150     def exists(self, hash):
 151         """Return nonempty if the object exists in this index."""
 152         return hash and (self._idx_from_hash(hash) != None) and True or None
 153
 154     def __len__(self):
 155         return int(self.fanout[255])
 156
 157     def _idx_from_hash(self, hash):
 158         global _total_searches, _total_steps
 159         _total_searches += 1
 160         assert(len(hash) == 20)
 161         b1 = ord(hash[0])
 162         start = self.fanout[b1-1] # range -1..254
 163         end = self.fanout[b1] # range 0..255
 164         want = str(hash)
 165         _total_steps += 1  # lookup table is a step
 166         while start < end:
 167             _total_steps += 1
 168             mid = start + (end-start)/2
 169             v = self._idx_to_hash(mid)
 170             if v < want:
 171                 start = mid+1
 172             elif v > want:
 173                 end = mid
 174             else: # got it!
 175                 return mid
 176         return None
 177
 178
 179 class PackIdxV1(PackIdx):
 180     """Object representation of a Git pack index (version 1) file."""
 181     def __init__(self, filename, f):
 182         self.name = filename
 183         self.idxnames = [self.name]
 184         self.map = mmap_read(f)
 185         self.fanout = list(struct.unpack('!256I',
 186                                          str(buffer(self.map, 0, 256*4))))
 187         self.fanout.append(0)  # entry "-1"
 188         nsha = self.fanout[255]
 189         self.shatable = buffer(self.map, 256*4, nsha*24)
 190
 191     def _ofs_from_idx(self, idx):
 192         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 193
 194     def _idx_to_hash(self, idx):
 195         return str(self.shatable[idx*24+4 : idx*24+24])
 196
 197     def __iter__(self):
 198         for i in xrange(self.fanout[255]):
 199             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 200
 201
 202 class PackIdxV2(PackIdx):
 203     """Object representation of a Git pack index (version 2) file."""
 204     def __init__(self, filename, f):
 205         self.name = filename
 206         self.idxnames = [self.name]
 207         self.map = mmap_read(f)
 208         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 209         self.fanout = list(struct.unpack('!256I',
 210                                          str(buffer(self.map, 8, 256*4))))
 211         self.fanout.append(0)  # entry "-1"
 212         nsha = self.fanout[255]
 213         self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
 214         self.ofstable = buffer(self.map,
 215                                8 + 256*4 + nsha*20 + nsha*4,
 216                                nsha*4)
 217         self.ofs64table = buffer(self.map,
 218                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 219
 220     def _ofs_from_idx(self, idx):
 221         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 222         if ofs & 0x80000000:
 223             idx64 = ofs & 0x7fffffff
 224             ofs = struct.unpack('!Q',
 225                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 226         return ofs
 227
 228     def _idx_to_hash(self, idx):
 229         return str(self.shatable[idx*20:(idx+1)*20])
 230
 231     def __iter__(self):
 232         for i in xrange(self.fanout[255]):
 233             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 234
 235
# Module-level alias for _helpers.extract_bits; PackMidx.exists() uses it to
# pick a fanout slot from a hash.
extract_bits = _helpers.extract_bits
 237
 238
 239 class PackMidx:
 240     """Wrapper which contains data from multiple index files.
 241     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 242     and make it possible for bup to expand Git's indexing capabilities to vast
 243     amounts of files.
 244     """
 245     def __init__(self, filename):
 246         self.name = filename
 247         self.force_keep = False
 248         assert(filename.endswith('.midx'))
 249         self.map = mmap_read(open(filename))
 250         if str(self.map[0:4]) != 'MIDX':
 251             log('Warning: skipping: invalid MIDX header in %r\n' % filename)
 252             self.force_keep = True
 253             return self._init_failed()
 254         ver = struct.unpack('!I', self.map[4:8])[0]
 255         if ver < MIDX_VERSION:
 256             log('Warning: ignoring old-style (v%d) midx %r\n'
 257                 % (ver, filename))
 258             self.force_keep = False  # old stuff is boring
 259             return self._init_failed()
 260         if ver > MIDX_VERSION:
 261             log('Warning: ignoring too-new (v%d) midx %r\n'
 262                 % (ver, filename))
 263             self.force_keep = True  # new stuff is exciting
 264             return self._init_failed()
 265
 266         self.bits = _helpers.firstword(self.map[8:12])
 267         self.entries = 2**self.bits
 268         self.fanout = buffer(self.map, 12, self.entries*4)
 269         shaofs = 12 + self.entries*4
 270         nsha = self._fanget(self.entries-1)
 271         self.shalist = buffer(self.map, shaofs, nsha*20)
 272         self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
 273
 274     def _init_failed(self):
 275         self.bits = 0
 276         self.entries = 1
 277         self.fanout = buffer('\0\0\0\0')
 278         self.shalist = buffer('\0'*20)
 279         self.idxnames = []
 280
 281     def _fanget(self, i):
 282         start = i*4
 283         s = self.fanout[start:start+4]
 284         return _helpers.firstword(s)
 285
 286     def _get(self, i):
 287         return str(self.shalist[i*20:(i+1)*20])
 288
 289     def exists(self, hash):
 290         """Return nonempty if the object exists in the index files."""
 291         global _total_searches, _total_steps
 292         _total_searches += 1
 293         want = str(hash)
 294         el = extract_bits(want, self.bits)
 295         if el:
 296             start = self._fanget(el-1)
 297             startv = el << (32-self.bits)
 298         else:
 299             start = 0
 300             startv = 0
 301         end = self._fanget(el)
 302         endv = (el+1) << (32-self.bits)
 303         _total_steps += 1   # lookup table is a step
 304         hashv = _helpers.firstword(hash)
 305         #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
 306         while start < end:
 307             _total_steps += 1
 308             #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
 309             mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
 310             #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
 311             v = self._get(mid)
 312             #print '    %08x' % self._num(v)
 313             if v < want:
 314                 start = mid+1
 315                 startv = _helpers.firstword(v)
 316             elif v > want:
 317                 end = mid
 318                 endv = _helpers.firstword(v)
 319             else: # got it!
 320                 return True
 321         return None
 322
 323     def __iter__(self):
 324         for i in xrange(self._fanget(self.entries-1)):
 325             yield buffer(self.shalist, i*20, 20)
 326
 327     def __len__(self):
 328         return int(self._fanget(self.entries-1))
 329
 330
 331 _mpi_count = 0
 332 class PackIdxList:
 333     def __init__(self, dir):
 334         global _mpi_count
 335         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 336         _mpi_count += 1
 337         self.dir = dir
 338         self.also = {}
 339         self.packs = []
 340         self.refresh()
 341
 342     def __del__(self):
 343         global _mpi_count
 344         _mpi_count -= 1
 345         assert(_mpi_count == 0)
 346
 347     def __iter__(self):
 348         return iter(idxmerge(self.packs))
 349
 350     def __len__(self):
 351         return sum(len(pack) for pack in self.packs)
 352
 353     def exists(self, hash):
 354         """Return nonempty if the object exists in the index files."""
 355         global _total_searches
 356         _total_searches += 1
 357         if hash in self.also:
 358             return True
 359         for i in range(len(self.packs)):
 360             p = self.packs[i]
 361             _total_searches -= 1  # will be incremented by sub-pack
 362             if p.exists(hash):
 363                 # reorder so most recently used packs are searched first
 364                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 365                 return p.name
 366         return None
 367
 368     def refresh(self, skip_midx = False):
 369         """Refresh the index list.
 370         This method verifies if .midx files were superseded (e.g. all of its
 371         contents are in another, bigger .midx file) and removes the superseded
 372         files.
 373
 374         If skip_midx is True, all work on .midx files will be skipped and .midx
 375         files will be removed from the list.
 376
 377         The module-global variable 'ignore_midx' can force this function to
 378         always act as if skip_midx was True.
 379         """
 380         skip_midx = skip_midx or ignore_midx
 381         d = dict((p.name, p) for p in self.packs
 382                  if not skip_midx or not isinstance(p, PackMidx))
 383         if os.path.exists(self.dir):
 384             if not skip_midx:
 385                 midxl = []
 386                 for ix in self.packs:
 387                     if isinstance(ix, PackMidx):
 388                         for name in ix.idxnames:
 389                             d[os.path.join(self.dir, name)] = ix
 390                 for f in os.listdir(self.dir):
 391                     full = os.path.join(self.dir, f)
 392                     if f.endswith('.midx') and not d.get(full):
 393                         mx = PackMidx(full)
 394                         (mxd, mxf) = os.path.split(mx.name)
 395                         broken = 0
 396                         for n in mx.idxnames:
 397                             if not os.path.exists(os.path.join(mxd, n)):
 398                                 log(('warning: index %s missing\n' +
 399                                     '  used by %s\n') % (n, mxf))
 400                                 broken += 1
 401                         if broken:
 402                             del mx
 403                             unlink(full)
 404                         else:
 405                             midxl.append(mx)
 406                 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
 407                 for ix in midxl:
 408                     any = 0
 409                     for sub in ix.idxnames:
 410                         found = d.get(os.path.join(self.dir, sub))
 411                         if not found or isinstance(found, PackIdx):
 412                             # doesn't exist, or exists but not in a midx
 413                             d[ix.name] = ix
 414                             for name in ix.idxnames:
 415                                 d[os.path.join(self.dir, name)] = ix
 416                             any += 1
 417                             break
 418                     if not any and not ix.force_keep:
 419                         debug1('midx: removing redundant: %s\n'
 420                                % os.path.basename(ix.name))
 421                         unlink(ix.name)
 422             for f in os.listdir(self.dir):
 423                 full = os.path.join(self.dir, f)
 424                 if f.endswith('.idx') and not d.get(full):
 425                     try:
 426                         ix = open_idx(full)
 427                     except GitError, e:
 428                         add_error(e)
 429                         continue
 430                     d[full] = ix
 431             self.packs = list(set(d.values()))
 432         debug1('PackIdxList: using %d index%s.\n'
 433             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 434
 435     def packname_containing(self, hash):
 436         # figure out which pack contains a given hash.
 437         # FIXME: if the midx file format would just *store* this information,
 438         # we could calculate it a lot more efficiently.  But it's not needed
 439         # often, so let's do it like this.
 440         for f in os.listdir(self.dir):
 441             if f.endswith('.idx'):
 442                 full = os.path.join(self.dir, f)
 443                 try:
 444                     ix = open_idx(full)
 445                 except GitError, e:
 446                     add_error(e)
 447                     continue
 448                 if ix.exists(hash):
 449                     return full
 450
 451     def add(self, hash):
 452         """Insert an additional object in the list."""
 453         self.also[hash] = 1
 454
 455     def zap_also(self):
 456         """Remove all additional objects from the list."""
 457         self.also = {}
 458
 459
 460 def calc_hash(type, content):
 461     """Calculate some content's hash in the Git fashion."""
 462     header = '%s %d\0' % (type, len(content))
 463     sum = Sha1(header)
 464     sum.update(content)
 465     return sum.digest()
 466
 467
 468 def _shalist_sort_key(ent):
 469     (mode, name, id) = ent
 470     if stat.S_ISDIR(int(mode, 8)):
 471         return name + '/'
 472     else:
 473         return name
 474
 475
 476 def open_idx(filename):
 477     if filename.endswith('.idx'):
 478         f = open(filename, 'rb')
 479         header = f.read(8)
 480         if header[0:4] == '\377tOc':
 481             version = struct.unpack('!I', header[4:8])[0]
 482             if version == 2:
 483                 return PackIdxV2(filename, f)
 484             else:
 485                 raise GitError('%s: expected idx file version 2, got %d'
 486                                % (filename, version))
 487         elif len(header) == 8 and header[0:4] < '\377tOc':
 488             return PackIdxV1(filename, f)
 489         else:
 490             raise GitError('%s: unrecognized idx file header' % filename)
 491     elif filename.endswith('.midx'):
 492         return PackMidx(filename)
 493     else:
 494         raise GitError('idx filenames must end with .idx or .midx')
 495
 496
 497 def idxmerge(idxlist, final_progress=True):
 498     """Generate a list of all the objects reachable in a PackIdxList."""
 499     total = sum(len(i) for i in idxlist)
 500     iters = (iter(i) for i in idxlist)
 501     heap = [(next(it), it) for it in iters]
 502     heapq.heapify(heap)
 503     count = 0
 504     last = None
 505     while heap:
 506         if (count % 10024) == 0:
 507             progress('Reading indexes: %.2f%% (%d/%d)\r'
 508                      % (count*100.0/total, count, total))
 509         (e, it) = heap[0]
 510         if e != last:
 511             yield e
 512             last = e
 513         count += 1
 514         e = next(it)
 515         if e:
 516             heapq.heapreplace(heap, (e, it))
 517         else:
 518             heapq.heappop(heap)
 519     if final_progress:
 520         log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
 521
 522
 523 class PackWriter:
 524     """Writes Git objects insid a pack file."""
 525     def __init__(self, objcache_maker=None):
 526         self.count = 0
 527         self.outbytes = 0
 528         self.filename = None
 529         self.file = None
 530         self.idx = None
 531         self.objcache_maker = objcache_maker
 532         self.objcache = None
 533
 534     def __del__(self):
 535         self.close()
 536
 537     def _make_objcache(self):
 538         if self.objcache == None:
 539             if self.objcache_maker:
 540                 self.objcache = self.objcache_maker()
 541             else:
 542                 self.objcache = PackIdxList(repo('objects/pack'))
 543
 544     def _open(self):
 545         if not self.file:
 546             self._make_objcache()
 547             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 548             self.file = os.fdopen(fd, 'w+b')
 549             assert(name.endswith('.pack'))
 550             self.filename = name[:-5]
 551             self.file.write('PACK\0\0\0\2\0\0\0\0')
 552             self.idx = list(list() for i in xrange(256))
 553
 554     # the 'sha' parameter is used in client.py's _raw_write(), but not needed
 555     # in this basic version.
 556     def _raw_write(self, datalist, sha):
 557         self._open()
 558         f = self.file
 559         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 560         # the file never has a *partial* blob.  So let's make sure it's
 561         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 562         # to our hashsplit algorithm.)  f.write() does its own buffering,
 563         # but that's okay because we'll flush it in _end().
 564         oneblob = ''.join(datalist)
 565         f.write(oneblob)
 566         nw = len(oneblob)
 567         crc = zlib.crc32(oneblob) & 0xffffffff
 568         self._update_idx(sha, crc, nw)
 569         self.outbytes += nw
 570         self.count += 1
 571         return nw, crc
 572
 573     def _update_idx(self, sha, crc, size):
 574         assert(sha)
 575         if self.idx:
 576             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 577
 578     def _write(self, sha, type, content):
 579         if verbose:
 580             log('>')
 581         if not sha:
 582             sha = calc_hash(type, content)
 583         size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
 584         return sha
 585
 586     def breakpoint(self):
 587         """Clear byte and object counts and return the last processed id."""
 588         id = self._end()
 589         self.outbytes = self.count = 0
 590         return id
 591
 592     def write(self, type, content):
 593         """Write an object in this pack file."""
 594         return self._write(calc_hash(type, content), type, content)
 595
 596     def exists(self, id):
 597         """Return non-empty if an object is found in the object cache."""
 598         if not self.objcache:
 599             self._make_objcache()
 600         return self.objcache.exists(id)
 601
 602     def maybe_write(self, type, content):
 603         """Write an object to the pack file if not present and return its id."""
 604         sha = calc_hash(type, content)
 605         if not self.exists(sha):
 606             self._write(sha, type, content)
 607             self.objcache.add(sha)
 608         return sha
 609
 610     def new_blob(self, blob):
 611         """Create a blob object in the pack with the supplied content."""
 612         return self.maybe_write('blob', blob)
 613
 614     def new_tree(self, shalist):
 615         """Create a tree object in the pack."""
 616         shalist = sorted(shalist, key = _shalist_sort_key)
 617         l = []
 618         for (mode,name,bin) in shalist:
 619             assert(mode)
 620             assert(mode != '0')
 621             assert(mode[0] != '0')
 622             assert(name)
 623             assert(len(bin) == 20)
 624             l.append('%s %s\0%s' % (mode,name,bin))
 625         return self.maybe_write('tree', ''.join(l))
 626
 627     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 628         l = []
 629         if tree: l.append('tree %s' % tree.encode('hex'))
 630         if parent: l.append('parent %s' % parent.encode('hex'))
 631         if author: l.append('author %s %s' % (author, _git_date(adate)))
 632         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 633         l.append('')
 634         l.append(msg)
 635         return self.maybe_write('commit', '\n'.join(l))
 636
 637     def new_commit(self, parent, tree, date, msg):
 638         """Create a commit object in the pack."""
 639         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 640         commit = self._new_commit(tree, parent,
 641                                   userline, date, userline, date,
 642                                   msg)
 643         return commit
 644
 645     def abort(self):
 646         """Remove the pack file from disk."""
 647         f = self.file
 648         if f:
 649             self.idx = None
 650             self.file = None
 651             f.close()
 652             os.unlink(self.filename + '.pack')
 653
 654     def _end(self):
 655         f = self.file
 656         if not f: return None
 657         self.file = None
 658         self.objcache = None
 659         idx = self.idx
 660         self.idx = None
 661
 662         # update object count
 663         f.seek(8)
 664         cp = struct.pack('!i', self.count)
 665         assert(len(cp) == 4)
 666         f.write(cp)
 667
 668         # calculate the pack sha1sum
 669         f.seek(0)
 670         sum = Sha1()
 671         for b in chunkyreader(f):
 672             sum.update(b)
 673         packbin = sum.digest()
 674         f.write(packbin)
 675         f.close()
 676
 677         idx_f = open(self.filename + '.idx', 'wb')
 678         obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
 679         idx_f.close()
 680
 681         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 682         if os.path.exists(self.filename + '.map'):
 683             os.unlink(self.filename + '.map')
 684         os.rename(self.filename + '.pack', nameprefix + '.pack')
 685         os.rename(self.filename + '.idx', nameprefix + '.idx')
 686
 687         auto_midx(repo('objects/pack'))
 688         return nameprefix
 689
 690     def close(self):
 691         """Close the pack file and move it to its definitive path."""
 692         return self._end()
 693
 694     def _write_pack_idx_v2(self, file, idx, packbin):
 695         sum = Sha1()
 696
 697         def write(data):
 698             file.write(data)
 699             sum.update(data)
 700
 701         write('\377tOc\0\0\0\2')
 702
 703         n = 0
 704         for part in idx:
 705             n += len(part)
 706             write(struct.pack('!i', n))
 707             part.sort(key=lambda x: x[0])
 708
 709         obj_list_sum = Sha1()
 710         for part in idx:
 711             for entry in part:
 712                 write(entry[0])
 713                 obj_list_sum.update(entry[0])
 714         for part in idx:
 715             for entry in part:
 716                 write(struct.pack('!I', entry[1]))
 717         ofs64_list = []
 718         for part in idx:
 719             for entry in part:
 720                 if entry[2] & 0x80000000:
 721                     write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
 722                     ofs64_list.append(struct.pack('!Q', entry[2]))
 723                 else:
 724                     write(struct.pack('!i', entry[2]))
 725         for ofs64 in ofs64_list:
 726             write(ofs64)
 727
 728         write(packbin)
 729         file.write(sum.digest())
 730         return obj_list_sum.hexdigest()
 731
 732
 733 def _git_date(date):
 734     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 735
 736
 737 def _gitenv():
 738     os.environ['GIT_DIR'] = os.path.abspath(repo())
 739
 740
 741 def list_refs(refname = None):
 742     """Generate a list of tuples in the form (refname,hash).
 743     If a ref name is specified, list only this particular ref.
 744     """
 745     argv = ['git', 'show-ref', '--']
 746     if refname:
 747         argv += [refname]
 748     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 749     out = p.stdout.read().strip()
 750     rv = p.wait()  # not fatal
 751     if rv:
 752         assert(not out)
 753     if out:
 754         for d in out.split('\n'):
 755             (sha, name) = d.split(' ', 1)
 756             yield (name, sha.decode('hex'))
 757
 758
 759 def read_ref(refname):
 760     """Get the commit id of the most recent commit made on a given ref."""
 761     l = list(list_refs(refname))
 762     if l:
 763         assert(len(l) == 1)
 764         return l[0][1]
 765     else:
 766         return None
 767
 768
 769 def rev_list(ref, count=None):
 770     """Generate a list of reachable commits in reverse chronological order.
 771
 772     This generator walks through commits, from child to parent, that are
 773     reachable via the specified ref and yields a series of tuples of the form
 774     (date,hash).
 775
 776     If count is a non-zero integer, limit the number of commits to "count"
 777     objects.
 778     """
 779     assert(not ref.startswith('-'))
 780     opts = []
 781     if count:
 782         opts += ['-n', str(atoi(count))]
 783     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 784     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 785     commit = None
 786     for row in p.stdout:
 787         s = row.strip()
 788         if s.startswith('commit '):
 789             commit = s[7:].decode('hex')
 790         else:
 791             date = int(s)
 792             yield (date, commit)
 793     rv = p.wait()  # not fatal
 794     if rv:
 795         raise GitError, 'git rev-list returned error %d' % rv
 796
 797
 798 def rev_get_date(ref):
 799     """Get the date of the latest commit on the specified ref."""
 800     for (date, commit) in rev_list(ref, count=1):
 801         return date
 802     raise GitError, 'no such commit %r' % ref
 803
 804
 805 def rev_parse(committish):
 806     """Resolve the full hash for 'committish', if it exists.
 807
 808     Should be roughly equivalent to 'git rev-parse'.
 809
 810     Returns the hex value of the hash if it is found, None if 'committish' does
 811     not correspond to anything.
 812     """
 813     head = read_ref(committish)
 814     if head:
 815         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 816         return head
 817
 818     pL = PackIdxList(repo('objects/pack'))
 819
 820     if len(committish) == 40:
 821         try:
 822             hash = committish.decode('hex')
 823         except TypeError:
 824             return None
 825
 826         if pL.exists(hash):
 827             return hash
 828
 829     return None
 830
 831
 832 def update_ref(refname, newval, oldval):
 833     """Change the commit pointed to by a branch."""
 834     if not oldval:
 835         oldval = ''
 836     assert(refname.startswith('refs/heads/'))
 837     p = subprocess.Popen(['git', 'update-ref', refname,
 838                           newval.encode('hex'), oldval.encode('hex')],
 839                          preexec_fn = _gitenv)
 840     _git_wait('git update-ref', p)
 841
 842
 843 def guess_repo(path=None):
 844     """Set the path value in the global variable "repodir".
 845     This makes bup look for an existing bup repository, but not fail if a
 846     repository doesn't exist. Usually, if you are interacting with a bup
 847     repository, you would not be calling this function but using
 848     check_repo_or_die().
 849     """
 850     global repodir
 851     if path:
 852         repodir = path
 853     if not repodir:
 854         repodir = os.environ.get('BUP_DIR')
 855         if not repodir:
 856             repodir = os.path.expanduser('~/.bup')
 857
 858
 859 def init_repo(path=None):
 860     """Create the Git bare repository for bup in a given path."""
 861     guess_repo(path)
 862     d = repo()
 863     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 864         raise GitError('"%d" exists but is not a directory\n' % d)
 865     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 866                          preexec_fn = _gitenv)
 867     _git_wait('git init', p)
 868     # Force the index version configuration in order to ensure bup works
 869     # regardless of the version of the installed Git binary.
 870     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 871                          stdout=sys.stderr, preexec_fn = _gitenv)
 872     _git_wait('git config', p)
 873
 874
 875 def check_repo_or_die(path=None):
 876     """Make sure a bup repository exists, and abort if not.
 877     If the path to a particular repository was not specified, this function
 878     initializes the default repository automatically.
 879     """
 880     guess_repo(path)
 881     if not os.path.isdir(repo('objects/pack/.')):
 882         if repodir == home_repodir:
 883             init_repo()
 884         else:
 885             log('error: %r is not a bup/git repository\n' % repo())
 886             sys.exit(15)
 887
 888
 889 def treeparse(buf):
 890     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
 891     ofs = 0
 892     while ofs < len(buf):
 893         z = buf[ofs:].find('\0')
 894         assert(z > 0)
 895         spl = buf[ofs:ofs+z].split(' ', 1)
 896         assert(len(spl) == 2)
 897         sha = buf[ofs+z+1:ofs+z+1+20]
 898         ofs += z+1+20
 899         yield (spl[0], spl[1], sha)
 900
 901
 902 _ver = None
 903 def ver():
 904     """Get Git's version and ensure a usable version is installed.
 905
 906     The returned version is formatted as an ordered tuple with each position
 907     representing a digit in the version tag. For example, the following tuple
 908     would represent version 1.6.6.9:
 909
 910         ('1', '6', '6', '9')
 911     """
 912     global _ver
 913     if not _ver:
 914         p = subprocess.Popen(['git', '--version'],
 915                              stdout=subprocess.PIPE)
 916         gvs = p.stdout.read()
 917         _git_wait('git --version', p)
 918         m = re.match(r'git version (\S+.\S+)', gvs)
 919         if not m:
 920             raise GitError('git --version weird output: %r' % gvs)
 921         _ver = tuple(m.group(1).split('.'))
 922     needed = ('1','5', '3', '1')
 923     if _ver < needed:
 924         raise GitError('git version %s or higher is required; you have %s'
 925                        % ('.'.join(needed), '.'.join(_ver)))
 926     return _ver
 927
 928
 929 def _git_wait(cmd, p):
 930     rv = p.wait()
 931     if rv != 0:
 932         raise GitError('%s returned %d' % (cmd, rv))
 933
 934
 935 def _git_capture(argv):
 936     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 937     r = p.stdout.read()
 938     _git_wait(repr(argv), p)
 939     return r
 940
 941
 942 class _AbortableIter:
 943     def __init__(self, it, onabort = None):
 944         self.it = it
 945         self.onabort = onabort
 946         self.done = None
 947
 948     def __iter__(self):
 949         return self
 950
 951     def next(self):
 952         try:
 953             return self.it.next()
 954         except StopIteration, e:
 955             self.done = True
 956             raise
 957         except:
 958             self.abort()
 959             raise
 960
 961     def abort(self):
 962         """Abort iteration and call the abortion callback, if needed."""
 963         if not self.done:
 964             self.done = True
 965             if self.onabort:
 966                 self.onabort()
 967
 968     def __del__(self):
 969         self.abort()
 970
 971
# Set to 1 once CatPipe has logged its "git version too old" warning, so that
# the warning is not repeated for every CatPipe that gets created.
_ver_warned = 0
 973 class CatPipe:
 974     """Link to 'git cat-file' that is used to retrieve blob data."""
 975     def __init__(self):
 976         global _ver_warned
 977         wanted = ('1','5','6')
 978         if ver() < wanted:
 979             if not _ver_warned:
 980                 log('warning: git version < %s; bup will be slow.\n'
 981                     % '.'.join(wanted))
 982                 _ver_warned = 1
 983             self.get = self._slow_get
 984         else:
 985             self.p = self.inprogress = None
 986             self.get = self._fast_get
 987
 988     def _abort(self):
 989         if self.p:
 990             self.p.stdout.close()
 991             self.p.stdin.close()
 992         self.p = None
 993         self.inprogress = None
 994
 995     def _restart(self):
 996         self._abort()
 997         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
 998                                   stdin=subprocess.PIPE,
 999                                   stdout=subprocess.PIPE,
1000                                   close_fds = True,
1001                                   preexec_fn = _gitenv)
1002
1003     def _fast_get(self, id):
1004         if not self.p or self.p.poll() != None:
1005             self._restart()
1006         assert(self.p)
1007         assert(self.p.poll() == None)
1008         if self.inprogress:
1009             log('_fast_get: opening %r while %r is open'
1010                 % (id, self.inprogress))
1011         assert(not self.inprogress)
1012         assert(id.find('\n') < 0)
1013         assert(id.find('\r') < 0)
1014         assert(not id.startswith('-'))
1015         self.inprogress = id
1016         self.p.stdin.write('%s\n' % id)
1017         hdr = self.p.stdout.readline()
1018         if hdr.endswith(' missing\n'):
1019             self.inprogress = None
1020             raise KeyError('blob %r is missing' % id)
1021         spl = hdr.split(' ')
1022         if len(spl) != 3 or len(spl[0]) != 40:
1023             raise GitError('expected blob, got %r' % spl)
1024         (hex, type, size) = spl
1025
1026         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1027                            onabort = self._abort)
1028         try:
1029             yield type
1030             for blob in it:
1031                 yield blob
1032             assert(self.p.stdout.readline() == '\n')
1033             self.inprogress = None
1034         except Exception, e:
1035             it.abort()
1036             raise
1037
1038     def _slow_get(self, id):
1039         assert(id.find('\n') < 0)
1040         assert(id.find('\r') < 0)
1041         assert(id[0] != '-')
1042         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1043         yield type
1044
1045         p = subprocess.Popen(['git', 'cat-file', type, id],
1046                              stdout=subprocess.PIPE,
1047                              preexec_fn = _gitenv)
1048         for blob in chunkyreader(p.stdout):
1049             yield blob
1050         _git_wait('git cat-file', p)
1051
1052     def _join(self, it):
1053         type = it.next()
1054         if type == 'blob':
1055             for blob in it:
1056                 yield blob
1057         elif type == 'tree':
1058             treefile = ''.join(it)
1059             for (mode, name, sha) in treeparse(treefile):
1060                 for blob in self.join(sha.encode('hex')):
1061                     yield blob
1062         elif type == 'commit':
1063             treeline = ''.join(it).split('\n')[0]
1064             assert(treeline.startswith('tree '))
1065             for blob in self.join(treeline[5:]):
1066                 yield blob
1067         else:
1068             raise GitError('invalid object type %r: expected blob/tree/commit'
1069                            % type)
1070
1071     def join(self, id):
1072         """Generate a list of the content of all blobs that can be reached
1073         from an object.  The hash given in 'id' must point to a blob, a tree
1074         or a commit. The content of all blobs that can be seen from trees or
1075         commits will be added to the list.
1076         """
1077         try:
1078             for d in self._join(self.get(id)):
1079                 yield d
1080         except StopIteration:
1081             log('booger!\n')