lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile
   6 from bup.helpers import *
   7 from bup import _helpers, path
   8
   9 MIDX_VERSION = 2
  10
  11 verbose = 0
  12 ignore_midx = 0
  13 home_repodir = os.path.expanduser('~/.bup')
  14 repodir = None
  15
  16 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  17 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  18
  19 _total_searches = 0
  20 _total_steps = 0
  21
  22
  23 class GitError(Exception):
  24     pass
  25
  26
  27 def repo(sub = ''):
  28     """Get the path to the git repository or one of its subdirectories."""
  29     global repodir
  30     if not repodir:
  31         raise GitError('You should call check_repo_or_die()')
  32
  33     # If there's a .git subdirectory, then the actual repo is in there.
  34     gd = os.path.join(repodir, '.git')
  35     if os.path.exists(gd):
  36         repodir = gd
  37
  38     return os.path.join(repodir, sub)
  39
  40
  41 def auto_midx(objdir):
  42     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
  43     try:
  44         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  45     except OSError, e:
  46         # make sure 'args' gets printed to help with debugging
  47         add_error('%r: exception: %s' % (args, e))
  48         raise
  49     if rv:
  50         add_error('%r: returned %d' % (args, rv))
  51
  52
  53 def mangle_name(name, mode, gitmode):
  54     """Mangle a file name to present an abstract name for segmented files.
  55     Mangled file names will have the ".bup" extension added to them. If a
  56     file's name already ends with ".bup", a ".bupl" extension is added to
  57     disambiguate normal files from semgmented ones.
  58     """
  59     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
  60         return name + '.bup'
  61     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
  62         return name + '.bupl'
  63     else:
  64         return name
  65
  66
  67 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
  68 def demangle_name(name):
  69     """Remove name mangling from a file name, if necessary.
  70
  71     The return value is a tuple (demangled_filename,mode), where mode is one of
  72     the following:
  73
  74     * BUP_NORMAL  : files that should be read as-is from the repository
  75     * BUP_CHUNKED : files that were chunked and need to be assembled
  76
  77     For more information on the name mangling algorythm, see mangle_name()
  78     """
  79     if name.endswith('.bupl'):
  80         return (name[:-5], BUP_NORMAL)
  81     elif name.endswith('.bup'):
  82         return (name[:-4], BUP_CHUNKED)
  83     else:
  84         return (name, BUP_NORMAL)
  85
  86
  87 def _encode_packobj(type, content):
  88     szout = ''
  89     sz = len(content)
  90     szbits = (sz & 0x0f) | (_typemap[type]<<4)
  91     sz >>= 4
  92     while 1:
  93         if sz: szbits |= 0x80
  94         szout += chr(szbits)
  95         if not sz:
  96             break
  97         szbits = sz & 0x7f
  98         sz >>= 7
  99     z = zlib.compressobj(1)
 100     yield szout
 101     yield z.compress(content)
 102     yield z.flush()
 103
 104
 105 def _encode_looseobj(type, content):
 106     z = zlib.compressobj(1)
 107     yield z.compress('%s %d\0' % (type, len(content)))
 108     yield z.compress(content)
 109     yield z.flush()
 110
 111
 112 def _decode_looseobj(buf):
 113     assert(buf);
 114     s = zlib.decompress(buf)
 115     i = s.find('\0')
 116     assert(i > 0)
 117     l = s[:i].split(' ')
 118     type = l[0]
 119     sz = int(l[1])
 120     content = s[i+1:]
 121     assert(type in _typemap)
 122     assert(sz == len(content))
 123     return (type, content)
 124
 125
 126 def _decode_packobj(buf):
 127     assert(buf)
 128     c = ord(buf[0])
 129     type = _typermap[(c & 0x70) >> 4]
 130     sz = c & 0x0f
 131     shift = 4
 132     i = 0
 133     while c & 0x80:
 134         i += 1
 135         c = ord(buf[i])
 136         sz |= (c & 0x7f) << shift
 137         shift += 7
 138         if not (c & 0x80):
 139             break
 140     return (type, zlib.decompress(buf[i+1:]))
 141
 142
 143 class PackIdx:
 144     def __init__(self):
 145         assert(0)
 146
 147     def find_offset(self, hash):
 148         """Get the offset of an object inside the index file."""
 149         idx = self._idx_from_hash(hash)
 150         if idx != None:
 151             return self._ofs_from_idx(idx)
 152         return None
 153
 154     def exists(self, hash):
 155         """Return nonempty if the object exists in this index."""
 156         return hash and (self._idx_from_hash(hash) != None) and True or None
 157
 158     def __len__(self):
 159         return int(self.fanout[255])
 160
 161     def _idx_from_hash(self, hash):
 162         global _total_searches, _total_steps
 163         _total_searches += 1
 164         assert(len(hash) == 20)
 165         b1 = ord(hash[0])
 166         start = self.fanout[b1-1] # range -1..254
 167         end = self.fanout[b1] # range 0..255
 168         want = str(hash)
 169         _total_steps += 1  # lookup table is a step
 170         while start < end:
 171             _total_steps += 1
 172             mid = start + (end-start)/2
 173             v = self._idx_to_hash(mid)
 174             if v < want:
 175                 start = mid+1
 176             elif v > want:
 177                 end = mid
 178             else: # got it!
 179                 return mid
 180         return None
 181
 182
 183 class PackIdxV1(PackIdx):
 184     """Object representation of a Git pack index (version 1) file."""
 185     def __init__(self, filename, f):
 186         self.name = filename
 187         self.idxnames = [self.name]
 188         self.map = mmap_read(f)
 189         self.fanout = list(struct.unpack('!256I',
 190                                          str(buffer(self.map, 0, 256*4))))
 191         self.fanout.append(0)  # entry "-1"
 192         nsha = self.fanout[255]
 193         self.shatable = buffer(self.map, 256*4, nsha*24)
 194
 195     def _ofs_from_idx(self, idx):
 196         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 197
 198     def _idx_to_hash(self, idx):
 199         return str(self.shatable[idx*24+4 : idx*24+24])
 200
 201     def __iter__(self):
 202         for i in xrange(self.fanout[255]):
 203             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 204
 205
 206 class PackIdxV2(PackIdx):
 207     """Object representation of a Git pack index (version 2) file."""
 208     def __init__(self, filename, f):
 209         self.name = filename
 210         self.idxnames = [self.name]
 211         self.map = mmap_read(f)
 212         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 213         self.fanout = list(struct.unpack('!256I',
 214                                          str(buffer(self.map, 8, 256*4))))
 215         self.fanout.append(0)  # entry "-1"
 216         nsha = self.fanout[255]
 217         self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
 218         self.ofstable = buffer(self.map,
 219                                8 + 256*4 + nsha*20 + nsha*4,
 220                                nsha*4)
 221         self.ofs64table = buffer(self.map,
 222                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 223
 224     def _ofs_from_idx(self, idx):
 225         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 226         if ofs & 0x80000000:
 227             idx64 = ofs & 0x7fffffff
 228             ofs = struct.unpack('!Q',
 229                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 230         return ofs
 231
 232     def _idx_to_hash(self, idx):
 233         return str(self.shatable[idx*20:(idx+1)*20])
 234
 235     def __iter__(self):
 236         for i in xrange(self.fanout[255]):
 237             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 238
 239
 240 extract_bits = _helpers.extract_bits
 241
 242
 243 class PackMidx:
 244     """Wrapper which contains data from multiple index files.
 245     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 246     and make it possible for bup to expand Git's indexing capabilities to vast
 247     amounts of files.
 248     """
 249     def __init__(self, filename):
 250         self.name = filename
 251         self.force_keep = False
 252         assert(filename.endswith('.midx'))
 253         self.map = mmap_read(open(filename))
 254         if str(self.map[0:4]) != 'MIDX':
 255             log('Warning: skipping: invalid MIDX header in %r\n' % filename)
 256             self.force_keep = True
 257             return self._init_failed()
 258         ver = struct.unpack('!I', self.map[4:8])[0]
 259         if ver < MIDX_VERSION:
 260             log('Warning: ignoring old-style (v%d) midx %r\n'
 261                 % (ver, filename))
 262             self.force_keep = False  # old stuff is boring
 263             return self._init_failed()
 264         if ver > MIDX_VERSION:
 265             log('Warning: ignoring too-new (v%d) midx %r\n'
 266                 % (ver, filename))
 267             self.force_keep = True  # new stuff is exciting
 268             return self._init_failed()
 269
 270         self.bits = _helpers.firstword(self.map[8:12])
 271         self.entries = 2**self.bits
 272         self.fanout = buffer(self.map, 12, self.entries*4)
 273         shaofs = 12 + self.entries*4
 274         nsha = self._fanget(self.entries-1)
 275         self.shalist = buffer(self.map, shaofs, nsha*20)
 276         self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
 277
 278     def _init_failed(self):
 279         self.bits = 0
 280         self.entries = 1
 281         self.fanout = buffer('\0\0\0\0')
 282         self.shalist = buffer('\0'*20)
 283         self.idxnames = []
 284
 285     def _fanget(self, i):
 286         start = i*4
 287         s = self.fanout[start:start+4]
 288         return _helpers.firstword(s)
 289
 290     def _get(self, i):
 291         return str(self.shalist[i*20:(i+1)*20])
 292
 293     def exists(self, hash):
 294         """Return nonempty if the object exists in the index files."""
 295         global _total_searches, _total_steps
 296         _total_searches += 1
 297         want = str(hash)
 298         el = extract_bits(want, self.bits)
 299         if el:
 300             start = self._fanget(el-1)
 301             startv = el << (32-self.bits)
 302         else:
 303             start = 0
 304             startv = 0
 305         end = self._fanget(el)
 306         endv = (el+1) << (32-self.bits)
 307         _total_steps += 1   # lookup table is a step
 308         hashv = _helpers.firstword(hash)
 309         #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
 310         while start < end:
 311             _total_steps += 1
 312             #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
 313             mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
 314             #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
 315             v = self._get(mid)
 316             #print '    %08x' % self._num(v)
 317             if v < want:
 318                 start = mid+1
 319                 startv = _helpers.firstword(v)
 320             elif v > want:
 321                 end = mid
 322                 endv = _helpers.firstword(v)
 323             else: # got it!
 324                 return True
 325         return None
 326
 327     def __iter__(self):
 328         for i in xrange(self._fanget(self.entries-1)):
 329             yield buffer(self.shalist, i*20, 20)
 330
 331     def __len__(self):
 332         return int(self._fanget(self.entries-1))
 333
 334
 335 _mpi_count = 0
 336 class PackIdxList:
 337     def __init__(self, dir):
 338         global _mpi_count
 339         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 340         _mpi_count += 1
 341         self.dir = dir
 342         self.also = {}
 343         self.packs = []
 344         self.refresh()
 345
 346     def __del__(self):
 347         global _mpi_count
 348         _mpi_count -= 1
 349         assert(_mpi_count == 0)
 350
 351     def __iter__(self):
 352         return iter(idxmerge(self.packs))
 353
 354     def __len__(self):
 355         return sum(len(pack) for pack in self.packs)
 356
 357     def exists(self, hash):
 358         """Return nonempty if the object exists in the index files."""
 359         global _total_searches
 360         _total_searches += 1
 361         if hash in self.also:
 362             return True
 363         for i in range(len(self.packs)):
 364             p = self.packs[i]
 365             _total_searches -= 1  # will be incremented by sub-pack
 366             if p.exists(hash):
 367                 # reorder so most recently used packs are searched first
 368                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 369                 return p.name
 370         return None
 371
 372     def refresh(self, skip_midx = False):
 373         """Refresh the index list.
 374         This method verifies if .midx files were superseded (e.g. all of its
 375         contents are in another, bigger .midx file) and removes the superseded
 376         files.
 377
 378         If skip_midx is True, all work on .midx files will be skipped and .midx
 379         files will be removed from the list.
 380
 381         The module-global variable 'ignore_midx' can force this function to
 382         always act as if skip_midx was True.
 383         """
 384         skip_midx = skip_midx or ignore_midx
 385         d = dict((p.name, p) for p in self.packs
 386                  if not skip_midx or not isinstance(p, PackMidx))
 387         if os.path.exists(self.dir):
 388             if not skip_midx:
 389                 midxl = []
 390                 for ix in self.packs:
 391                     if isinstance(ix, PackMidx):
 392                         for name in ix.idxnames:
 393                             d[os.path.join(self.dir, name)] = ix
 394                 for f in os.listdir(self.dir):
 395                     full = os.path.join(self.dir, f)
 396                     if f.endswith('.midx') and not d.get(full):
 397                         mx = PackMidx(full)
 398                         (mxd, mxf) = os.path.split(mx.name)
 399                         broken = 0
 400                         for n in mx.idxnames:
 401                             if not os.path.exists(os.path.join(mxd, n)):
 402                                 log(('warning: index %s missing\n' +
 403                                     '  used by %s\n') % (n, mxf))
 404                                 broken += 1
 405                         if broken:
 406                             del mx
 407                             unlink(full)
 408                         else:
 409                             midxl.append(mx)
 410                 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
 411                 for ix in midxl:
 412                     any = 0
 413                     for sub in ix.idxnames:
 414                         found = d.get(os.path.join(self.dir, sub))
 415                         if not found or isinstance(found, PackIdx):
 416                             # doesn't exist, or exists but not in a midx
 417                             d[ix.name] = ix
 418                             for name in ix.idxnames:
 419                                 d[os.path.join(self.dir, name)] = ix
 420                             any += 1
 421                             break
 422                     if not any and not ix.force_keep:
 423                         debug1('midx: removing redundant: %s\n'
 424                                % os.path.basename(ix.name))
 425                         unlink(ix.name)
 426             for f in os.listdir(self.dir):
 427                 full = os.path.join(self.dir, f)
 428                 if f.endswith('.idx') and not d.get(full):
 429                     try:
 430                         ix = open_idx(full)
 431                     except GitError, e:
 432                         add_error(e)
 433                         continue
 434                     d[full] = ix
 435             self.packs = list(set(d.values()))
 436         debug1('PackIdxList: using %d index%s.\n'
 437             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 438
 439     def packname_containing(self, hash):
 440         # figure out which pack contains a given hash.
 441         # FIXME: if the midx file format would just *store* this information,
 442         # we could calculate it a lot more efficiently.  But it's not needed
 443         # often, so let's do it like this.
 444         for f in os.listdir(self.dir):
 445             if f.endswith('.idx'):
 446                 full = os.path.join(self.dir, f)
 447                 try:
 448                     ix = open_idx(full)
 449                 except GitError, e:
 450                     add_error(e)
 451                     continue
 452                 if ix.exists(hash):
 453                     return full
 454
 455     def add(self, hash):
 456         """Insert an additional object in the list."""
 457         self.also[hash] = 1
 458
 459     def zap_also(self):
 460         """Remove all additional objects from the list."""
 461         self.also = {}
 462
 463
 464 def calc_hash(type, content):
 465     """Calculate some content's hash in the Git fashion."""
 466     header = '%s %d\0' % (type, len(content))
 467     sum = Sha1(header)
 468     sum.update(content)
 469     return sum.digest()
 470
 471
 472 def _shalist_sort_key(ent):
 473     (mode, name, id) = ent
 474     if stat.S_ISDIR(int(mode, 8)):
 475         return name + '/'
 476     else:
 477         return name
 478
 479
 480 def open_idx(filename):
 481     if filename.endswith('.idx'):
 482         f = open(filename, 'rb')
 483         header = f.read(8)
 484         if header[0:4] == '\377tOc':
 485             version = struct.unpack('!I', header[4:8])[0]
 486             if version == 2:
 487                 return PackIdxV2(filename, f)
 488             else:
 489                 raise GitError('%s: expected idx file version 2, got %d'
 490                                % (filename, version))
 491         elif len(header) == 8 and header[0:4] < '\377tOc':
 492             return PackIdxV1(filename, f)
 493         else:
 494             raise GitError('%s: unrecognized idx file header' % filename)
 495     elif filename.endswith('.midx'):
 496         return PackMidx(filename)
 497     else:
 498         raise GitError('idx filenames must end with .idx or .midx')
 499
 500
 501 def idxmerge(idxlist, final_progress=True):
 502     """Generate a list of all the objects reachable in a PackIdxList."""
 503     def pfunc(count, total):
 504         progress('Reading indexes: %.2f%% (%d/%d)\r'
 505                  % (count*100.0/total, count, total))
 506     def pfinal(count, total):
 507         if final_progress:
 508             log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
 509     return merge_iter(idxlist, 10024, pfunc, pfinal)
 510
 511
 512 def _make_objcache():
 513     return PackIdxList(repo('objects/pack'))
 514
 515 class PackWriter:
 516     """Writes Git objects insid a pack file."""
 517     def __init__(self, objcache_maker=_make_objcache):
 518         self.count = 0
 519         self.outbytes = 0
 520         self.filename = None
 521         self.file = None
 522         self.idx = None
 523         self.objcache_maker = objcache_maker
 524         self.objcache = None
 525
 526     def __del__(self):
 527         self.close()
 528
 529     def _open(self):
 530         if not self.file:
 531             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 532             self.file = os.fdopen(fd, 'w+b')
 533             assert(name.endswith('.pack'))
 534             self.filename = name[:-5]
 535             self.file.write('PACK\0\0\0\2\0\0\0\0')
 536             self.idx = list(list() for i in xrange(256))
 537
 538     # the 'sha' parameter is used in client.py's _raw_write(), but not needed
 539     # in this basic version.
 540     def _raw_write(self, datalist, sha):
 541         self._open()
 542         f = self.file
 543         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 544         # the file never has a *partial* blob.  So let's make sure it's
 545         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 546         # to our hashsplit algorithm.)  f.write() does its own buffering,
 547         # but that's okay because we'll flush it in _end().
 548         oneblob = ''.join(datalist)
 549         try:
 550             f.write(oneblob)
 551         except IOError, e:
 552             raise GitError, e, sys.exc_info()[2]
 553         nw = len(oneblob)
 554         crc = zlib.crc32(oneblob) & 0xffffffff
 555         self._update_idx(sha, crc, nw)
 556         self.outbytes += nw
 557         self.count += 1
 558         return nw, crc
 559
 560     def _update_idx(self, sha, crc, size):
 561         assert(sha)
 562         if self.idx:
 563             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 564
 565     def _write(self, sha, type, content):
 566         if verbose:
 567             log('>')
 568         if not sha:
 569             sha = calc_hash(type, content)
 570         size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
 571         return sha
 572
 573     def breakpoint(self):
 574         """Clear byte and object counts and return the last processed id."""
 575         id = self._end()
 576         self.outbytes = self.count = 0
 577         return id
 578
 579     def write(self, type, content):
 580         """Write an object in this pack file."""
 581         return self._write(calc_hash(type, content), type, content)
 582
 583     def _require_objcache(self):
 584         if self.objcache is None and self.objcache_maker:
 585             self.objcache = self.objcache_maker()
 586         if self.objcache is None:
 587             raise GitError(
 588                     "PackWriter not opened or can't check exists w/o objcache")
 589
 590     def exists(self, id):
 591         """Return non-empty if an object is found in the object cache."""
 592         self._require_objcache()
 593         return self.objcache.exists(id)
 594
 595     def maybe_write(self, type, content):
 596         """Write an object to the pack file if not present and return its id."""
 597         self._require_objcache()
 598         sha = calc_hash(type, content)
 599         if not self.exists(sha):
 600             self._write(sha, type, content)
 601             self.objcache.add(sha)
 602         return sha
 603
 604     def new_blob(self, blob):
 605         """Create a blob object in the pack with the supplied content."""
 606         return self.maybe_write('blob', blob)
 607
 608     def new_tree(self, shalist):
 609         """Create a tree object in the pack."""
 610         shalist = sorted(shalist, key = _shalist_sort_key)
 611         l = []
 612         for (mode,name,bin) in shalist:
 613             assert(mode)
 614             assert(mode != '0')
 615             assert(mode[0] != '0')
 616             assert(name)
 617             assert(len(bin) == 20)
 618             l.append('%s %s\0%s' % (mode,name,bin))
 619         return self.maybe_write('tree', ''.join(l))
 620
 621     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 622         l = []
 623         if tree: l.append('tree %s' % tree.encode('hex'))
 624         if parent: l.append('parent %s' % parent.encode('hex'))
 625         if author: l.append('author %s %s' % (author, _git_date(adate)))
 626         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 627         l.append('')
 628         l.append(msg)
 629         return self.maybe_write('commit', '\n'.join(l))
 630
 631     def new_commit(self, parent, tree, date, msg):
 632         """Create a commit object in the pack."""
 633         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 634         commit = self._new_commit(tree, parent,
 635                                   userline, date, userline, date,
 636                                   msg)
 637         return commit
 638
 639     def abort(self):
 640         """Remove the pack file from disk."""
 641         f = self.file
 642         if f:
 643             self.idx = None
 644             self.file = None
 645             f.close()
 646             os.unlink(self.filename + '.pack')
 647
 648     def _end(self, run_midx=True):
 649         f = self.file
 650         if not f: return None
 651         self.file = None
 652         self.objcache = None
 653         idx = self.idx
 654         self.idx = None
 655
 656         # update object count
 657         f.seek(8)
 658         cp = struct.pack('!i', self.count)
 659         assert(len(cp) == 4)
 660         f.write(cp)
 661
 662         # calculate the pack sha1sum
 663         f.seek(0)
 664         sum = Sha1()
 665         for b in chunkyreader(f):
 666             sum.update(b)
 667         packbin = sum.digest()
 668         f.write(packbin)
 669         f.close()
 670
 671         idx_f = open(self.filename + '.idx', 'wb')
 672         obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
 673         idx_f.close()
 674
 675         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 676         if os.path.exists(self.filename + '.map'):
 677             os.unlink(self.filename + '.map')
 678         os.rename(self.filename + '.pack', nameprefix + '.pack')
 679         os.rename(self.filename + '.idx', nameprefix + '.idx')
 680
 681         if run_midx:
 682             auto_midx(repo('objects/pack'))
 683         return nameprefix
 684
 685     def close(self, run_midx=True):
 686         """Close the pack file and move it to its definitive path."""
 687         return self._end(run_midx=run_midx)
 688
 689     def _write_pack_idx_v2(self, file, idx, packbin):
 690         sum = Sha1()
 691
 692         def write(data):
 693             file.write(data)
 694             sum.update(data)
 695
 696         write('\377tOc\0\0\0\2')
 697
 698         n = 0
 699         for part in idx:
 700             n += len(part)
 701             write(struct.pack('!i', n))
 702             part.sort(key=lambda x: x[0])
 703
 704         obj_list_sum = Sha1()
 705         for part in idx:
 706             for entry in part:
 707                 write(entry[0])
 708                 obj_list_sum.update(entry[0])
 709         for part in idx:
 710             for entry in part:
 711                 write(struct.pack('!I', entry[1]))
 712         ofs64_list = []
 713         for part in idx:
 714             for entry in part:
 715                 if entry[2] & 0x80000000:
 716                     write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
 717                     ofs64_list.append(struct.pack('!Q', entry[2]))
 718                 else:
 719                     write(struct.pack('!i', entry[2]))
 720         for ofs64 in ofs64_list:
 721             write(ofs64)
 722
 723         write(packbin)
 724         file.write(sum.digest())
 725         return obj_list_sum.hexdigest()
 726
 727
 728 def _git_date(date):
 729     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 730
 731
 732 def _gitenv():
 733     os.environ['GIT_DIR'] = os.path.abspath(repo())
 734
 735
 736 def list_refs(refname = None):
 737     """Generate a list of tuples in the form (refname,hash).
 738     If a ref name is specified, list only this particular ref.
 739     """
 740     argv = ['git', 'show-ref', '--']
 741     if refname:
 742         argv += [refname]
 743     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 744     out = p.stdout.read().strip()
 745     rv = p.wait()  # not fatal
 746     if rv:
 747         assert(not out)
 748     if out:
 749         for d in out.split('\n'):
 750             (sha, name) = d.split(' ', 1)
 751             yield (name, sha.decode('hex'))
 752
 753
 754 def read_ref(refname):
 755     """Get the commit id of the most recent commit made on a given ref."""
 756     l = list(list_refs(refname))
 757     if l:
 758         assert(len(l) == 1)
 759         return l[0][1]
 760     else:
 761         return None
 762
 763
 764 def rev_list(ref, count=None):
 765     """Generate a list of reachable commits in reverse chronological order.
 766
 767     This generator walks through commits, from child to parent, that are
 768     reachable via the specified ref and yields a series of tuples of the form
 769     (date,hash).
 770
 771     If count is a non-zero integer, limit the number of commits to "count"
 772     objects.
 773     """
 774     assert(not ref.startswith('-'))
 775     opts = []
 776     if count:
 777         opts += ['-n', str(atoi(count))]
 778     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 779     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 780     commit = None
 781     for row in p.stdout:
 782         s = row.strip()
 783         if s.startswith('commit '):
 784             commit = s[7:].decode('hex')
 785         else:
 786             date = int(s)
 787             yield (date, commit)
 788     rv = p.wait()  # not fatal
 789     if rv:
 790         raise GitError, 'git rev-list returned error %d' % rv
 791
 792
 793 def rev_get_date(ref):
 794     """Get the date of the latest commit on the specified ref."""
 795     for (date, commit) in rev_list(ref, count=1):
 796         return date
 797     raise GitError, 'no such commit %r' % ref
 798
 799
 800 def rev_parse(committish):
 801     """Resolve the full hash for 'committish', if it exists.
 802
 803     Should be roughly equivalent to 'git rev-parse'.
 804
 805     Returns the hex value of the hash if it is found, None if 'committish' does
 806     not correspond to anything.
 807     """
 808     head = read_ref(committish)
 809     if head:
 810         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 811         return head
 812
 813     pL = PackIdxList(repo('objects/pack'))
 814
 815     if len(committish) == 40:
 816         try:
 817             hash = committish.decode('hex')
 818         except TypeError:
 819             return None
 820
 821         if pL.exists(hash):
 822             return hash
 823
 824     return None
 825
 826
 827 def update_ref(refname, newval, oldval):
 828     """Change the commit pointed to by a branch."""
 829     if not oldval:
 830         oldval = ''
 831     assert(refname.startswith('refs/heads/'))
 832     p = subprocess.Popen(['git', 'update-ref', refname,
 833                           newval.encode('hex'), oldval.encode('hex')],
 834                          preexec_fn = _gitenv)
 835     _git_wait('git update-ref', p)
 836
 837
 838 def guess_repo(path=None):
 839     """Set the path value in the global variable "repodir".
 840     This makes bup look for an existing bup repository, but not fail if a
 841     repository doesn't exist. Usually, if you are interacting with a bup
 842     repository, you would not be calling this function but using
 843     check_repo_or_die().
 844     """
 845     global repodir
 846     if path:
 847         repodir = path
 848     if not repodir:
 849         repodir = os.environ.get('BUP_DIR')
 850         if not repodir:
 851             repodir = os.path.expanduser('~/.bup')
 852
 853
 854 def init_repo(path=None):
 855     """Create the Git bare repository for bup in a given path."""
 856     guess_repo(path)
 857     d = repo()  # appends a / to the path
 858     parent = os.path.dirname(os.path.dirname(d))
 859     if parent and not os.path.exists(parent):
 860         raise GitError('parent directory "%s" does not exist\n' % parent)
 861     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 862         raise GitError('"%d" exists but is not a directory\n' % d)
 863     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 864                          preexec_fn = _gitenv)
 865     _git_wait('git init', p)
 866     # Force the index version configuration in order to ensure bup works
 867     # regardless of the version of the installed Git binary.
 868     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 869                          stdout=sys.stderr, preexec_fn = _gitenv)
 870     _git_wait('git config', p)
 871
 872
 873 def check_repo_or_die(path=None):
 874     """Make sure a bup repository exists, and abort if not.
 875     If the path to a particular repository was not specified, this function
 876     initializes the default repository automatically.
 877     """
 878     guess_repo(path)
 879     if not os.path.isdir(repo('objects/pack/.')):
 880         if repodir == home_repodir:
 881             init_repo()
 882         else:
 883             log('error: %r is not a bup/git repository\n' % repo())
 884             sys.exit(15)
 885
 886
 887 def treeparse(buf):
 888     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
 889     ofs = 0
 890     while ofs < len(buf):
 891         z = buf[ofs:].find('\0')
 892         assert(z > 0)
 893         spl = buf[ofs:ofs+z].split(' ', 1)
 894         assert(len(spl) == 2)
 895         sha = buf[ofs+z+1:ofs+z+1+20]
 896         ofs += z+1+20
 897         yield (spl[0], spl[1], sha)
 898
 899
 900 _ver = None
 901 def ver():
 902     """Get Git's version and ensure a usable version is installed.
 903
 904     The returned version is formatted as an ordered tuple with each position
 905     representing a digit in the version tag. For example, the following tuple
 906     would represent version 1.6.6.9:
 907
 908         ('1', '6', '6', '9')
 909     """
 910     global _ver
 911     if not _ver:
 912         p = subprocess.Popen(['git', '--version'],
 913                              stdout=subprocess.PIPE)
 914         gvs = p.stdout.read()
 915         _git_wait('git --version', p)
 916         m = re.match(r'git version (\S+.\S+)', gvs)
 917         if not m:
 918             raise GitError('git --version weird output: %r' % gvs)
 919         _ver = tuple(m.group(1).split('.'))
 920     needed = ('1','5', '3', '1')
 921     if _ver < needed:
 922         raise GitError('git version %s or higher is required; you have %s'
 923                        % ('.'.join(needed), '.'.join(_ver)))
 924     return _ver
 925
 926
 927 def _git_wait(cmd, p):
 928     rv = p.wait()
 929     if rv != 0:
 930         raise GitError('%s returned %d' % (cmd, rv))
 931
 932
 933 def _git_capture(argv):
 934     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 935     r = p.stdout.read()
 936     _git_wait(repr(argv), p)
 937     return r
 938
 939
 940 class _AbortableIter:
 941     def __init__(self, it, onabort = None):
 942         self.it = it
 943         self.onabort = onabort
 944         self.done = None
 945
 946     def __iter__(self):
 947         return self
 948
 949     def next(self):
 950         try:
 951             return self.it.next()
 952         except StopIteration, e:
 953             self.done = True
 954             raise
 955         except:
 956             self.abort()
 957             raise
 958
 959     def abort(self):
 960         """Abort iteration and call the abortion callback, if needed."""
 961         if not self.done:
 962             self.done = True
 963             if self.onabort:
 964                 self.onabort()
 965
 966     def __del__(self):
 967         self.abort()
 968
 969
 970 _ver_warned = 0
 971 class CatPipe:
 972     """Link to 'git cat-file' that is used to retrieve blob data."""
 973     def __init__(self):
 974         global _ver_warned
 975         wanted = ('1','5','6')
 976         if ver() < wanted:
 977             if not _ver_warned:
 978                 log('warning: git version < %s; bup will be slow.\n'
 979                     % '.'.join(wanted))
 980                 _ver_warned = 1
 981             self.get = self._slow_get
 982         else:
 983             self.p = self.inprogress = None
 984             self.get = self._fast_get
 985
 986     def _abort(self):
 987         if self.p:
 988             self.p.stdout.close()
 989             self.p.stdin.close()
 990         self.p = None
 991         self.inprogress = None
 992
 993     def _restart(self):
 994         self._abort()
 995         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
 996                                   stdin=subprocess.PIPE,
 997                                   stdout=subprocess.PIPE,
 998                                   close_fds = True,
 999                                   bufsize = 4096,
1000                                   preexec_fn = _gitenv)
1001
1002     def _fast_get(self, id):
1003         if not self.p or self.p.poll() != None:
1004             self._restart()
1005         assert(self.p)
1006         assert(self.p.poll() == None)
1007         if self.inprogress:
1008             log('_fast_get: opening %r while %r is open'
1009                 % (id, self.inprogress))
1010         assert(not self.inprogress)
1011         assert(id.find('\n') < 0)
1012         assert(id.find('\r') < 0)
1013         assert(not id.startswith('-'))
1014         self.inprogress = id
1015         self.p.stdin.write('%s\n' % id)
1016         self.p.stdin.flush()
1017         hdr = self.p.stdout.readline()
1018         if hdr.endswith(' missing\n'):
1019             self.inprogress = None
1020             raise KeyError('blob %r is missing' % id)
1021         spl = hdr.split(' ')
1022         if len(spl) != 3 or len(spl[0]) != 40:
1023             raise GitError('expected blob, got %r' % spl)
1024         (hex, type, size) = spl
1025
1026         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1027                            onabort = self._abort)
1028         try:
1029             yield type
1030             for blob in it:
1031                 yield blob
1032             assert(self.p.stdout.readline() == '\n')
1033             self.inprogress = None
1034         except Exception, e:
1035             it.abort()
1036             raise
1037
1038     def _slow_get(self, id):
1039         assert(id.find('\n') < 0)
1040         assert(id.find('\r') < 0)
1041         assert(id[0] != '-')
1042         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1043         yield type
1044
1045         p = subprocess.Popen(['git', 'cat-file', type, id],
1046                              stdout=subprocess.PIPE,
1047                              preexec_fn = _gitenv)
1048         for blob in chunkyreader(p.stdout):
1049             yield blob
1050         _git_wait('git cat-file', p)
1051
1052     def _join(self, it):
1053         type = it.next()
1054         if type == 'blob':
1055             for blob in it:
1056                 yield blob
1057         elif type == 'tree':
1058             treefile = ''.join(it)
1059             for (mode, name, sha) in treeparse(treefile):
1060                 for blob in self.join(sha.encode('hex')):
1061                     yield blob
1062         elif type == 'commit':
1063             treeline = ''.join(it).split('\n')[0]
1064             assert(treeline.startswith('tree '))
1065             for blob in self.join(treeline[5:]):
1066                 yield blob
1067         else:
1068             raise GitError('invalid object type %r: expected blob/tree/commit'
1069                            % type)
1070
1071     def join(self, id):
1072         """Generate a list of the content of all blobs that can be reached
1073         from an object.  The hash given in 'id' must point to a blob, a tree
1074         or a commit. The content of all blobs that can be seen from trees or
1075         commits will be added to the list.
1076         """
1077         try:
1078             for d in self._join(self.get(id)):
1079                 yield d
1080         except StopIteration:
1081             log('booger!\n')
1082
1083 def tags():
1084     """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1085     tags = {}
1086     for (n,c) in list_refs():
1087         if n.startswith('refs/tags/'):
1088             name = n[10:]
1089             if not c in tags:
1090                 tags[c] = []
1091
1092             tags[c].append(name)  # more than one tag can point at 'c'
1093
1094     return tags