lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path, midx, bloom, xstat
   8
   9 max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
  10 max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
  11 SEEK_END=2  # os.SEEK_END is not defined in python 2.4
  12
  13 verbose = 0
  14 ignore_midx = 0
  15 home_repodir = os.path.expanduser('~/.bup')
  16 repodir = None
  17
  18 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  19 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  20
  21 _total_searches = 0
  22 _total_steps = 0
  23
  24
  25 class GitError(Exception):
  26     pass
  27
  28
  29 def repo(sub = ''):
  30     """Get the path to the git repository or one of its subdirectories."""
  31     global repodir
  32     if not repodir:
  33         raise GitError('You should call check_repo_or_die()')
  34
  35     # If there's a .git subdirectory, then the actual repo is in there.
  36     gd = os.path.join(repodir, '.git')
  37     if os.path.exists(gd):
  38         repodir = gd
  39
  40     return os.path.join(repodir, sub)
  41
  42
  43 def shorten_hash(s):
  44     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
  45                   r'\1\2*\3', s)
  46
  47
  48 def repo_rel(path):
  49     full = os.path.abspath(path)
  50     fullrepo = os.path.abspath(repo(''))
  51     if not fullrepo.endswith('/'):
  52         fullrepo += '/'
  53     if full.startswith(fullrepo):
  54         path = full[len(fullrepo):]
  55     if path.startswith('index-cache/'):
  56         path = path[len('index-cache/'):]
  57     return shorten_hash(path)
  58
  59
  60 def all_packdirs():
  61     paths = [repo('objects/pack')]
  62     paths += glob.glob(repo('index-cache/*/.'))
  63     return paths
  64
  65
  66 def auto_midx(objdir):
  67     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
  68     try:
  69         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  70     except OSError, e:
  71         # make sure 'args' gets printed to help with debugging
  72         add_error('%r: exception: %s' % (args, e))
  73         raise
  74     if rv:
  75         add_error('%r: returned %d' % (args, rv))
  76
  77     args = [path.exe(), 'bloom', '--dir', objdir]
  78     try:
  79         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  80     except OSError, e:
  81         # make sure 'args' gets printed to help with debugging
  82         add_error('%r: exception: %s' % (args, e))
  83         raise
  84     if rv:
  85         add_error('%r: returned %d' % (args, rv))
  86
  87
  88 def mangle_name(name, mode, gitmode):
  89     """Mangle a file name to present an abstract name for segmented files.
  90     Mangled file names will have the ".bup" extension added to them. If a
  91     file's name already ends with ".bup", a ".bupl" extension is added to
  92     disambiguate normal files from semgmented ones.
  93     """
  94     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
  95         return name + '.bup'
  96     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
  97         return name + '.bupl'
  98     else:
  99         return name
 100
 101
 102 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 103 def demangle_name(name):
 104     """Remove name mangling from a file name, if necessary.
 105
 106     The return value is a tuple (demangled_filename,mode), where mode is one of
 107     the following:
 108
 109     * BUP_NORMAL  : files that should be read as-is from the repository
 110     * BUP_CHUNKED : files that were chunked and need to be assembled
 111
 112     For more information on the name mangling algorythm, see mangle_name()
 113     """
 114     if name.endswith('.bupl'):
 115         return (name[:-5], BUP_NORMAL)
 116     elif name.endswith('.bup'):
 117         return (name[:-4], BUP_CHUNKED)
 118     else:
 119         return (name, BUP_NORMAL)
 120
 121
 122 def calc_hash(type, content):
 123     """Calculate some content's hash in the Git fashion."""
 124     header = '%s %d\0' % (type, len(content))
 125     sum = Sha1(header)
 126     sum.update(content)
 127     return sum.digest()
 128
 129
 130 def _shalist_sort_key(ent):
 131     (mode, name, id) = ent
 132     assert(mode+0 == mode)
 133     if stat.S_ISDIR(mode):
 134         return name + '/'
 135     else:
 136         return name
 137
 138
 139 def tree_encode(shalist):
 140     """Generate a git tree object from (mode,name,hash) tuples."""
 141     shalist = sorted(shalist, key = _shalist_sort_key)
 142     l = []
 143     for (mode,name,bin) in shalist:
 144         assert(mode)
 145         assert(mode+0 == mode)
 146         assert(name)
 147         assert(len(bin) == 20)
 148         s = '%o %s\0%s' % (mode,name,bin)
 149         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 150         l.append(s)
 151     return ''.join(l)
 152
 153
 154 def tree_decode(buf):
 155     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 156     ofs = 0
 157     while ofs < len(buf):
 158         z = buf[ofs:].find('\0')
 159         assert(z > 0)
 160         spl = buf[ofs:ofs+z].split(' ', 1)
 161         assert(len(spl) == 2)
 162         mode,name = spl
 163         sha = buf[ofs+z+1:ofs+z+1+20]
 164         ofs += z+1+20
 165         yield (int(mode, 8), name, sha)
 166
 167
 168 def _encode_packobj(type, content, compression_level=1):
 169     szout = ''
 170     sz = len(content)
 171     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 172     sz >>= 4
 173     while 1:
 174         if sz: szbits |= 0x80
 175         szout += chr(szbits)
 176         if not sz:
 177             break
 178         szbits = sz & 0x7f
 179         sz >>= 7
 180     if compression_level > 9:
 181         compression_level = 9
 182     elif compression_level < 0:
 183         compression_level = 0
 184     z = zlib.compressobj(compression_level)
 185     yield szout
 186     yield z.compress(content)
 187     yield z.flush()
 188
 189
 190 def _encode_looseobj(type, content, compression_level=1):
 191     z = zlib.compressobj(compression_level)
 192     yield z.compress('%s %d\0' % (type, len(content)))
 193     yield z.compress(content)
 194     yield z.flush()
 195
 196
 197 def _decode_looseobj(buf):
 198     assert(buf);
 199     s = zlib.decompress(buf)
 200     i = s.find('\0')
 201     assert(i > 0)
 202     l = s[:i].split(' ')
 203     type = l[0]
 204     sz = int(l[1])
 205     content = s[i+1:]
 206     assert(type in _typemap)
 207     assert(sz == len(content))
 208     return (type, content)
 209
 210
 211 def _decode_packobj(buf):
 212     assert(buf)
 213     c = ord(buf[0])
 214     type = _typermap[(c & 0x70) >> 4]
 215     sz = c & 0x0f
 216     shift = 4
 217     i = 0
 218     while c & 0x80:
 219         i += 1
 220         c = ord(buf[i])
 221         sz |= (c & 0x7f) << shift
 222         shift += 7
 223         if not (c & 0x80):
 224             break
 225     return (type, zlib.decompress(buf[i+1:]))
 226
 227
 228 class PackIdx:
 229     def __init__(self):
 230         assert(0)
 231
 232     def find_offset(self, hash):
 233         """Get the offset of an object inside the index file."""
 234         idx = self._idx_from_hash(hash)
 235         if idx != None:
 236             return self._ofs_from_idx(idx)
 237         return None
 238
 239     def exists(self, hash, want_source=False):
 240         """Return nonempty if the object exists in this index."""
 241         if hash and (self._idx_from_hash(hash) != None):
 242             return want_source and os.path.basename(self.name) or True
 243         return None
 244
 245     def __len__(self):
 246         return int(self.fanout[255])
 247
 248     def _idx_from_hash(self, hash):
 249         global _total_searches, _total_steps
 250         _total_searches += 1
 251         assert(len(hash) == 20)
 252         b1 = ord(hash[0])
 253         start = self.fanout[b1-1] # range -1..254
 254         end = self.fanout[b1] # range 0..255
 255         want = str(hash)
 256         _total_steps += 1  # lookup table is a step
 257         while start < end:
 258             _total_steps += 1
 259             mid = start + (end-start)/2
 260             v = self._idx_to_hash(mid)
 261             if v < want:
 262                 start = mid+1
 263             elif v > want:
 264                 end = mid
 265             else: # got it!
 266                 return mid
 267         return None
 268
 269
 270 class PackIdxV1(PackIdx):
 271     """Object representation of a Git pack index (version 1) file."""
 272     def __init__(self, filename, f):
 273         self.name = filename
 274         self.idxnames = [self.name]
 275         self.map = mmap_read(f)
 276         self.fanout = list(struct.unpack('!256I',
 277                                          str(buffer(self.map, 0, 256*4))))
 278         self.fanout.append(0)  # entry "-1"
 279         nsha = self.fanout[255]
 280         self.sha_ofs = 256*4
 281         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 282
 283     def _ofs_from_idx(self, idx):
 284         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 285
 286     def _idx_to_hash(self, idx):
 287         return str(self.shatable[idx*24+4 : idx*24+24])
 288
 289     def __iter__(self):
 290         for i in xrange(self.fanout[255]):
 291             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 292
 293
 294 class PackIdxV2(PackIdx):
 295     """Object representation of a Git pack index (version 2) file."""
 296     def __init__(self, filename, f):
 297         self.name = filename
 298         self.idxnames = [self.name]
 299         self.map = mmap_read(f)
 300         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 301         self.fanout = list(struct.unpack('!256I',
 302                                          str(buffer(self.map, 8, 256*4))))
 303         self.fanout.append(0)  # entry "-1"
 304         nsha = self.fanout[255]
 305         self.sha_ofs = 8 + 256*4
 306         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 307         self.ofstable = buffer(self.map,
 308                                self.sha_ofs + nsha*20 + nsha*4,
 309                                nsha*4)
 310         self.ofs64table = buffer(self.map,
 311                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 312
 313     def _ofs_from_idx(self, idx):
 314         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 315         if ofs & 0x80000000:
 316             idx64 = ofs & 0x7fffffff
 317             ofs = struct.unpack('!Q',
 318                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 319         return ofs
 320
 321     def _idx_to_hash(self, idx):
 322         return str(self.shatable[idx*20:(idx+1)*20])
 323
 324     def __iter__(self):
 325         for i in xrange(self.fanout[255]):
 326             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 327
 328
 329 _mpi_count = 0
 330 class PackIdxList:
 331     def __init__(self, dir):
 332         global _mpi_count
 333         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 334         _mpi_count += 1
 335         self.dir = dir
 336         self.also = set()
 337         self.packs = []
 338         self.do_bloom = False
 339         self.bloom = None
 340         self.refresh()
 341
 342     def __del__(self):
 343         global _mpi_count
 344         _mpi_count -= 1
 345         assert(_mpi_count == 0)
 346
 347     def __iter__(self):
 348         return iter(idxmerge(self.packs))
 349
 350     def __len__(self):
 351         return sum(len(pack) for pack in self.packs)
 352
 353     def exists(self, hash, want_source=False):
 354         """Return nonempty if the object exists in the index files."""
 355         global _total_searches
 356         _total_searches += 1
 357         if hash in self.also:
 358             return True
 359         if self.do_bloom and self.bloom:
 360             if self.bloom.exists(hash):
 361                 self.do_bloom = False
 362             else:
 363                 _total_searches -= 1  # was counted by bloom
 364                 return None
 365         for i in xrange(len(self.packs)):
 366             p = self.packs[i]
 367             _total_searches -= 1  # will be incremented by sub-pack
 368             ix = p.exists(hash, want_source=want_source)
 369             if ix:
 370                 # reorder so most recently used packs are searched first
 371                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 372                 return ix
 373         self.do_bloom = True
 374         return None
 375
 376     def refresh(self, skip_midx = False):
 377         """Refresh the index list.
 378         This method verifies if .midx files were superseded (e.g. all of its
 379         contents are in another, bigger .midx file) and removes the superseded
 380         files.
 381
 382         If skip_midx is True, all work on .midx files will be skipped and .midx
 383         files will be removed from the list.
 384
 385         The module-global variable 'ignore_midx' can force this function to
 386         always act as if skip_midx was True.
 387         """
 388         self.bloom = None # Always reopen the bloom as it may have been relaced
 389         self.do_bloom = False
 390         skip_midx = skip_midx or ignore_midx
 391         d = dict((p.name, p) for p in self.packs
 392                  if not skip_midx or not isinstance(p, midx.PackMidx))
 393         if os.path.exists(self.dir):
 394             if not skip_midx:
 395                 midxl = []
 396                 for ix in self.packs:
 397                     if isinstance(ix, midx.PackMidx):
 398                         for name in ix.idxnames:
 399                             d[os.path.join(self.dir, name)] = ix
 400                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 401                     if not d.get(full):
 402                         mx = midx.PackMidx(full)
 403                         (mxd, mxf) = os.path.split(mx.name)
 404                         broken = False
 405                         for n in mx.idxnames:
 406                             if not os.path.exists(os.path.join(mxd, n)):
 407                                 log(('warning: index %s missing\n' +
 408                                     '  used by %s\n') % (n, mxf))
 409                                 broken = True
 410                         if broken:
 411                             del mx
 412                             unlink(full)
 413                         else:
 414                             midxl.append(mx)
 415                 midxl.sort(key=lambda ix:
 416                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 417                 for ix in midxl:
 418                     any_needed = False
 419                     for sub in ix.idxnames:
 420                         found = d.get(os.path.join(self.dir, sub))
 421                         if not found or isinstance(found, PackIdx):
 422                             # doesn't exist, or exists but not in a midx
 423                             any_needed = True
 424                             break
 425                     if any_needed:
 426                         d[ix.name] = ix
 427                         for name in ix.idxnames:
 428                             d[os.path.join(self.dir, name)] = ix
 429                     elif not ix.force_keep:
 430                         debug1('midx: removing redundant: %s\n'
 431                                % os.path.basename(ix.name))
 432                         unlink(ix.name)
 433             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 434                 if not d.get(full):
 435                     try:
 436                         ix = open_idx(full)
 437                     except GitError, e:
 438                         add_error(e)
 439                         continue
 440                     d[full] = ix
 441             bfull = os.path.join(self.dir, 'bup.bloom')
 442             if self.bloom is None and os.path.exists(bfull):
 443                 self.bloom = bloom.ShaBloom(bfull)
 444             self.packs = list(set(d.values()))
 445             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 446             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 447                 self.do_bloom = True
 448             else:
 449                 self.bloom = None
 450         debug1('PackIdxList: using %d index%s.\n'
 451             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 452
 453     def add(self, hash):
 454         """Insert an additional object in the list."""
 455         self.also.add(hash)
 456
 457
 458 def open_idx(filename):
 459     if filename.endswith('.idx'):
 460         f = open(filename, 'rb')
 461         header = f.read(8)
 462         if header[0:4] == '\377tOc':
 463             version = struct.unpack('!I', header[4:8])[0]
 464             if version == 2:
 465                 return PackIdxV2(filename, f)
 466             else:
 467                 raise GitError('%s: expected idx file version 2, got %d'
 468                                % (filename, version))
 469         elif len(header) == 8 and header[0:4] < '\377tOc':
 470             return PackIdxV1(filename, f)
 471         else:
 472             raise GitError('%s: unrecognized idx file header' % filename)
 473     elif filename.endswith('.midx'):
 474         return midx.PackMidx(filename)
 475     else:
 476         raise GitError('idx filenames must end with .idx or .midx')
 477
 478
 479 def idxmerge(idxlist, final_progress=True):
 480     """Generate a list of all the objects reachable in a PackIdxList."""
 481     def pfunc(count, total):
 482         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 483                   % (count*100.0/total, count, total))
 484     def pfinal(count, total):
 485         if final_progress:
 486             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 487                      % (100, total, total))
 488     return merge_iter(idxlist, 10024, pfunc, pfinal)
 489
 490
 491 def _make_objcache():
 492     return PackIdxList(repo('objects/pack'))
 493
 494 class PackWriter:
 495     """Writes Git objects inside a pack file."""
 496     def __init__(self, objcache_maker=_make_objcache, compression_level=1):
 497         self.count = 0
 498         self.outbytes = 0
 499         self.filename = None
 500         self.file = None
 501         self.idx = None
 502         self.objcache_maker = objcache_maker
 503         self.objcache = None
 504         self.compression_level = compression_level
 505
 506     def __del__(self):
 507         self.close()
 508
 509     def _open(self):
 510         if not self.file:
 511             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 512             self.file = os.fdopen(fd, 'w+b')
 513             assert(name.endswith('.pack'))
 514             self.filename = name[:-5]
 515             self.file.write('PACK\0\0\0\2\0\0\0\0')
 516             self.idx = list(list() for i in xrange(256))
 517
 518     def _raw_write(self, datalist, sha):
 519         self._open()
 520         f = self.file
 521         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 522         # the file never has a *partial* blob.  So let's make sure it's
 523         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 524         # to our hashsplit algorithm.)  f.write() does its own buffering,
 525         # but that's okay because we'll flush it in _end().
 526         oneblob = ''.join(datalist)
 527         try:
 528             f.write(oneblob)
 529         except IOError, e:
 530             raise GitError, e, sys.exc_info()[2]
 531         nw = len(oneblob)
 532         crc = zlib.crc32(oneblob) & 0xffffffff
 533         self._update_idx(sha, crc, nw)
 534         self.outbytes += nw
 535         self.count += 1
 536         return nw, crc
 537
 538     def _update_idx(self, sha, crc, size):
 539         assert(sha)
 540         if self.idx:
 541             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 542
 543     def _write(self, sha, type, content):
 544         if verbose:
 545             log('>')
 546         if not sha:
 547             sha = calc_hash(type, content)
 548         size, crc = self._raw_write(_encode_packobj(type, content,
 549                                                     self.compression_level),
 550                                     sha=sha)
 551         if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
 552             self.breakpoint()
 553         return sha
 554
 555     def breakpoint(self):
 556         """Clear byte and object counts and return the last processed id."""
 557         id = self._end()
 558         self.outbytes = self.count = 0
 559         return id
 560
 561     def _require_objcache(self):
 562         if self.objcache is None and self.objcache_maker:
 563             self.objcache = self.objcache_maker()
 564         if self.objcache is None:
 565             raise GitError(
 566                     "PackWriter not opened or can't check exists w/o objcache")
 567
 568     def exists(self, id, want_source=False):
 569         """Return non-empty if an object is found in the object cache."""
 570         self._require_objcache()
 571         return self.objcache.exists(id, want_source=want_source)
 572
 573     def maybe_write(self, type, content):
 574         """Write an object to the pack file if not present and return its id."""
 575         sha = calc_hash(type, content)
 576         if not self.exists(sha):
 577             self._write(sha, type, content)
 578             self._require_objcache()
 579             self.objcache.add(sha)
 580         return sha
 581
 582     def new_blob(self, blob):
 583         """Create a blob object in the pack with the supplied content."""
 584         return self.maybe_write('blob', blob)
 585
 586     def new_tree(self, shalist):
 587         """Create a tree object in the pack."""
 588         content = tree_encode(shalist)
 589         return self.maybe_write('tree', content)
 590
 591     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 592         l = []
 593         if tree: l.append('tree %s' % tree.encode('hex'))
 594         if parent: l.append('parent %s' % parent.encode('hex'))
 595         if author: l.append('author %s %s' % (author, _git_date(adate)))
 596         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 597         l.append('')
 598         l.append(msg)
 599         return self.maybe_write('commit', '\n'.join(l))
 600
 601     def new_commit(self, parent, tree, date, msg):
 602         """Create a commit object in the pack."""
 603         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 604         commit = self._new_commit(tree, parent,
 605                                   userline, date, userline, date,
 606                                   msg)
 607         return commit
 608
 609     def abort(self):
 610         """Remove the pack file from disk."""
 611         f = self.file
 612         if f:
 613             self.idx = None
 614             self.file = None
 615             f.close()
 616             os.unlink(self.filename + '.pack')
 617
 618     def _end(self, run_midx=True):
 619         f = self.file
 620         if not f: return None
 621         self.file = None
 622         self.objcache = None
 623         idx = self.idx
 624         self.idx = None
 625
 626         # update object count
 627         f.seek(8)
 628         cp = struct.pack('!i', self.count)
 629         assert(len(cp) == 4)
 630         f.write(cp)
 631
 632         # calculate the pack sha1sum
 633         f.seek(0)
 634         sum = Sha1()
 635         for b in chunkyreader(f):
 636             sum.update(b)
 637         packbin = sum.digest()
 638         f.write(packbin)
 639         f.close()
 640
 641         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 642
 643         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 644         if os.path.exists(self.filename + '.map'):
 645             os.unlink(self.filename + '.map')
 646         os.rename(self.filename + '.pack', nameprefix + '.pack')
 647         os.rename(self.filename + '.idx', nameprefix + '.idx')
 648
 649         if run_midx:
 650             auto_midx(repo('objects/pack'))
 651         return nameprefix
 652
 653     def close(self, run_midx=True):
 654         """Close the pack file and move it to its definitive path."""
 655         return self._end(run_midx=run_midx)
 656
 657     def _write_pack_idx_v2(self, filename, idx, packbin):
 658         idx_f = open(filename, 'w+b')
 659         idx_f.write('\377tOc\0\0\0\2')
 660
 661         ofs64_ofs = 8 + 4*256 + 28*self.count
 662         idx_f.truncate(ofs64_ofs)
 663         idx_f.seek(0)
 664         idx_map = mmap_readwrite(idx_f, close=False)
 665         idx_f.seek(0, SEEK_END)
 666         count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
 667         assert(count == self.count)
 668         idx_map.close()
 669         idx_f.write(packbin)
 670
 671         idx_f.seek(0)
 672         idx_sum = Sha1()
 673         b = idx_f.read(8 + 4*256)
 674         idx_sum.update(b)
 675
 676         obj_list_sum = Sha1()
 677         for b in chunkyreader(idx_f, 20*self.count):
 678             idx_sum.update(b)
 679             obj_list_sum.update(b)
 680         namebase = obj_list_sum.hexdigest()
 681
 682         for b in chunkyreader(idx_f):
 683             idx_sum.update(b)
 684         idx_f.write(idx_sum.digest())
 685         idx_f.close()
 686
 687         return namebase
 688
 689
 690 def _git_date(date):
 691     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 692
 693
 694 def _gitenv():
 695     os.environ['GIT_DIR'] = os.path.abspath(repo())
 696
 697
 698 def list_refs(refname = None):
 699     """Generate a list of tuples in the form (refname,hash).
 700     If a ref name is specified, list only this particular ref.
 701     """
 702     argv = ['git', 'show-ref', '--']
 703     if refname:
 704         argv += [refname]
 705     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 706     out = p.stdout.read().strip()
 707     rv = p.wait()  # not fatal
 708     if rv:
 709         assert(not out)
 710     if out:
 711         for d in out.split('\n'):
 712             (sha, name) = d.split(' ', 1)
 713             yield (name, sha.decode('hex'))
 714
 715
 716 def read_ref(refname):
 717     """Get the commit id of the most recent commit made on a given ref."""
 718     l = list(list_refs(refname))
 719     if l:
 720         assert(len(l) == 1)
 721         return l[0][1]
 722     else:
 723         return None
 724
 725
 726 def rev_list(ref, count=None):
 727     """Generate a list of reachable commits in reverse chronological order.
 728
 729     This generator walks through commits, from child to parent, that are
 730     reachable via the specified ref and yields a series of tuples of the form
 731     (date,hash).
 732
 733     If count is a non-zero integer, limit the number of commits to "count"
 734     objects.
 735     """
 736     assert(not ref.startswith('-'))
 737     opts = []
 738     if count:
 739         opts += ['-n', str(atoi(count))]
 740     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 741     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 742     commit = None
 743     for row in p.stdout:
 744         s = row.strip()
 745         if s.startswith('commit '):
 746             commit = s[7:].decode('hex')
 747         else:
 748             date = int(s)
 749             yield (date, commit)
 750     rv = p.wait()  # not fatal
 751     if rv:
 752         raise GitError, 'git rev-list returned error %d' % rv
 753
 754
 755 def rev_get_date(ref):
 756     """Get the date of the latest commit on the specified ref."""
 757     for (date, commit) in rev_list(ref, count=1):
 758         return date
 759     raise GitError, 'no such commit %r' % ref
 760
 761
 762 def rev_parse(committish):
 763     """Resolve the full hash for 'committish', if it exists.
 764
 765     Should be roughly equivalent to 'git rev-parse'.
 766
 767     Returns the hex value of the hash if it is found, None if 'committish' does
 768     not correspond to anything.
 769     """
 770     head = read_ref(committish)
 771     if head:
 772         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 773         return head
 774
 775     pL = PackIdxList(repo('objects/pack'))
 776
 777     if len(committish) == 40:
 778         try:
 779             hash = committish.decode('hex')
 780         except TypeError:
 781             return None
 782
 783         if pL.exists(hash):
 784             return hash
 785
 786     return None
 787
 788
 789 def update_ref(refname, newval, oldval):
 790     """Change the commit pointed to by a branch."""
 791     if not oldval:
 792         oldval = ''
 793     assert(refname.startswith('refs/heads/'))
 794     p = subprocess.Popen(['git', 'update-ref', refname,
 795                           newval.encode('hex'), oldval.encode('hex')],
 796                          preexec_fn = _gitenv)
 797     _git_wait('git update-ref', p)
 798
 799
 800 def guess_repo(path=None):
 801     """Set the path value in the global variable "repodir".
 802     This makes bup look for an existing bup repository, but not fail if a
 803     repository doesn't exist. Usually, if you are interacting with a bup
 804     repository, you would not be calling this function but using
 805     check_repo_or_die().
 806     """
 807     global repodir
 808     if path:
 809         repodir = path
 810     if not repodir:
 811         repodir = os.environ.get('BUP_DIR')
 812         if not repodir:
 813             repodir = os.path.expanduser('~/.bup')
 814
 815
 816 def init_repo(path=None):
 817     """Create the Git bare repository for bup in a given path."""
 818     guess_repo(path)
 819     d = repo()  # appends a / to the path
 820     parent = os.path.dirname(os.path.dirname(d))
 821     if parent and not os.path.exists(parent):
 822         raise GitError('parent directory "%s" does not exist\n' % parent)
 823     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 824         raise GitError('"%s" exists but is not a directory\n' % d)
 825     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 826                          preexec_fn = _gitenv)
 827     _git_wait('git init', p)
 828     # Force the index version configuration in order to ensure bup works
 829     # regardless of the version of the installed Git binary.
 830     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 831                          stdout=sys.stderr, preexec_fn = _gitenv)
 832     _git_wait('git config', p)
 833     # Enable the reflog
 834     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 835                          stdout=sys.stderr, preexec_fn = _gitenv)
 836     _git_wait('git config', p)
 837
 838
 839 def check_repo_or_die(path=None):
 840     """Make sure a bup repository exists, and abort if not.
 841     If the path to a particular repository was not specified, this function
 842     initializes the default repository automatically.
 843     """
 844     guess_repo(path)
 845     try:
 846         os.stat(repo('objects/pack/.'))
 847     except OSError, e:
 848         if e.errno == errno.ENOENT:
 849             if repodir != home_repodir:
 850                 log('error: %r is not a bup repository; run "bup init"\n'
 851                     % repo())
 852                 sys.exit(15)
 853             else:
 854                 init_repo()
 855         else:
 856             log('error: %s\n' % e)
 857             sys.exit(14)
 858
 859
 860 _ver = None
 861 def ver():
 862     """Get Git's version and ensure a usable version is installed.
 863
 864     The returned version is formatted as an ordered tuple with each position
 865     representing a digit in the version tag. For example, the following tuple
 866     would represent version 1.6.6.9:
 867
 868         ('1', '6', '6', '9')
 869     """
 870     global _ver
 871     if not _ver:
 872         p = subprocess.Popen(['git', '--version'],
 873                              stdout=subprocess.PIPE)
 874         gvs = p.stdout.read()
 875         _git_wait('git --version', p)
 876         m = re.match(r'git version (\S+.\S+)', gvs)
 877         if not m:
 878             raise GitError('git --version weird output: %r' % gvs)
 879         _ver = tuple(m.group(1).split('.'))
 880     needed = ('1','5', '3', '1')
 881     if _ver < needed:
 882         raise GitError('git version %s or higher is required; you have %s'
 883                        % ('.'.join(needed), '.'.join(_ver)))
 884     return _ver
 885
 886
 887 def _git_wait(cmd, p):
 888     rv = p.wait()
 889     if rv != 0:
 890         raise GitError('%s returned %d' % (cmd, rv))
 891
 892
 893 def _git_capture(argv):
 894     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 895     r = p.stdout.read()
 896     _git_wait(repr(argv), p)
 897     return r
 898
 899
 900 class _AbortableIter:
 901     def __init__(self, it, onabort = None):
 902         self.it = it
 903         self.onabort = onabort
 904         self.done = None
 905
 906     def __iter__(self):
 907         return self
 908
 909     def next(self):
 910         try:
 911             return self.it.next()
 912         except StopIteration, e:
 913             self.done = True
 914             raise
 915         except:
 916             self.abort()
 917             raise
 918
 919     def abort(self):
 920         """Abort iteration and call the abortion callback, if needed."""
 921         if not self.done:
 922             self.done = True
 923             if self.onabort:
 924                 self.onabort()
 925
 926     def __del__(self):
 927         self.abort()
 928
 929
 930 _ver_warned = 0
 931 class CatPipe:
 932     """Link to 'git cat-file' that is used to retrieve blob data."""
 933     def __init__(self):
 934         global _ver_warned
 935         wanted = ('1','5','6')
 936         if ver() < wanted:
 937             if not _ver_warned:
 938                 log('warning: git version < %s; bup will be slow.\n'
 939                     % '.'.join(wanted))
 940                 _ver_warned = 1
 941             self.get = self._slow_get
 942         else:
 943             self.p = self.inprogress = None
 944             self.get = self._fast_get
 945
 946     def _abort(self):
 947         if self.p:
 948             self.p.stdout.close()
 949             self.p.stdin.close()
 950         self.p = None
 951         self.inprogress = None
 952
 953     def _restart(self):
 954         self._abort()
 955         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
 956                                   stdin=subprocess.PIPE,
 957                                   stdout=subprocess.PIPE,
 958                                   close_fds = True,
 959                                   bufsize = 4096,
 960                                   preexec_fn = _gitenv)
 961
 962     def _fast_get(self, id):
 963         if not self.p or self.p.poll() != None:
 964             self._restart()
 965         assert(self.p)
 966         assert(self.p.poll() == None)
 967         if self.inprogress:
 968             log('_fast_get: opening %r while %r is open\n'
 969                 % (id, self.inprogress))
 970         assert(not self.inprogress)
 971         assert(id.find('\n') < 0)
 972         assert(id.find('\r') < 0)
 973         assert(not id.startswith('-'))
 974         self.inprogress = id
 975         self.p.stdin.write('%s\n' % id)
 976         self.p.stdin.flush()
 977         hdr = self.p.stdout.readline()
 978         if hdr.endswith(' missing\n'):
 979             self.inprogress = None
 980             raise KeyError('blob %r is missing' % id)
 981         spl = hdr.split(' ')
 982         if len(spl) != 3 or len(spl[0]) != 40:
 983             raise GitError('expected blob, got %r' % spl)
 984         (hex, type, size) = spl
 985
 986         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
 987                            onabort = self._abort)
 988         try:
 989             yield type
 990             for blob in it:
 991                 yield blob
 992             assert(self.p.stdout.readline() == '\n')
 993             self.inprogress = None
 994         except Exception, e:
 995             it.abort()
 996             raise
 997
 998     def _slow_get(self, id):
 999         assert(id.find('\n') < 0)
1000         assert(id.find('\r') < 0)
1001         assert(id[0] != '-')
1002         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1003         yield type
1004
1005         p = subprocess.Popen(['git', 'cat-file', type, id],
1006                              stdout=subprocess.PIPE,
1007                              preexec_fn = _gitenv)
1008         for blob in chunkyreader(p.stdout):
1009             yield blob
1010         _git_wait('git cat-file', p)
1011
1012     def _join(self, it):
1013         type = it.next()
1014         if type == 'blob':
1015             for blob in it:
1016                 yield blob
1017         elif type == 'tree':
1018             treefile = ''.join(it)
1019             for (mode, name, sha) in tree_decode(treefile):
1020                 for blob in self.join(sha.encode('hex')):
1021                     yield blob
1022         elif type == 'commit':
1023             treeline = ''.join(it).split('\n')[0]
1024             assert(treeline.startswith('tree '))
1025             for blob in self.join(treeline[5:]):
1026                 yield blob
1027         else:
1028             raise GitError('invalid object type %r: expected blob/tree/commit'
1029                            % type)
1030
1031     def join(self, id):
1032         """Generate a list of the content of all blobs that can be reached
1033         from an object.  The hash given in 'id' must point to a blob, a tree
1034         or a commit. The content of all blobs that can be seen from trees or
1035         commits will be added to the list.
1036         """
1037         try:
1038             for d in self._join(self.get(id)):
1039                 yield d
1040         except StopIteration:
1041             log('booger!\n')
1042
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    prefix = 'refs/tags/'
    by_hash = {}
    for (ref, sha) in list_refs():
        if not ref.startswith(prefix):
            continue
        # more than one tag can point at the same hash
        by_hash.setdefault(sha, []).append(ref[len(prefix):])
    return by_hash