lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path, midx, bloom
   8
# Thresholds checked by PackWriter._write(); reaching either one ends the
# current pack so that the next object starts a new one.
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
SEEK_END=2  # os.SEEK_END is not defined in python 2.4

# When nonzero, PackWriter._write() logs a '>' for every object written.
verbose = 0
# When nonzero, PackIdxList.refresh() always acts as if skip_midx was True.
ignore_midx = 0
# Default repository; check_repo_or_die() only auto-initializes this one.
home_repodir = os.path.expanduser('~/.bup')
# Repository currently in use; set by guess_repo() and read through repo().
repodir = None

# Object type name <-> numeric type code stored in a pack object header.
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Lookup statistics updated by PackIdx._idx_from_hash() and
# PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  23
  24
  25 class GitError(Exception):
  26     pass
  27
  28
  29 def repo(sub = ''):
  30     """Get the path to the git repository or one of its subdirectories."""
  31     global repodir
  32     if not repodir:
  33         raise GitError('You should call check_repo_or_die()')
  34
  35     # If there's a .git subdirectory, then the actual repo is in there.
  36     gd = os.path.join(repodir, '.git')
  37     if os.path.exists(gd):
  38         repodir = gd
  39
  40     return os.path.join(repodir, sub)
  41
  42
  43 def shorten_hash(s):
  44     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
  45                   r'\1\2*\3', s)
  46
  47
  48 def repo_rel(path):
  49     full = os.path.abspath(path)
  50     fullrepo = os.path.abspath(repo(''))
  51     if not fullrepo.endswith('/'):
  52         fullrepo += '/'
  53     if full.startswith(fullrepo):
  54         path = full[len(fullrepo):]
  55     if path.startswith('index-cache/'):
  56         path = path[len('index-cache/'):]
  57     return shorten_hash(path)
  58
  59
  60 def all_packdirs():
  61     paths = [repo('objects/pack')]
  62     paths += glob.glob(repo('index-cache/*/.'))
  63     return paths
  64
  65
  66 def auto_midx(objdir):
  67     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
  68     try:
  69         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  70     except OSError, e:
  71         # make sure 'args' gets printed to help with debugging
  72         add_error('%r: exception: %s' % (args, e))
  73         raise
  74     if rv:
  75         add_error('%r: returned %d' % (args, rv))
  76
  77     args = [path.exe(), 'bloom', '--dir', objdir]
  78     try:
  79         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  80     except OSError, e:
  81         # make sure 'args' gets printed to help with debugging
  82         add_error('%r: exception: %s' % (args, e))
  83         raise
  84     if rv:
  85         add_error('%r: returned %d' % (args, rv))
  86
  87
  88 def mangle_name(name, mode, gitmode):
  89     """Mangle a file name to present an abstract name for segmented files.
  90     Mangled file names will have the ".bup" extension added to them. If a
  91     file's name already ends with ".bup", a ".bupl" extension is added to
  92     disambiguate normal files from semgmented ones.
  93     """
  94     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
  95         return name + '.bup'
  96     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
  97         return name + '.bupl'
  98     else:
  99         return name
 100
 101
 102 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 103 def demangle_name(name):
 104     """Remove name mangling from a file name, if necessary.
 105
 106     The return value is a tuple (demangled_filename,mode), where mode is one of
 107     the following:
 108
 109     * BUP_NORMAL  : files that should be read as-is from the repository
 110     * BUP_CHUNKED : files that were chunked and need to be assembled
 111
 112     For more information on the name mangling algorythm, see mangle_name()
 113     """
 114     if name.endswith('.bupl'):
 115         return (name[:-5], BUP_NORMAL)
 116     elif name.endswith('.bup'):
 117         return (name[:-4], BUP_CHUNKED)
 118     else:
 119         return (name, BUP_NORMAL)
 120
 121
 122 def calc_hash(type, content):
 123     """Calculate some content's hash in the Git fashion."""
 124     header = '%s %d\0' % (type, len(content))
 125     sum = Sha1(header)
 126     sum.update(content)
 127     return sum.digest()
 128
 129
 130 def _shalist_sort_key(ent):
 131     (mode, name, id) = ent
 132     assert(mode+0 == mode)
 133     if stat.S_ISDIR(mode):
 134         return name + '/'
 135     else:
 136         return name
 137
 138
 139 def tree_encode(shalist):
 140     """Generate a git tree object from (mode,name,hash) tuples."""
 141     shalist = sorted(shalist, key = _shalist_sort_key)
 142     l = []
 143     for (mode,name,bin) in shalist:
 144         assert(mode)
 145         assert(mode+0 == mode)
 146         assert(name)
 147         assert(len(bin) == 20)
 148         s = '%o %s\0%s' % (mode,name,bin)
 149         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 150         l.append(s)
 151     return ''.join(l)
 152
 153
 154 def tree_decode(buf):
 155     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 156     ofs = 0
 157     while ofs < len(buf):
 158         z = buf[ofs:].find('\0')
 159         assert(z > 0)
 160         spl = buf[ofs:ofs+z].split(' ', 1)
 161         assert(len(spl) == 2)
 162         mode,name = spl
 163         sha = buf[ofs+z+1:ofs+z+1+20]
 164         ofs += z+1+20
 165         yield (int(mode, 8), name, sha)
 166
 167
 168 def _encode_packobj(type, content, compression_level=1):
 169     szout = ''
 170     sz = len(content)
 171     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 172     sz >>= 4
 173     while 1:
 174         if sz: szbits |= 0x80
 175         szout += chr(szbits)
 176         if not sz:
 177             break
 178         szbits = sz & 0x7f
 179         sz >>= 7
 180     if compression_level > 9:
 181         compression_level = 9
 182     elif compression_level < 0:
 183         compression_level = 0
 184     z = zlib.compressobj(compression_level)
 185     yield szout
 186     yield z.compress(content)
 187     yield z.flush()
 188
 189
 190 def _encode_looseobj(type, content, compression_level=1):
 191     z = zlib.compressobj(compression_level)
 192     yield z.compress('%s %d\0' % (type, len(content)))
 193     yield z.compress(content)
 194     yield z.flush()
 195
 196
 197 def _decode_looseobj(buf):
 198     assert(buf);
 199     s = zlib.decompress(buf)
 200     i = s.find('\0')
 201     assert(i > 0)
 202     l = s[:i].split(' ')
 203     type = l[0]
 204     sz = int(l[1])
 205     content = s[i+1:]
 206     assert(type in _typemap)
 207     assert(sz == len(content))
 208     return (type, content)
 209
 210
 211 def _decode_packobj(buf):
 212     assert(buf)
 213     c = ord(buf[0])
 214     type = _typermap[(c & 0x70) >> 4]
 215     sz = c & 0x0f
 216     shift = 4
 217     i = 0
 218     while c & 0x80:
 219         i += 1
 220         c = ord(buf[i])
 221         sz |= (c & 0x7f) << shift
 222         shift += 7
 223         if not (c & 0x80):
 224             break
 225     return (type, zlib.decompress(buf[i+1:]))
 226
 227
 228 class PackIdx:
 229     def __init__(self):
 230         assert(0)
 231
 232     def find_offset(self, hash):
 233         """Get the offset of an object inside the index file."""
 234         idx = self._idx_from_hash(hash)
 235         if idx != None:
 236             return self._ofs_from_idx(idx)
 237         return None
 238
 239     def exists(self, hash, want_source=False):
 240         """Return nonempty if the object exists in this index."""
 241         if hash and (self._idx_from_hash(hash) != None):
 242             return want_source and os.path.basename(self.name) or True
 243         return None
 244
 245     def __len__(self):
 246         return int(self.fanout[255])
 247
 248     def _idx_from_hash(self, hash):
 249         global _total_searches, _total_steps
 250         _total_searches += 1
 251         assert(len(hash) == 20)
 252         b1 = ord(hash[0])
 253         start = self.fanout[b1-1] # range -1..254
 254         end = self.fanout[b1] # range 0..255
 255         want = str(hash)
 256         _total_steps += 1  # lookup table is a step
 257         while start < end:
 258             _total_steps += 1
 259             mid = start + (end-start)/2
 260             v = self._idx_to_hash(mid)
 261             if v < want:
 262                 start = mid+1
 263             elif v > want:
 264                 end = mid
 265             else: # got it!
 266                 return mid
 267         return None
 268
 269
 270 class PackIdxV1(PackIdx):
 271     """Object representation of a Git pack index (version 1) file."""
 272     def __init__(self, filename, f):
 273         self.name = filename
 274         self.idxnames = [self.name]
 275         self.map = mmap_read(f)
 276         self.fanout = list(struct.unpack('!256I',
 277                                          str(buffer(self.map, 0, 256*4))))
 278         self.fanout.append(0)  # entry "-1"
 279         nsha = self.fanout[255]
 280         self.sha_ofs = 256*4
 281         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 282
 283     def _ofs_from_idx(self, idx):
 284         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 285
 286     def _idx_to_hash(self, idx):
 287         return str(self.shatable[idx*24+4 : idx*24+24])
 288
 289     def __iter__(self):
 290         for i in xrange(self.fanout[255]):
 291             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 292
 293
 294 class PackIdxV2(PackIdx):
 295     """Object representation of a Git pack index (version 2) file."""
 296     def __init__(self, filename, f):
 297         self.name = filename
 298         self.idxnames = [self.name]
 299         self.map = mmap_read(f)
 300         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 301         self.fanout = list(struct.unpack('!256I',
 302                                          str(buffer(self.map, 8, 256*4))))
 303         self.fanout.append(0)  # entry "-1"
 304         nsha = self.fanout[255]
 305         self.sha_ofs = 8 + 256*4
 306         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 307         self.ofstable = buffer(self.map,
 308                                self.sha_ofs + nsha*20 + nsha*4,
 309                                nsha*4)
 310         self.ofs64table = buffer(self.map,
 311                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 312
 313     def _ofs_from_idx(self, idx):
 314         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 315         if ofs & 0x80000000:
 316             idx64 = ofs & 0x7fffffff
 317             ofs = struct.unpack('!Q',
 318                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 319         return ofs
 320
 321     def _idx_to_hash(self, idx):
 322         return str(self.shatable[idx*20:(idx+1)*20])
 323
 324     def __iter__(self):
 325         for i in xrange(self.fanout[255]):
 326             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 327
 328
# Number of live PackIdxList instances; the class asserts that there is
# never more than one.
_mpi_count = 0
class PackIdxList:
    """Combined view over the index files found in one directory.

    The list holds .idx and .midx index objects (self.packs), an optional
    bloom filter loaded from 'bup.bloom' (self.bloom) and a set of extra
    hashes registered through add() (self.also).
    """
    def __init__(self, dir):
        """Register the single instance and load the indexes found in dir."""
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.refresh()

    def __del__(self):
        """Release the single-instance slot taken in __init__."""
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the result of idxmerge() on all loaded indexes."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all loaded indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Hashes registered with add() are reported as True.  Otherwise the
        value returned by the first index that knows the hash is returned,
        and None when no index does.
        """
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Stop consulting the bloom filter; it is switched back on
                # below once a search through all the packs comes up empty.
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file path to the index object that covers it.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # Every idx named by an already loaded midx is covered by it.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open the midx files not loaded yet; one that refers to a
                # missing idx file is deleted from disk.
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the biggest (then the most recent) midx first.
                midxl.sort(key=lambda ix:
                           (-len(ix), -os.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        unlink(ix.name)
            # Open the .idx files that nothing covers yet; one that cannot
            # be opened is reported with add_error() and skipped.
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError, e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # Several paths can map to the same midx; keep each index once,
            # biggest first.
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # Only use the bloom filter when it is valid and has at least
            # as many entries as the indexes hold.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 456
 457
 458 def open_idx(filename):
 459     if filename.endswith('.idx'):
 460         f = open(filename, 'rb')
 461         header = f.read(8)
 462         if header[0:4] == '\377tOc':
 463             version = struct.unpack('!I', header[4:8])[0]
 464             if version == 2:
 465                 return PackIdxV2(filename, f)
 466             else:
 467                 raise GitError('%s: expected idx file version 2, got %d'
 468                                % (filename, version))
 469         elif len(header) == 8 and header[0:4] < '\377tOc':
 470             return PackIdxV1(filename, f)
 471         else:
 472             raise GitError('%s: unrecognized idx file header' % filename)
 473     elif filename.endswith('.midx'):
 474         return midx.PackMidx(filename)
 475     else:
 476         raise GitError('idx filenames must end with .idx or .midx')
 477
 478
 479 def idxmerge(idxlist, final_progress=True):
 480     """Generate a list of all the objects reachable in a PackIdxList."""
 481     def pfunc(count, total):
 482         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 483                   % (count*100.0/total, count, total))
 484     def pfinal(count, total):
 485         if final_progress:
 486             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 487                      % (100, total, total))
 488     return merge_iter(idxlist, 10024, pfunc, pfinal)
 489
 490
 491 def _make_objcache():
 492     return PackIdxList(repo('objects/pack'))
 493
class PackWriter:
    """Writes Git objects inside a pack file.

    Objects are appended to a temporary pack file in the repository's
    'objects' directory.  When the pack is ended, its index is written and
    both files are renamed into 'objects/pack'.
    """
    def __init__(self, objcache_maker=_make_objcache, compression_level=1):
        """Set up an idle writer; no file is created until the first write.

        objcache_maker is called on demand to build the object cache used
        by exists(); compression_level is passed to _encode_packobj().
        """
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.file = None
        self.idx = None
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level

    def __del__(self):
        self.close()

    def _open(self):
        """Create the temporary pack file if there is none yet."""
        if not self.file:
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            # 'PACK', version 2, and a 4-byte object count that _end()
            # overwrites with the real value.
            self.file.write('PACK\0\0\0\2\0\0\0\0')
            # One list of (sha, crc, offset) entries per first hash byte.
            self.idx = list(list() for i in xrange(256))

    def _raw_write(self, datalist, sha):
        """Append the joined datalist to the pack and index it under sha.

        Returns (number of bytes written, crc32 of those bytes).  An IOError
        from the write is re-raised as GitError with the same traceback.
        """
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError, e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        """Record an object of 'size' bytes that was just written."""
        assert(sha)
        if self.idx:
            # The object starts 'size' bytes before the current position.
            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        """Write one object unconditionally and return its hash.

        The hash is computed when sha is empty.  Reaching max_pack_size or
        max_pack_objects ends the current pack.
        """
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """End the current pack and clear the byte and object counts.

        Returns what _end() returns: the path prefix of the finished pack,
        or None when no pack was open.
        """
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        """Build the object cache if needed; raise GitError if impossible."""
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            # _write() may have ended the pack, which drops the cache.
            self._require_objcache()
            self.objcache.add(sha)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write('tree', content)

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        """Create a commit object from explicit fields and return its id.

        tree and parent are binary hashes; empty fields are left out of the
        commit header.
        """
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        # The current user is both the author and the committer.
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
                                  msg)
        return commit

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            self.idx = None
            self.file = None
            f.close()
            os.unlink(self.filename + '.pack')

    def _end(self, run_midx=True):
        """Finish the current pack and move it into 'objects/pack'.

        Returns the path prefix (without extension) of the final pack and
        idx files, or None when no pack was open.  Unless run_midx is
        false, auto_midx() is run on the pack directory afterwards.
        """
        f = self.file
        if not f: return None
        self.file = None
        # Drop the cache; _require_objcache() rebuilds it when needed.
        self.objcache = None
        idx = self.idx
        self.idx = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        for b in chunkyreader(f):
            sum.update(b)
        packbin = sum.digest()
        f.write(packbin)
        f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)

        # The final name is derived from the hash over the object list.
        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        if run_midx:
            auto_midx(repo('objects/pack'))
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write the version 2 index of the pack to filename.

        idx is the per-first-byte list of entries collected while writing
        and packbin the digest of the pack file.  Returns the hex digest
        computed over the object hash table, which names the pack.
        """
        idx_f = open(filename, 'w+b')
        idx_f.write('\377tOc\0\0\0\2')

        # Header, fanout table and 28 bytes per object come first; whatever
        # is written after that starts at ofs64_ofs.
        ofs64_ofs = 8 + 4*256 + 28*self.count
        idx_f.truncate(ofs64_ofs)
        idx_f.seek(0)
        idx_map = mmap_readwrite(idx_f, close=False)
        idx_f.seek(0, SEEK_END)
        count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
        assert(count == self.count)
        idx_map.close()
        idx_f.write(packbin)

        # Read the file back: idx_sum covers everything written so far,
        # obj_list_sum only the 20*count bytes following the fanout table.
        idx_f.seek(0)
        idx_sum = Sha1()
        b = idx_f.read(8 + 4*256)
        idx_sum.update(b)

        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            idx_sum.update(b)
            obj_list_sum.update(b)
        namebase = obj_list_sum.hexdigest()

        for b in chunkyreader(idx_f):
            idx_sum.update(b)
        idx_f.write(idx_sum.digest())
        idx_f.close()

        return namebase
 688
 689
 690 def _git_date(date):
 691     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 692
 693
 694 def _gitenv():
 695     os.environ['GIT_DIR'] = os.path.abspath(repo())
 696
 697
 698 def list_refs(refname = None):
 699     """Generate a list of tuples in the form (refname,hash).
 700     If a ref name is specified, list only this particular ref.
 701     """
 702     argv = ['git', 'show-ref', '--']
 703     if refname:
 704         argv += [refname]
 705     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 706     out = p.stdout.read().strip()
 707     rv = p.wait()  # not fatal
 708     if rv:
 709         assert(not out)
 710     if out:
 711         for d in out.split('\n'):
 712             (sha, name) = d.split(' ', 1)
 713             yield (name, sha.decode('hex'))
 714
 715
 716 def read_ref(refname):
 717     """Get the commit id of the most recent commit made on a given ref."""
 718     l = list(list_refs(refname))
 719     if l:
 720         assert(len(l) == 1)
 721         return l[0][1]
 722     else:
 723         return None
 724
 725
 726 def rev_list(ref, count=None):
 727     """Generate a list of reachable commits in reverse chronological order.
 728
 729     This generator walks through commits, from child to parent, that are
 730     reachable via the specified ref and yields a series of tuples of the form
 731     (date,hash).
 732
 733     If count is a non-zero integer, limit the number of commits to "count"
 734     objects.
 735     """
 736     assert(not ref.startswith('-'))
 737     opts = []
 738     if count:
 739         opts += ['-n', str(atoi(count))]
 740     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 741     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 742     commit = None
 743     for row in p.stdout:
 744         s = row.strip()
 745         if s.startswith('commit '):
 746             commit = s[7:].decode('hex')
 747         else:
 748             date = int(s)
 749             yield (date, commit)
 750     rv = p.wait()  # not fatal
 751     if rv:
 752         raise GitError, 'git rev-list returned error %d' % rv
 753
 754
 755 def rev_get_date(ref):
 756     """Get the date of the latest commit on the specified ref."""
 757     for (date, commit) in rev_list(ref, count=1):
 758         return date
 759     raise GitError, 'no such commit %r' % ref
 760
 761
 762 def rev_parse(committish):
 763     """Resolve the full hash for 'committish', if it exists.
 764
 765     Should be roughly equivalent to 'git rev-parse'.
 766
 767     Returns the hex value of the hash if it is found, None if 'committish' does
 768     not correspond to anything.
 769     """
 770     head = read_ref(committish)
 771     if head:
 772         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 773         return head
 774
 775     pL = PackIdxList(repo('objects/pack'))
 776
 777     if len(committish) == 40:
 778         try:
 779             hash = committish.decode('hex')
 780         except TypeError:
 781             return None
 782
 783         if pL.exists(hash):
 784             return hash
 785
 786     return None
 787
 788
 789 def update_ref(refname, newval, oldval):
 790     """Change the commit pointed to by a branch."""
 791     if not oldval:
 792         oldval = ''
 793     assert(refname.startswith('refs/heads/'))
 794     p = subprocess.Popen(['git', 'update-ref', refname,
 795                           newval.encode('hex'), oldval.encode('hex')],
 796                          preexec_fn = _gitenv)
 797     _git_wait('git update-ref', p)
 798
 799
 800 def guess_repo(path=None):
 801     """Set the path value in the global variable "repodir".
 802     This makes bup look for an existing bup repository, but not fail if a
 803     repository doesn't exist. Usually, if you are interacting with a bup
 804     repository, you would not be calling this function but using
 805     check_repo_or_die().
 806     """
 807     global repodir
 808     if path:
 809         repodir = path
 810     if not repodir:
 811         repodir = os.environ.get('BUP_DIR')
 812         if not repodir:
 813             repodir = os.path.expanduser('~/.bup')
 814
 815
 816 def init_repo(path=None):
 817     """Create the Git bare repository for bup in a given path."""
 818     guess_repo(path)
 819     d = repo()  # appends a / to the path
 820     parent = os.path.dirname(os.path.dirname(d))
 821     if parent and not os.path.exists(parent):
 822         raise GitError('parent directory "%s" does not exist\n' % parent)
 823     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 824         raise GitError('"%d" exists but is not a directory\n' % d)
 825     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 826                          preexec_fn = _gitenv)
 827     _git_wait('git init', p)
 828     # Force the index version configuration in order to ensure bup works
 829     # regardless of the version of the installed Git binary.
 830     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 831                          stdout=sys.stderr, preexec_fn = _gitenv)
 832     _git_wait('git config', p)
 833
 834
 835 def check_repo_or_die(path=None):
 836     """Make sure a bup repository exists, and abort if not.
 837     If the path to a particular repository was not specified, this function
 838     initializes the default repository automatically.
 839     """
 840     guess_repo(path)
 841     try:
 842         os.stat(repo('objects/pack/.'))
 843     except OSError, e:
 844         if e.errno == errno.ENOENT:
 845             if repodir != home_repodir:
 846                 log('error: %r is not a bup repository; run "bup init"\n'
 847                     % repo())
 848                 sys.exit(15)
 849             else:
 850                 init_repo()
 851         else:
 852             log('error: %s\n' % e)
 853             sys.exit(14)
 854
 855
 856 _ver = None
 857 def ver():
 858     """Get Git's version and ensure a usable version is installed.
 859
 860     The returned version is formatted as an ordered tuple with each position
 861     representing a digit in the version tag. For example, the following tuple
 862     would represent version 1.6.6.9:
 863
 864         ('1', '6', '6', '9')
 865     """
 866     global _ver
 867     if not _ver:
 868         p = subprocess.Popen(['git', '--version'],
 869                              stdout=subprocess.PIPE)
 870         gvs = p.stdout.read()
 871         _git_wait('git --version', p)
 872         m = re.match(r'git version (\S+.\S+)', gvs)
 873         if not m:
 874             raise GitError('git --version weird output: %r' % gvs)
 875         _ver = tuple(m.group(1).split('.'))
 876     needed = ('1','5', '3', '1')
 877     if _ver < needed:
 878         raise GitError('git version %s or higher is required; you have %s'
 879                        % ('.'.join(needed), '.'.join(_ver)))
 880     return _ver
 881
 882
 883 def _git_wait(cmd, p):
 884     rv = p.wait()
 885     if rv != 0:
 886         raise GitError('%s returned %d' % (cmd, rv))
 887
 888
 889 def _git_capture(argv):
 890     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 891     r = p.stdout.read()
 892     _git_wait(repr(argv), p)
 893     return r
 894
 895
 896 class _AbortableIter:
 897     def __init__(self, it, onabort = None):
 898         self.it = it
 899         self.onabort = onabort
 900         self.done = None
 901
 902     def __iter__(self):
 903         return self
 904
 905     def next(self):
 906         try:
 907             return self.it.next()
 908         except StopIteration, e:
 909             self.done = True
 910             raise
 911         except:
 912             self.abort()
 913             raise
 914
 915     def abort(self):
 916         """Abort iteration and call the abortion callback, if needed."""
 917         if not self.done:
 918             self.done = True
 919             if self.onabort:
 920                 self.onabort()
 921
 922     def __del__(self):
 923         self.abort()
 924
 925
 926 _ver_warned = 0
 927 class CatPipe:
 928     """Link to 'git cat-file' that is used to retrieve blob data."""
 929     def __init__(self):
 930         global _ver_warned
 931         wanted = ('1','5','6')
 932         if ver() < wanted:
 933             if not _ver_warned:
 934                 log('warning: git version < %s; bup will be slow.\n'
 935                     % '.'.join(wanted))
 936                 _ver_warned = 1
 937             self.get = self._slow_get
 938         else:
 939             self.p = self.inprogress = None
 940             self.get = self._fast_get
 941
 942     def _abort(self):
 943         if self.p:
 944             self.p.stdout.close()
 945             self.p.stdin.close()
 946         self.p = None
 947         self.inprogress = None
 948
 949     def _restart(self):
 950         self._abort()
 951         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
 952                                   stdin=subprocess.PIPE,
 953                                   stdout=subprocess.PIPE,
 954                                   close_fds = True,
 955                                   bufsize = 4096,
 956                                   preexec_fn = _gitenv)
 957
 958     def _fast_get(self, id):
 959         if not self.p or self.p.poll() != None:
 960             self._restart()
 961         assert(self.p)
 962         assert(self.p.poll() == None)
 963         if self.inprogress:
 964             log('_fast_get: opening %r while %r is open'
 965                 % (id, self.inprogress))
 966         assert(not self.inprogress)
 967         assert(id.find('\n') < 0)
 968         assert(id.find('\r') < 0)
 969         assert(not id.startswith('-'))
 970         self.inprogress = id
 971         self.p.stdin.write('%s\n' % id)
 972         self.p.stdin.flush()
 973         hdr = self.p.stdout.readline()
 974         if hdr.endswith(' missing\n'):
 975             self.inprogress = None
 976             raise KeyError('blob %r is missing' % id)
 977         spl = hdr.split(' ')
 978         if len(spl) != 3 or len(spl[0]) != 40:
 979             raise GitError('expected blob, got %r' % spl)
 980         (hex, type, size) = spl
 981
 982         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
 983                            onabort = self._abort)
 984         try:
 985             yield type
 986             for blob in it:
 987                 yield blob
 988             assert(self.p.stdout.readline() == '\n')
 989             self.inprogress = None
 990         except Exception, e:
 991             it.abort()
 992             raise
 993
 994     def _slow_get(self, id):
 995         assert(id.find('\n') < 0)
 996         assert(id.find('\r') < 0)
 997         assert(id[0] != '-')
 998         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
 999         yield type
1000
1001         p = subprocess.Popen(['git', 'cat-file', type, id],
1002                              stdout=subprocess.PIPE,
1003                              preexec_fn = _gitenv)
1004         for blob in chunkyreader(p.stdout):
1005             yield blob
1006         _git_wait('git cat-file', p)
1007
1008     def _join(self, it):
1009         type = it.next()
1010         if type == 'blob':
1011             for blob in it:
1012                 yield blob
1013         elif type == 'tree':
1014             treefile = ''.join(it)
1015             for (mode, name, sha) in tree_decode(treefile):
1016                 for blob in self.join(sha.encode('hex')):
1017                     yield blob
1018         elif type == 'commit':
1019             treeline = ''.join(it).split('\n')[0]
1020             assert(treeline.startswith('tree '))
1021             for blob in self.join(treeline[5:]):
1022                 yield blob
1023         else:
1024             raise GitError('invalid object type %r: expected blob/tree/commit'
1025                            % type)
1026
1027     def join(self, id):
1028         """Generate a list of the content of all blobs that can be reached
1029         from an object.  The hash given in 'id' must point to a blob, a tree
1030         or a commit. The content of all blobs that can be seen from trees or
1031         commits will be added to the list.
1032         """
1033         try:
1034             for d in self._join(self.get(id)):
1035                 yield d
1036         except StopIteration:
1037             log('booger!\n')
1038
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Only refs under refs/tags/ (as reported by list_refs()) are included,
    with that prefix stripped from the names.
    """
    prefix = 'refs/tags/'
    by_hash = {}
    for (ref, sha) in list_refs():
        if not ref.startswith(prefix):
            continue
        # Several tags may point at the same hash, so collect them in a list.
        by_hash.setdefault(sha, []).append(ref[len(prefix):])
    return by_hash