lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path, midx, bloom, xstat
   8
# Thresholds checked by PackWriter._write(): once either one is reached, the
# current pack is finished via PackWriter.breakpoint().
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object

# Nonzero makes PackWriter._write() log a '>' for every object it writes.
verbose = 0
# Nonzero makes PackIdxList.refresh() always act as if skip_midx were True.
ignore_midx = 0
# Repository path; assigned by guess_repo() and read (and possibly redirected
# to a '.git' subdirectory) by repo().
repodir = None

# Object type name -> numeric type code stored in pack object headers by
# _encode_packobj(), and the reverse mapping used by _decode_packobj().
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Statistics counters updated by PackIdx._idx_from_hash() and
# PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  21
  22
  23 class GitError(Exception):
  24     pass
  25
  26
  27 def repo(sub = ''):
  28     """Get the path to the git repository or one of its subdirectories."""
  29     global repodir
  30     if not repodir:
  31         raise GitError('You should call check_repo_or_die()')
  32
  33     # If there's a .git subdirectory, then the actual repo is in there.
  34     gd = os.path.join(repodir, '.git')
  35     if os.path.exists(gd):
  36         repodir = gd
  37
  38     return os.path.join(repodir, sub)
  39
  40
  41 def shorten_hash(s):
  42     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
  43                   r'\1\2*\3', s)
  44
  45
  46 def repo_rel(path):
  47     full = os.path.abspath(path)
  48     fullrepo = os.path.abspath(repo(''))
  49     if not fullrepo.endswith('/'):
  50         fullrepo += '/'
  51     if full.startswith(fullrepo):
  52         path = full[len(fullrepo):]
  53     if path.startswith('index-cache/'):
  54         path = path[len('index-cache/'):]
  55     return shorten_hash(path)
  56
  57
  58 def all_packdirs():
  59     paths = [repo('objects/pack')]
  60     paths += glob.glob(repo('index-cache/*/.'))
  61     return paths
  62
  63
  64 def auto_midx(objdir):
  65     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
  66     try:
  67         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  68     except OSError, e:
  69         # make sure 'args' gets printed to help with debugging
  70         add_error('%r: exception: %s' % (args, e))
  71         raise
  72     if rv:
  73         add_error('%r: returned %d' % (args, rv))
  74
  75     args = [path.exe(), 'bloom', '--dir', objdir]
  76     try:
  77         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  78     except OSError, e:
  79         # make sure 'args' gets printed to help with debugging
  80         add_error('%r: exception: %s' % (args, e))
  81         raise
  82     if rv:
  83         add_error('%r: returned %d' % (args, rv))
  84
  85
  86 def mangle_name(name, mode, gitmode):
  87     """Mangle a file name to present an abstract name for segmented files.
  88     Mangled file names will have the ".bup" extension added to them. If a
  89     file's name already ends with ".bup", a ".bupl" extension is added to
  90     disambiguate normal files from semgmented ones.
  91     """
  92     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
  93         return name + '.bup'
  94     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
  95         return name + '.bupl'
  96     else:
  97         return name
  98
  99
 100 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 101 def demangle_name(name):
 102     """Remove name mangling from a file name, if necessary.
 103
 104     The return value is a tuple (demangled_filename,mode), where mode is one of
 105     the following:
 106
 107     * BUP_NORMAL  : files that should be read as-is from the repository
 108     * BUP_CHUNKED : files that were chunked and need to be assembled
 109
 110     For more information on the name mangling algorythm, see mangle_name()
 111     """
 112     if name.endswith('.bupl'):
 113         return (name[:-5], BUP_NORMAL)
 114     elif name.endswith('.bup'):
 115         return (name[:-4], BUP_CHUNKED)
 116     else:
 117         return (name, BUP_NORMAL)
 118
 119
 120 def calc_hash(type, content):
 121     """Calculate some content's hash in the Git fashion."""
 122     header = '%s %d\0' % (type, len(content))
 123     sum = Sha1(header)
 124     sum.update(content)
 125     return sum.digest()
 126
 127
 128 def shalist_item_sort_key(ent):
 129     (mode, name, id) = ent
 130     assert(mode+0 == mode)
 131     if stat.S_ISDIR(mode):
 132         return name + '/'
 133     else:
 134         return name
 135
 136
 137 def tree_encode(shalist):
 138     """Generate a git tree object from (mode,name,hash) tuples."""
 139     shalist = sorted(shalist, key = shalist_item_sort_key)
 140     l = []
 141     for (mode,name,bin) in shalist:
 142         assert(mode)
 143         assert(mode+0 == mode)
 144         assert(name)
 145         assert(len(bin) == 20)
 146         s = '%o %s\0%s' % (mode,name,bin)
 147         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 148         l.append(s)
 149     return ''.join(l)
 150
 151
 152 def tree_decode(buf):
 153     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 154     ofs = 0
 155     while ofs < len(buf):
 156         z = buf.find('\0', ofs)
 157         assert(z > ofs)
 158         spl = buf[ofs:z].split(' ', 1)
 159         assert(len(spl) == 2)
 160         mode,name = spl
 161         sha = buf[z+1:z+1+20]
 162         ofs = z+1+20
 163         yield (int(mode, 8), name, sha)
 164
 165
 166 def _encode_packobj(type, content, compression_level=1):
 167     szout = ''
 168     sz = len(content)
 169     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 170     sz >>= 4
 171     while 1:
 172         if sz: szbits |= 0x80
 173         szout += chr(szbits)
 174         if not sz:
 175             break
 176         szbits = sz & 0x7f
 177         sz >>= 7
 178     if compression_level > 9:
 179         compression_level = 9
 180     elif compression_level < 0:
 181         compression_level = 0
 182     z = zlib.compressobj(compression_level)
 183     yield szout
 184     yield z.compress(content)
 185     yield z.flush()
 186
 187
 188 def _encode_looseobj(type, content, compression_level=1):
 189     z = zlib.compressobj(compression_level)
 190     yield z.compress('%s %d\0' % (type, len(content)))
 191     yield z.compress(content)
 192     yield z.flush()
 193
 194
 195 def _decode_looseobj(buf):
 196     assert(buf);
 197     s = zlib.decompress(buf)
 198     i = s.find('\0')
 199     assert(i > 0)
 200     l = s[:i].split(' ')
 201     type = l[0]
 202     sz = int(l[1])
 203     content = s[i+1:]
 204     assert(type in _typemap)
 205     assert(sz == len(content))
 206     return (type, content)
 207
 208
 209 def _decode_packobj(buf):
 210     assert(buf)
 211     c = ord(buf[0])
 212     type = _typermap[(c & 0x70) >> 4]
 213     sz = c & 0x0f
 214     shift = 4
 215     i = 0
 216     while c & 0x80:
 217         i += 1
 218         c = ord(buf[i])
 219         sz |= (c & 0x7f) << shift
 220         shift += 7
 221         if not (c & 0x80):
 222             break
 223     return (type, zlib.decompress(buf[i+1:]))
 224
 225
 226 class PackIdx:
 227     def __init__(self):
 228         assert(0)
 229
 230     def find_offset(self, hash):
 231         """Get the offset of an object inside the index file."""
 232         idx = self._idx_from_hash(hash)
 233         if idx != None:
 234             return self._ofs_from_idx(idx)
 235         return None
 236
 237     def exists(self, hash, want_source=False):
 238         """Return nonempty if the object exists in this index."""
 239         if hash and (self._idx_from_hash(hash) != None):
 240             return want_source and os.path.basename(self.name) or True
 241         return None
 242
 243     def __len__(self):
 244         return int(self.fanout[255])
 245
 246     def _idx_from_hash(self, hash):
 247         global _total_searches, _total_steps
 248         _total_searches += 1
 249         assert(len(hash) == 20)
 250         b1 = ord(hash[0])
 251         start = self.fanout[b1-1] # range -1..254
 252         end = self.fanout[b1] # range 0..255
 253         want = str(hash)
 254         _total_steps += 1  # lookup table is a step
 255         while start < end:
 256             _total_steps += 1
 257             mid = start + (end-start)/2
 258             v = self._idx_to_hash(mid)
 259             if v < want:
 260                 start = mid+1
 261             elif v > want:
 262                 end = mid
 263             else: # got it!
 264                 return mid
 265         return None
 266
 267
 268 class PackIdxV1(PackIdx):
 269     """Object representation of a Git pack index (version 1) file."""
 270     def __init__(self, filename, f):
 271         self.name = filename
 272         self.idxnames = [self.name]
 273         self.map = mmap_read(f)
 274         self.fanout = list(struct.unpack('!256I',
 275                                          str(buffer(self.map, 0, 256*4))))
 276         self.fanout.append(0)  # entry "-1"
 277         nsha = self.fanout[255]
 278         self.sha_ofs = 256*4
 279         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 280
 281     def _ofs_from_idx(self, idx):
 282         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 283
 284     def _idx_to_hash(self, idx):
 285         return str(self.shatable[idx*24+4 : idx*24+24])
 286
 287     def __iter__(self):
 288         for i in xrange(self.fanout[255]):
 289             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 290
 291
 292 class PackIdxV2(PackIdx):
 293     """Object representation of a Git pack index (version 2) file."""
 294     def __init__(self, filename, f):
 295         self.name = filename
 296         self.idxnames = [self.name]
 297         self.map = mmap_read(f)
 298         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 299         self.fanout = list(struct.unpack('!256I',
 300                                          str(buffer(self.map, 8, 256*4))))
 301         self.fanout.append(0)  # entry "-1"
 302         nsha = self.fanout[255]
 303         self.sha_ofs = 8 + 256*4
 304         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 305         self.ofstable = buffer(self.map,
 306                                self.sha_ofs + nsha*20 + nsha*4,
 307                                nsha*4)
 308         self.ofs64table = buffer(self.map,
 309                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 310
 311     def _ofs_from_idx(self, idx):
 312         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 313         if ofs & 0x80000000:
 314             idx64 = ofs & 0x7fffffff
 315             ofs = struct.unpack('!Q',
 316                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 317         return ofs
 318
 319     def _idx_to_hash(self, idx):
 320         return str(self.shatable[idx*20:(idx+1)*20])
 321
 322     def __iter__(self):
 323         for i in xrange(self.fanout[255]):
 324             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 325
 326
# Number of live PackIdxList instances; __init__ asserts there is never more
# than one at a time.
_mpi_count = 0
class PackIdxList:
    """A searchable collection of the pack indexes found in one directory.

    Holds the .idx/.midx objects from 'dir' in self.packs, an optional bloom
    filter loaded from 'bup.bloom', and a set of extra hashes (self.also)
    registered through add().
    """
    def __init__(self, dir):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()      # hashes registered via add()
        self.packs = []        # index objects, filled in by refresh()
        self.do_bloom = False  # whether exists() consults the bloom filter
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        # Total number of entries across all loaded indexes.
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Hashes registered through add() give True.  Otherwise each index is
        asked in turn and the first nonempty answer is returned (see the
        indexes' own exists() for the meaning of want_source); None is
        returned when no index has the object.
        """
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # The filter says "maybe": fall through to the real search
                # and stop consulting the filter until a search comes up
                # empty again (see the end of this method).
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # Nothing found: consult the bloom filter (if any) on the next call.
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file path to the index object that covers it; it starts
        # out with the indexes that are already loaded.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # Mark every .idx named by an already-loaded midx as covered
                # by that midx.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open the .midx files that are not loaded yet; one that
                # names a missing .idx file is deleted instead.
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the largest midx first, the newest one first
                # among equally large ones.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        unlink(ix.name)
            # Open every .idx file that nothing in d covers yet; a file that
            # open_idx() rejects is reported and skipped.
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError, e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # One midx can appear under several keys, so deduplicate, then
            # put the largest indexes first.
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # Use the bloom filter only if it is valid and holds at least as
            # many entries as the indexes do.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 454
 455
 456 def open_idx(filename):
 457     if filename.endswith('.idx'):
 458         f = open(filename, 'rb')
 459         header = f.read(8)
 460         if header[0:4] == '\377tOc':
 461             version = struct.unpack('!I', header[4:8])[0]
 462             if version == 2:
 463                 return PackIdxV2(filename, f)
 464             else:
 465                 raise GitError('%s: expected idx file version 2, got %d'
 466                                % (filename, version))
 467         elif len(header) == 8 and header[0:4] < '\377tOc':
 468             return PackIdxV1(filename, f)
 469         else:
 470             raise GitError('%s: unrecognized idx file header' % filename)
 471     elif filename.endswith('.midx'):
 472         return midx.PackMidx(filename)
 473     else:
 474         raise GitError('idx filenames must end with .idx or .midx')
 475
 476
 477 def idxmerge(idxlist, final_progress=True):
 478     """Generate a list of all the objects reachable in a PackIdxList."""
 479     def pfunc(count, total):
 480         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 481                   % (count*100.0/total, count, total))
 482     def pfinal(count, total):
 483         if final_progress:
 484             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 485                      % (100, total, total))
 486     return merge_iter(idxlist, 10024, pfunc, pfinal)
 487
 488
 489 def _make_objcache():
 490     return PackIdxList(repo('objects/pack'))
 491
class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1):
        self.count = 0        # objects written to the current pack
        self.outbytes = 0     # bytes written to the current pack
        self.filename = None  # temporary pack path without the '.pack' suffix
        self.file = None      # open pack file, created lazily by _open()
        self.idx = None       # per-first-byte lists of (sha, crc, offset)
        self.objcache_maker = objcache_maker
        self.objcache = None  # created lazily by _require_objcache()
        self.compression_level = compression_level

    def __del__(self):
        self.close()

    def _open(self):
        """Create the temporary pack file if none is open yet."""
        if not self.file:
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            # Pack header; the last four bytes are a placeholder for the
            # object count that _end() fills in at offset 8.
            self.file.write('PACK\0\0\0\2\0\0\0\0')
            self.idx = list(list() for i in xrange(256))

    def _raw_write(self, datalist, sha):
        """Append the joined 'datalist' to the pack and index it under 'sha'.

        Returns (bytes written, CRC32 of the written data).  An IOError from
        the write is re-raised as GitError with the original traceback.
        """
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError, e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        """Record (sha, crc, offset) for the object that was just written."""
        assert(sha)
        if self.idx:
            # The object has already been written, so it starts 'size' bytes
            # before the current file position.
            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        """Write an object unconditionally and return its id.

        The id is computed with calc_hash() when 'sha' is empty.  The pack is
        finished via breakpoint() once max_pack_size or max_pack_objects is
        reached.
        """
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        """Create the object cache on first use; raise GitError if there is
        no way to create one."""
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            # _write() may have finished the pack, and _end() drops the
            # object cache, so make sure one exists before registering sha.
            self._require_objcache()
            self.objcache.add(sha)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write('tree', content)

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        """Build a commit object from its fields and write it if not present.

        'tree' and 'parent' are binary ids and are hex-encoded; header lines
        whose value is empty are left out.
        """
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')  # blank line between the headers and the message
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        # The same identity and date serve as both author and committer.
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
                                  msg)
        return commit

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            self.idx = None
            self.file = None
            f.close()
            os.unlink(self.filename + '.pack')

    def _end(self, run_midx=True):
        """Finish the current pack, if any, and move it into objects/pack.

        Returns the final path of the pack without its extension, or None if
        no pack was open.  With run_midx set, auto_midx() is run on the pack
        directory afterwards.
        """
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None
        idx = self.idx
        self.idx = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        for b in chunkyreader(f):
            sum.update(b)
        packbin = sum.digest()
        # chunkyreader() read to the end of the file, so the digest is
        # appended as a trailer.
        f.write(packbin)
        f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)

        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        # NOTE(review): nothing in this class creates a '.map' file;
        # presumably a leftover from another tool -- confirm.
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        if run_midx:
            auto_midx(repo('objects/pack'))
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write the index for the finished pack to 'filename'.

        'idx' is the per-first-byte list of (sha, crc, offset) entries and
        'packbin' is the pack's digest.  Returns the hex digest of the 20
        bytes per object that follow the header and fanout table (assumed to
        be the table of object ids written by _helpers.write_idx -- TODO
        confirm), which _end() uses to name the pack.
        """
        # Count the offsets that the size calculation below reserves an
        # 8-byte overflow entry for.
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            idx_map = mmap_readwrite(idx_f, close=False)
            count = _helpers.write_idx(filename, idx_map, idx, self.count)
            assert(count == self.count)
        finally:
            if idx_map: idx_map.close()
            idx_f.close()

        # Reopen in append mode to add the two trailing digests: the pack's
        # digest and then the digest of everything in the index before it.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            return namebase
        finally:
            idx_f.close()
 695
 696
 697 def _git_date(date):
 698     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 699
 700
 701 def _gitenv():
 702     os.environ['GIT_DIR'] = os.path.abspath(repo())
 703
 704
 705 def list_refs(refname = None):
 706     """Generate a list of tuples in the form (refname,hash).
 707     If a ref name is specified, list only this particular ref.
 708     """
 709     argv = ['git', 'show-ref', '--']
 710     if refname:
 711         argv += [refname]
 712     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 713     out = p.stdout.read().strip()
 714     rv = p.wait()  # not fatal
 715     if rv:
 716         assert(not out)
 717     if out:
 718         for d in out.split('\n'):
 719             (sha, name) = d.split(' ', 1)
 720             yield (name, sha.decode('hex'))
 721
 722
 723 def read_ref(refname):
 724     """Get the commit id of the most recent commit made on a given ref."""
 725     l = list(list_refs(refname))
 726     if l:
 727         assert(len(l) == 1)
 728         return l[0][1]
 729     else:
 730         return None
 731
 732
 733 def rev_list(ref, count=None):
 734     """Generate a list of reachable commits in reverse chronological order.
 735
 736     This generator walks through commits, from child to parent, that are
 737     reachable via the specified ref and yields a series of tuples of the form
 738     (date,hash).
 739
 740     If count is a non-zero integer, limit the number of commits to "count"
 741     objects.
 742     """
 743     assert(not ref.startswith('-'))
 744     opts = []
 745     if count:
 746         opts += ['-n', str(atoi(count))]
 747     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 748     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 749     commit = None
 750     for row in p.stdout:
 751         s = row.strip()
 752         if s.startswith('commit '):
 753             commit = s[7:].decode('hex')
 754         else:
 755             date = int(s)
 756             yield (date, commit)
 757     rv = p.wait()  # not fatal
 758     if rv:
 759         raise GitError, 'git rev-list returned error %d' % rv
 760
 761
 762 def get_commit_dates(refs):
 763     """Get the dates for the specified commit refs.  For now, every unique
 764        string in refs must resolve to a different commit or this
 765        function will fail."""
 766     result = []
 767     cmd = ['git', 'show', '-s', '--pretty=format:%ct']
 768     for chunk in batchpipe(cmd, refs, preexec_fn=_gitenv):
 769         result += [int(x) for x in chunk.splitlines()]
 770     if len(result) == len(refs):
 771         return result
 772     # git show suppressed duplicates -- fix it
 773     ref_dates = {}
 774     corrected_result = []
 775     dates = iter(result)
 776     for ref in refs:
 777         prev_date = ref_dates.get(ref)
 778         if prev_date:
 779             corrected_result.append(prev_date)
 780         else:
 781             date = next(dates)
 782             ref_dates[ref] = date
 783             corrected_result.append(date)
 784     assert(next(dates, None) is None)
 785     return corrected_result
 786
 787
 788 def rev_parse(committish):
 789     """Resolve the full hash for 'committish', if it exists.
 790
 791     Should be roughly equivalent to 'git rev-parse'.
 792
 793     Returns the hex value of the hash if it is found, None if 'committish' does
 794     not correspond to anything.
 795     """
 796     head = read_ref(committish)
 797     if head:
 798         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 799         return head
 800
 801     pL = PackIdxList(repo('objects/pack'))
 802
 803     if len(committish) == 40:
 804         try:
 805             hash = committish.decode('hex')
 806         except TypeError:
 807             return None
 808
 809         if pL.exists(hash):
 810             return hash
 811
 812     return None
 813
 814
 815 def update_ref(refname, newval, oldval):
 816     """Change the commit pointed to by a branch."""
 817     if not oldval:
 818         oldval = ''
 819     assert(refname.startswith('refs/heads/'))
 820     p = subprocess.Popen(['git', 'update-ref', refname,
 821                           newval.encode('hex'), oldval.encode('hex')],
 822                          preexec_fn = _gitenv)
 823     _git_wait('git update-ref', p)
 824
 825
 826 def guess_repo(path=None):
 827     """Set the path value in the global variable "repodir".
 828     This makes bup look for an existing bup repository, but not fail if a
 829     repository doesn't exist. Usually, if you are interacting with a bup
 830     repository, you would not be calling this function but using
 831     check_repo_or_die().
 832     """
 833     global repodir
 834     if path:
 835         repodir = path
 836     if not repodir:
 837         repodir = os.environ.get('BUP_DIR')
 838         if not repodir:
 839             repodir = os.path.expanduser('~/.bup')
 840
 841
 842 def init_repo(path=None):
 843     """Create the Git bare repository for bup in a given path."""
 844     guess_repo(path)
 845     d = repo()  # appends a / to the path
 846     parent = os.path.dirname(os.path.dirname(d))
 847     if parent and not os.path.exists(parent):
 848         raise GitError('parent directory "%s" does not exist\n' % parent)
 849     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 850         raise GitError('"%s" exists but is not a directory\n' % d)
 851     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 852                          preexec_fn = _gitenv)
 853     _git_wait('git init', p)
 854     # Force the index version configuration in order to ensure bup works
 855     # regardless of the version of the installed Git binary.
 856     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 857                          stdout=sys.stderr, preexec_fn = _gitenv)
 858     _git_wait('git config', p)
 859     # Enable the reflog
 860     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 861                          stdout=sys.stderr, preexec_fn = _gitenv)
 862     _git_wait('git config', p)
 863
 864
 865 def check_repo_or_die(path=None):
 866     """Make sure a bup repository exists, and abort if not.
 867     If the path to a particular repository was not specified, this function
 868     initializes the default repository automatically.
 869     """
 870     guess_repo(path)
 871     try:
 872         os.stat(repo('objects/pack/.'))
 873     except OSError, e:
 874         if e.errno == errno.ENOENT:
 875             log('error: %r is not a bup repository; run "bup init"\n'
 876                 % repo())
 877             sys.exit(15)
 878         else:
 879             log('error: %s\n' % e)
 880             sys.exit(14)
 881
 882
 883 _ver = None
 884 def ver():
 885     """Get Git's version and ensure a usable version is installed.
 886
 887     The returned version is formatted as an ordered tuple with each position
 888     representing a digit in the version tag. For example, the following tuple
 889     would represent version 1.6.6.9:
 890
 891         ('1', '6', '6', '9')
 892     """
 893     global _ver
 894     if not _ver:
 895         p = subprocess.Popen(['git', '--version'],
 896                              stdout=subprocess.PIPE)
 897         gvs = p.stdout.read()
 898         _git_wait('git --version', p)
 899         m = re.match(r'git version (\S+.\S+)', gvs)
 900         if not m:
 901             raise GitError('git --version weird output: %r' % gvs)
 902         _ver = tuple(m.group(1).split('.'))
 903     needed = ('1','5', '3', '1')
 904     if _ver < needed:
 905         raise GitError('git version %s or higher is required; you have %s'
 906                        % ('.'.join(needed), '.'.join(_ver)))
 907     return _ver
 908
 909
 910 def _git_wait(cmd, p):
 911     rv = p.wait()
 912     if rv != 0:
 913         raise GitError('%s returned %d' % (cmd, rv))
 914
 915
 916 def _git_capture(argv):
 917     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 918     r = p.stdout.read()
 919     _git_wait(repr(argv), p)
 920     return r
 921
 922
 923 class _AbortableIter:
 924     def __init__(self, it, onabort = None):
 925         self.it = it
 926         self.onabort = onabort
 927         self.done = None
 928
 929     def __iter__(self):
 930         return self
 931
 932     def next(self):
 933         try:
 934             return self.it.next()
 935         except StopIteration, e:
 936             self.done = True
 937             raise
 938         except:
 939             self.abort()
 940             raise
 941
 942     def abort(self):
 943         """Abort iteration and call the abortion callback, if needed."""
 944         if not self.done:
 945             self.done = True
 946             if self.onabort:
 947                 self.onabort()
 948
 949     def __del__(self):
 950         self.abort()
 951
 952
 953 _ver_warned = 0
 954 class CatPipe:
 955     """Link to 'git cat-file' that is used to retrieve blob data."""
 956     def __init__(self):
 957         global _ver_warned
 958         wanted = ('1','5','6')
 959         if ver() < wanted:
 960             if not _ver_warned:
 961                 log('warning: git version < %s; bup will be slow.\n'
 962                     % '.'.join(wanted))
 963                 _ver_warned = 1
 964             self.get = self._slow_get
 965         else:
 966             self.p = self.inprogress = None
 967             self.get = self._fast_get
 968
 969     def _abort(self):
 970         if self.p:
 971             self.p.stdout.close()
 972             self.p.stdin.close()
 973         self.p = None
 974         self.inprogress = None
 975
 976     def _restart(self):
 977         self._abort()
 978         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
 979                                   stdin=subprocess.PIPE,
 980                                   stdout=subprocess.PIPE,
 981                                   close_fds = True,
 982                                   bufsize = 4096,
 983                                   preexec_fn = _gitenv)
 984
 985     def _fast_get(self, id):
 986         if not self.p or self.p.poll() != None:
 987             self._restart()
 988         assert(self.p)
 989         poll_result = self.p.poll()
 990         assert(poll_result == None)
 991         if self.inprogress:
 992             log('_fast_get: opening %r while %r is open\n'
 993                 % (id, self.inprogress))
 994         assert(not self.inprogress)
 995         assert(id.find('\n') < 0)
 996         assert(id.find('\r') < 0)
 997         assert(not id.startswith('-'))
 998         self.inprogress = id
 999         self.p.stdin.write('%s\n' % id)
1000         self.p.stdin.flush()
1001         hdr = self.p.stdout.readline()
1002         if hdr.endswith(' missing\n'):
1003             self.inprogress = None
1004             raise KeyError('blob %r is missing' % id)
1005         spl = hdr.split(' ')
1006         if len(spl) != 3 or len(spl[0]) != 40:
1007             raise GitError('expected blob, got %r' % spl)
1008         (hex, type, size) = spl
1009
1010         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1011                            onabort = self._abort)
1012         try:
1013             yield type
1014             for blob in it:
1015                 yield blob
1016             readline_result = self.p.stdout.readline()
1017             assert(readline_result == '\n')
1018             self.inprogress = None
1019         except Exception, e:
1020             it.abort()
1021             raise
1022
1023     def _slow_get(self, id):
1024         assert(id.find('\n') < 0)
1025         assert(id.find('\r') < 0)
1026         assert(id[0] != '-')
1027         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1028         yield type
1029
1030         p = subprocess.Popen(['git', 'cat-file', type, id],
1031                              stdout=subprocess.PIPE,
1032                              preexec_fn = _gitenv)
1033         for blob in chunkyreader(p.stdout):
1034             yield blob
1035         _git_wait('git cat-file', p)
1036
1037     def _join(self, it):
1038         type = it.next()
1039         if type == 'blob':
1040             for blob in it:
1041                 yield blob
1042         elif type == 'tree':
1043             treefile = ''.join(it)
1044             for (mode, name, sha) in tree_decode(treefile):
1045                 for blob in self.join(sha.encode('hex')):
1046                     yield blob
1047         elif type == 'commit':
1048             treeline = ''.join(it).split('\n')[0]
1049             assert(treeline.startswith('tree '))
1050             for blob in self.join(treeline[5:]):
1051                 yield blob
1052         else:
1053             raise GitError('invalid object type %r: expected blob/tree/commit'
1054                            % type)
1055
1056     def join(self, id):
1057         """Generate a list of the content of all blobs that can be reached
1058         from an object.  The hash given in 'id' must point to a blob, a tree
1059         or a commit. The content of all blobs that can be seen from trees or
1060         commits will be added to the list.
1061         """
1062         try:
1063             for d in self._join(self.get(id)):
1064                 yield d
1065         except StopIteration:
1066             log('booger!\n')
1067
1068
_cp = (None, None)

def cp():
    """Return a CatPipe for the current repository, reusing the cached one.

    The cache holds a single (directory, CatPipe) pair keyed by the real
    path of repo(); a new CatPipe replaces it when that path changes.
    """
    global _cp
    here = os.path.realpath(repo())
    (cached_dir, pipe) = _cp
    if cached_dir == here:
        return pipe
    pipe = CatPipe()
    _cp = (here, pipe)
    return pipe
1080
1081
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Only refs under 'refs/tags/' are considered; the prefix is stripped
    from the names.
    """
    prefix = 'refs/tags/'
    by_commit = {}
    for (refname, commit) in list_refs():
        if not refname.startswith(prefix):
            continue
        # setdefault: more than one tag can point at the same commit
        by_commit.setdefault(commit, []).append(refname[len(prefix):])
    return by_commit