1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
6 from bup.helpers import *
7 from bup import _helpers, path, midx, bloom, xstat
# Pack rollover limits: PackWriter starts a fresh pack once either is hit.
max_pack_size = 1000*1000*1000 # larger packs will slow down pruning
max_pack_objects = 200*1000 # cache memory usage is about 83 bytes per object

# Git pack object header type codes: object type name -> 3-bit code
# (commit=1, tree=2, blob=3, tag=4), as stored in pack entry headers.
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
# Inverse of _typemap: numeric pack type code -> object type name.
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
23 class GitError(Exception):
28 """Get the path to the git repository or one of its subdirectories."""
31 raise GitError('You should call check_repo_or_die()')
33 # If there's a .git subdirectory, then the actual repo is in there.
34 gd = os.path.join(repodir, '.git')
35 if os.path.exists(gd):
38 return os.path.join(repodir, sub)
42 return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
# NOTE(review): body of a repo-relative path helper; its 'def' line is
# elided from this partial view.
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(''))
    if not fullrepo.endswith('/'):
    # NOTE(review): the branch body above (appending '/') is elided.
    # Strip the repo prefix, and any index-cache prefix, to shorten output.
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith('index-cache/'):
        path = path[len('index-cache/'):]
    return shorten_hash(path)
# NOTE(review): body of all_packdirs(); the 'def' line and the return are
# elided.  Collects the local pack dir plus every index-cache pack dir.
    paths = [repo('objects/pack')]
    paths += glob.glob(repo('index-cache/*/.'))
def auto_midx(objdir):
    # Opportunistically rebuild the .midx and bloom indexes under objdir by
    # shelling out to 'bup midx --auto' and 'bup bloom'.  Failures are
    # recorded via add_error() rather than raised.
    # NOTE(review): the try/except wrappers and the rv checks around both
    # calls are elided from this view; 'e' below is the caught exception.
    # NOTE(review): the open('/dev/null') handles are never closed here —
    # potential fd leak; cannot fix safely without the elided lines.
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    # NOTE(review): the return statements of both branches are elided from
    # this partial view.
    # A regular file stored under a non-regular git mode means it was chunked.
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
    # Name already looks mangled (possibly with one trailing char, e.g. '.bupl').
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name):
    """Undo mangle_name()'s transformation on a file name, if any.

    Returns a (demangled_filename, mode) tuple, where mode is one of:

    * BUP_NORMAL  : the file should be read as-is from the repository
    * BUP_CHUNKED : the file was chunked and must be reassembled

    For more information on the name mangling algorithm, see mangle_name().
    """
    # Order matters: '.bupl' must be tested before its prefix '.bup'.
    for suffix, mode in (('.bupl', BUP_NORMAL), ('.bup', BUP_CHUNKED)):
        if name.endswith(suffix):
            return (name[:-len(suffix)], mode)
    return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes the header '<type> <length>\0' followed by the content.
    header = '%s %d\0' % (type, len(content))
    # NOTE(review): the actual hashing of header+content and the return are
    # elided from this partial view.
def shalist_item_sort_key(ent):
    # Sort key for (mode, name, id) tree entries; git orders directories as
    # though their names ended with '/'.
    # NOTE(review): the returns for both branches are elided from this view.
    (mode, name, id) = ent
    assert(mode+0 == mode)  # mode must be an integer, not a string
    if stat.S_ISDIR(mode):
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    # Git requires entries in its special sorted order (see
    # shalist_item_sort_key).
    shalist = sorted(shalist, key = shalist_item_sort_key)
    for (mode,name,bin) in shalist:
        assert(mode+0 == mode)   # integral mode
        assert(len(bin) == 20)   # binary sha1, not hex
        # Entry format: '<octal mode> <name>\0<20-byte sha>'.
        s = '%o %s\0%s' % (mode,name,bin)
        assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
        # NOTE(review): accumulation of 's' and the final join/return are
        # elided from this partial view.
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    # Each entry is '<octal mode> <name>\0<20-byte binary sha>'.
    # NOTE(review): the ofs initialization/advance and the mode/name unpack
    # from 'spl' are elided from this partial view.
    while ofs < len(buf):
        z = buf.find('\0', ofs)
        spl = buf[ofs:z].split(' ', 1)
        assert(len(spl) == 2)
        sha = buf[z+1:z+1+20]
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    # Encode one pack entry: a varint-style header carrying the 3-bit type
    # code and the uncompressed size, followed by zlib-compressed content.
    # NOTE(review): the size setup, the header varint loop, the header yield
    # and the final z.flush() are elided from this partial view.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)  # low 4 size bits + type code
    if sz: szbits |= 0x80                       # continuation bit
    # Clamp the compression level into zlib's valid 0..9 range.
    if compression_level > 9:
        compression_level = 9
    elif compression_level < 0:
        compression_level = 0
    z = zlib.compressobj(compression_level)
    yield z.compress(content)
def _encode_looseobj(type, content, compression_level=1):
    # Loose-object encoding: one zlib stream over '<type> <len>\0' + content.
    # NOTE(review): the final 'yield z.flush()' is elided from this view.
    z = zlib.compressobj(compression_level)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
def _decode_looseobj(buf):
    # Inverse of _encode_looseobj(): inflate, then split the '<type> <len>\0'
    # header off the content.
    # NOTE(review): the header parsing that binds type/sz/content is elided
    # from this partial view.
    s = zlib.decompress(buf)
    assert(type in _typemap)     # sanity: known object type
    assert(sz == len(content))   # sanity: declared size matches payload
    return (type, content)
def _decode_packobj(buf):
    # Inverse of _encode_packobj(): parse the type/size varint header, then
    # inflate the remainder.
    # NOTE(review): the first-byte read ('c'), the size-varint loop and its
    # termination are elided from this partial view.
    type = _typermap[(c & 0x70) >> 4]   # bits 4..6 of the first byte
    sz |= (c & 0x7f) << shift           # accumulate 7 size bits per byte
    return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        # NOTE(review): the None-check guarding this return is elided from
        # this partial view.
        return self._ofs_from_idx(idx)
237 def exists(self, hash, want_source=False):
238 """Return nonempty if the object exists in this index."""
239 if hash and (self._idx_from_hash(hash) != None):
240 return want_source and os.path.basename(self.name) or True
    # NOTE(review): the 'def __len__(self):' line is elided in this view;
    # the next line is its body — total object count from the fanout table.
        return int(self.fanout[255])

    def _idx_from_hash(self, hash):
        # Binary-search this index for a 20-byte binary sha.
        # NOTE(review): the computation of b1 (first byte of hash), the
        # bisection loop and its returns are elided from this partial view.
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        # fanout[b1-1]..fanout[b1] bounds the search to shas starting with b1.
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
        mid = start + (end-start)/2   # py2 integer division
        v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        # NOTE(review): the self.name assignment is elided from this view.
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # 256 fanout counts: fanout[k] = number of shas with first byte <= k.
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]
        # V1 entries are 24 bytes each: 4-byte offset + 20-byte sha.
        # NOTE(review): the self.sha_ofs assignment is elided from this view.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        # The pack-file offset is the first 4 bytes of entry idx.
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        # The 20-byte sha follows the 4-byte offset within the entry.
        return str(self.shatable[idx*24+4 : idx*24+24])

    # NOTE(review): the 'def __iter__(self):' line is elided in this view;
    # it yields each entry's sha (skipping the 4-byte offset).
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        # NOTE(review): the self.name assignment is elided from this view.
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # V2 magic '\377tOc' followed by version number 2.
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # V2 layout after the fanout: nsha*20 shas, nsha*4 crcs, nsha*4
        # 32-bit offsets, then 8-byte entries for offsets >= 2**31.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        # NOTE(review): the length argument of this buffer() call is elided
        # from this partial view.
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        # High bit set => the real 64-bit offset lives in ofs64table.
        # NOTE(review): the 'if' guard and final return are elided here.
        idx64 = ofs & 0x7fffffff
        ofs = struct.unpack('!Q',
                            str(buffer(self.ofs64table, idx64*8, 8)))[0]

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

    # NOTE(review): the 'def __iter__(self):' line is elided in this view;
    # it yields each 20-byte sha from the sha table.
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
    def __init__(self, dir):
        # Only one PackIdxList may exist at a time; it mmaps a lot of VM.
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        # NOTE(review): several attribute initializations (dir, also, packs,
        # bloom, refcount bump) are elided from this partial view.
        self.do_bloom = False

    # NOTE(review): __del__'s def line is elided; it re-checks the refcount.
        assert(_mpi_count == 0)

    # NOTE(review): __iter__'s def line is elided; merged iteration over
    # every contained pack index.
        return iter(idxmerge(self.packs))

    # NOTE(review): __len__'s def line is elided; total object count.
        return sum(len(pack) for pack in self.packs)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        # NOTE(review): several lines are elided from this view: the early
        # return for self.also, the bloom-negative early return, the binding
        # 'p = self.packs[i]', and the per-pack return/fallthrough.
        global _total_searches
        # Objects promised via add() are known without touching any index.
        if hash in self.also:
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Bloom says "maybe": consult the real packs below, and do
                # not re-consult the bloom for this same lookup.
                self.do_bloom = False
            _total_searches -= 1 # was counted by bloom
        for i in xrange(len(self.packs)):
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            # reorder so most recently used packs are searched first
            self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files from the list.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        # NOTE(review): this view elides several structural lines below
        # (branch headers, loop bodies, midxl setup, early continues); the
        # comments annotate only what is visible.
        self.bloom = None # Always reopen the bloom as it may have been relaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # Keep every index we already have open (unless it's a midx and
        # midxes are being skipped) to avoid re-mmapping files.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            # Map each .idx contained in an open midx to that midx.
            for ix in self.packs:
                if isinstance(ix, midx.PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            for full in glob.glob(os.path.join(self.dir,'*.midx')):
                mx = midx.PackMidx(full)
                (mxd, mxf) = os.path.split(mx.name)
                # A midx is only usable when every .idx it covers exists.
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n' +
                             ' used by %s\n') % (n, mxf))
            # Prefer bigger, then newer midxes so later ones are redundant.
            midxl.sort(key=lambda ix:
                       (-len(ix), -xstat.stat(ix.name).st_mtime))
            for sub in ix.idxnames:
                found = d.get(os.path.join(self.dir, sub))
                if not found or isinstance(found, PackIdx):
                    # doesn't exist, or exists but not in a midx
            for name in ix.idxnames:
                d[os.path.join(self.dir, name)] = ix
            elif not ix.force_keep:
                debug1('midx: removing redundant: %s\n'
                       % os.path.basename(ix.name))
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
        bfull = os.path.join(self.dir, 'bup.bloom')
        if self.bloom is None and os.path.exists(bfull):
            self.bloom = bloom.ShaBloom(bfull)
        self.packs = list(set(d.values()))
        # Biggest indexes first (py2 cmp-style sort).
        self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
        # Only trust the bloom when it covers at least everything we have.
        if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
452 """Insert an additional object in the list."""
def open_idx(filename):
    # Open a pack index by extension: '.idx' (v1 or v2) or '.midx'.
    # NOTE(review): the header read that binds 'header', plus the version
    # test and 'else' lines, are elided — which is why the raise statements
    # below appear (misleadingly) right after a return.
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        if header[0:4] == '\377tOc':   # v2+ magic
            version = struct.unpack('!I', header[4:8])[0]
            return PackIdxV2(filename, f)
            raise GitError('%s: expected idx file version 2, got %d'
                           % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            # No magic: treat as a version-1 index.
            return PackIdxV1(filename, f)
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return midx.PackMidx(filename)
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # NOTE(review): a guard line (presumably testing final_progress) is
        # elided above this call.
        progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                 % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default objcache factory for PackWriter: all local pack indexes."""
    pack_dir = repo('objects/pack')
    return PackIdxList(pack_dir)
493 """Writes Git objects inside a pack file."""
494 def __init__(self, objcache_maker=_make_objcache, compression_level=1):
500 self.objcache_maker = objcache_maker
502 self.compression_level = compression_level
509 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
510 self.file = os.fdopen(fd, 'w+b')
511 assert(name.endswith('.pack'))
512 self.filename = name[:-5]
513 self.file.write('PACK\0\0\0\2\0\0\0\0')
514 self.idx = list(list() for i in xrange(256))
    def _raw_write(self, datalist, sha):
        # NOTE(review): the _open() call, the 'f' binding, the try/except
        # around the write, and the byte/object counters are elided from
        # this partial view.
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        # py2 three-arg raise: re-raise as GitError, keeping the traceback.
        raise GitError, e, sys.exc_info()[2]
        # Mask to keep crc32 positive across py2 platforms.
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
    def _update_idx(self, sha, crc, size):
        # Record this object for the eventual .idx: bucketed by the sha's
        # first byte, with the offset where the object started ('size' bytes
        # before the current file position).
        self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
    def _write(self, sha, type, content):
        # NOTE(review): a verbose-logging guard, the closing 'sha=sha)'
        # argument of _raw_write, the breakpoint() call and the final
        # 'return sha' are elided from this partial view.
        sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
        # Roll over to a fresh pack once this one grows past the limits.
        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        # NOTE(review): the _end()/_open() rollover lines and the return are
        # elided from this partial view.
        self.outbytes = self.count = 0
    def _require_objcache(self):
        # Build the object cache lazily on first use; without one we cannot
        # answer exists() queries.
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            # NOTE(review): the 'raise GitError(' line is elided above this
            # message string in this partial view.
                "PackWriter not opened or can't check exists w/o objcache")
566 def exists(self, id, want_source=False):
567 """Return non-empty if an object is found in the object cache."""
568 self._require_objcache()
569 return self.objcache.exists(id, want_source=want_source)
    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            self._require_objcache()
            # Remember the sha so a later identical object is deduplicated.
            self.objcache.add(sha)
        # NOTE(review): the final 'return sha' is elided from this view.
    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        # Deduplication happens inside maybe_write().
        return self.maybe_write('blob', blob)
    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        # shalist is a sequence of (mode, name, binsha); see tree_encode().
        content = tree_encode(shalist)
        return self.maybe_write('tree', content)
    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        # Build the commit text line by line; tree/parent/author/committer
        # may each be None, in which case that line is omitted.
        # NOTE(review): the 'l = []' initialization and the blank-line/msg
        # appends are elided from this partial view.
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        return self.maybe_write('commit', '\n'.join(l))
    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        # The current user is both author and committer, stamped with 'date'.
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        # NOTE(review): the trailing msg argument and 'return commit' are
        # elided from this partial view.
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
608 """Remove the pack file from disk."""
614 os.unlink(self.filename + '.pack')
    def _end(self, run_midx=True):
        # Finish the current pack: patch the header's object count, append
        # the pack sha1, write the .idx, and rename both into objects/pack.
        # NOTE(review): many lines are elided from this view — the 'f'
        # binding/reset, seeks, sha accumulator setup, the idx flattening,
        # the trailing sha write, and the final return of nameprefix.
        if not f: return None

        # update object count
        cp = struct.pack('!i', self.count)

        # calculate the pack sha1sum
        for b in chunkyreader(f):
        packbin = sum.digest()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)

        # The pack is named after the sha of its sorted object list.
        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        # Guarded by an elided 'if run_midx:' line in the original.
        auto_midx(repo('objects/pack'))
    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        # _end() finalizes and renames the pack; see its comments.
        return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        # Write a version-2 .idx for this pack; the hex digest of the sorted
        # object list (namebase) is used to name the pack files.
        # NOTE(review): several lines are elided from this view — the
        # ofs64_count init and its loop header, large-offset bookkeeping,
        # seeks, the sha chaining of header/packbin, and the final return.
        for entry in section:
            if entry[2] >= 2**31:

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)

        idx_f = open(filename, 'w+b')
        idx_f.truncate(index_len)
        # Fill the mmapped index via the C helper for speed.
        idx_map = mmap_readwrite(idx_f, close=False)
        count = _helpers.write_idx(filename, idx_map, idx, self.count)
        assert(count == self.count)
        if idx_map: idx_map.close()

        # Reopen to append the trailing checksums.
        idx_f = open(filename, 'a+b')
        b = idx_f.read(8 + 4*256)

        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            obj_list_sum.update(b)
        namebase = obj_list_sum.hexdigest()

        for b in chunkyreader(idx_f):
        idx_f.write(idx_sum.digest())
698 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
702 os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    # NOTE(review): appending refname to argv, the nonzero-rv handling, and
    # the empty-output guard are elided from this partial view.
    argv = ['git', 'show-ref', '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    for d in out.split('\n'):
        # Each show-ref line is '<hex sha> <refname>'.
        (sha, name) = d.split(' ', 1)
        yield (name, sha.decode('hex'))
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    # NOTE(review): the single-match check and the return are elided from
    # this partial view.
def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date, commit hash).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    # Refuse refs that could be parsed as git options.
    assert(not ref.startswith('-'))
    # NOTE(review): the opts initialization, the count guard, and the
    # line-by-line parsing loop around the stdout reads are elided from
    # this partial view.
    opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    # 'commit <hex>' lines carry the hash; '%at' lines carry the date.
    if s.startswith('commit '):
        commit = s[7:].decode('hex')
    rv = p.wait() # not fatal
    raise GitError, 'git rev-list returned error %d' % rv
def get_commit_dates(refs):
    """Get the dates for the specified commit refs."""
    # Batch 'git show -s' over the refs (via batchpipe) instead of spawning
    # one subprocess per ref.
    # NOTE(review): the 'result' list initialization and return are elided
    # from this partial view.
    cmd = ['git', 'show', '-s', '--pretty=format:%ct']
    for chunk in batchpipe(cmd, refs, preexec_fn=_gitenv):
        result += [int(x) for x in chunk.splitlines()]
def rev_parse(committish):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # First attempt: resolve as a ref name.
    head = read_ref(committish)
    # NOTE(review): the guards/returns around both resolution attempts are
    # elided from this partial view.
    debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
    # Second attempt: treat a 40-char hex string as a raw sha and check it
    # against the local pack indexes.
    pL = PackIdxList(repo('objects/pack'))
    if len(committish) == 40:
        hash = committish.decode('hex')
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    # Only branch heads may be updated through this helper.
    assert(refname.startswith('refs/heads/'))
    # NOTE(review): normalization of oldval (for first-time refs) is elided
    # from this partial view.
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die() instead.
    """
    # NOTE(review): the 'global repodir' declaration and the early-return /
    # fallback guards between these assignments are elided from this view.
    repodir = os.environ.get('BUP_DIR')
    # Final fallback: the default repository location.
    repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    # NOTE(review): the guess_repo(path) call is elided from this view.
    d = repo() # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
    # Enable the reflog so ref updates leave a recoverable trail.
    p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    # NOTE(review): the guess_repo() call, the try/except around os.stat,
    # and the sys.exit calls are elided from this partial view; 'e' below
    # is the caught OSError.
    os.stat(repo('objects/pack/.'))
    if e.errno == errno.ENOENT:
        log('error: %r is not a bup repository; run "bup init"\n'
    log('error: %s\n' % e)
868 """Get Git's version and ensure a usable version is installed.
870 The returned version is formatted as an ordered tuple with each position
871 representing a digit in the version tag. For example, the following tuple
872 would represent version 1.6.6.9:
878 p = subprocess.Popen(['git', '--version'],
879 stdout=subprocess.PIPE)
880 gvs = p.stdout.read()
881 _git_wait('git --version', p)
882 m = re.match(r'git version (\S+.\S+)', gvs)
884 raise GitError('git --version weird output: %r' % gvs)
885 _ver = tuple(m.group(1).split('.'))
886 needed = ('1','5', '3', '1')
888 raise GitError('git version %s or higher is required; you have %s'
889 % ('.'.join(needed), '.'.join(_ver)))
def _git_wait(cmd, p):
    # Wait for subprocess p and raise GitError on a nonzero exit status.
    # NOTE(review): the 'rv = p.wait()' and the nonzero test are elided
    # from this partial view.
    raise GitError('%s returned %d' % (cmd, rv))
def _git_capture(argv):
    # Run a git command and return its stdout; _git_wait raises on error.
    # NOTE(review): the stdout read and the return are elided from this view.
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    _git_wait(repr(argv), p)
class _AbortableIter:
    # Iterator wrapper that can be abandoned mid-stream; the optional
    # onabort callback lets the producer (e.g. the cat-file pipe) resync.
    def __init__(self, it, onabort = None):
        self.onabort = onabort
        # NOTE(review): storing 'it' and the done flag are elided here.

    # NOTE(review): next()'s def line and its try/abort handling are elided
    # from this partial view.
        return self.it.next()
        except StopIteration, e:

    # NOTE(review): abort()'s def line and body are elided; only its
    # docstring is visible.
        """Abort iteration and call the abortion callback, if needed."""
938 """Link to 'git cat-file' that is used to retrieve blob data."""
941 wanted = ('1','5','6')
944 log('warning: git version < %s; bup will be slow.\n'
947 self.get = self._slow_get
949 self.p = self.inprogress = None
950 self.get = self._fast_get
954 self.p.stdout.close()
957 self.inprogress = None
961 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
962 stdin=subprocess.PIPE,
963 stdout=subprocess.PIPE,
966 preexec_fn = _gitenv)
    def _fast_get(self, id):
        # Fetch one object through the persistent 'git cat-file --batch'
        # pipe.  NOTE(review): several lines are elided from this view —
        # the restart branch body, the header split that binds 'spl', the
        # yields of type and content chunks, and the abort-on-exception
        # handler body.
        if not self.p or self.p.poll() != None:
        poll_result = self.p.poll()
        assert(poll_result == None)   # pipe must be alive before we write
        log('_fast_get: opening %r while %r is open\n'
            % (id, self.inprogress))
        assert(not self.inprogress)   # only one object may stream at a time
        # ids go on a line of their own; reject anything that could confuse
        # the batch protocol or be parsed as an option.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.p.stdin.write('%s\n' % id)
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        # Expected header: '<40-hex sha> <type> <size>'.
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl
        # Stream the body; aborting mid-stream resyncs via self._abort.
        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        # cat-file terminates each object with a newline.
        readline_result = self.p.stdout.readline()
        assert(readline_result == '\n')
        self.inprogress = None
        except Exception, e:
    def _slow_get(self, id):
        # Fallback for old git: one 'git cat-file' subprocess per object.
        # NOTE(review): the yields of the type and of each content chunk
        # are elided from this partial view.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')   # avoid option-like ids
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
        _git_wait('git cat-file', p)
    def _join(self, it):
        # Recursively yield all blob data reachable from one object: blobs
        # yield themselves, trees recurse into each entry, commits recurse
        # into their root tree.
        # NOTE(review): the type/iterator unpack, the blob branch, the
        # per-blob yields, and the closing argument of the final raise are
        # elided from this partial view.
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
        elif type == 'commit':
            # The first line of a commit is 'tree <hex sha>'.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
        raise GitError('invalid object type %r: expected blob/tree/commit'
1040 """Generate a list of the content of all blobs that can be reached
1041 from an object. The hash given in 'id' must point to a blob, a tree
1042 or a commit. The content of all blobs that can be seen from trees or
1043 commits will be added to the list.
1046 for d in self._join(self.get(id)):
1048 except StopIteration:
1052 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1054 for (n,c) in list_refs():
1055 if n.startswith('refs/tags/'):
1060 tags[c].append(name) # more than one tag can point at 'c'