lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path, midx, bloom, xstat
   8
# Pack size limits.  PackWriter._write() ends the current pack (via
# breakpoint()) as soon as the bytes or the number of objects written so far
# reaches one of these values.
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object

# When true, PackWriter._write() logs a '>' for every object it writes.
verbose = 0
# When true, PackIdxList.refresh() always behaves as if skip_midx was True.
ignore_midx = 0
# NOTE(review): not referenced anywhere in this part of the file; presumably
# kept for other modules -- confirm before removing.
home_repodir = os.path.expanduser('~/.bup')
# Path of the repository in use; set by guess_repo() and read by repo().
repodir = None

# Git object type name <-> numeric type code, as stored in the first byte of
# a pack object header (see _encode_packobj() / _decode_packobj()).
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Lookup statistics, updated by PackIdx._idx_from_hash() and
# PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  22
  23
  24 class GitError(Exception):
  25     pass
  26
  27
  28 def repo(sub = ''):
  29     """Get the path to the git repository or one of its subdirectories."""
  30     global repodir
  31     if not repodir:
  32         raise GitError('You should call check_repo_or_die()')
  33
  34     # If there's a .git subdirectory, then the actual repo is in there.
  35     gd = os.path.join(repodir, '.git')
  36     if os.path.exists(gd):
  37         repodir = gd
  38
  39     return os.path.join(repodir, sub)
  40
  41
  42 def shorten_hash(s):
  43     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
  44                   r'\1\2*\3', s)
  45
  46
  47 def repo_rel(path):
  48     full = os.path.abspath(path)
  49     fullrepo = os.path.abspath(repo(''))
  50     if not fullrepo.endswith('/'):
  51         fullrepo += '/'
  52     if full.startswith(fullrepo):
  53         path = full[len(fullrepo):]
  54     if path.startswith('index-cache/'):
  55         path = path[len('index-cache/'):]
  56     return shorten_hash(path)
  57
  58
  59 def all_packdirs():
  60     paths = [repo('objects/pack')]
  61     paths += glob.glob(repo('index-cache/*/.'))
  62     return paths
  63
  64
  65 def auto_midx(objdir):
  66     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
  67     try:
  68         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  69     except OSError, e:
  70         # make sure 'args' gets printed to help with debugging
  71         add_error('%r: exception: %s' % (args, e))
  72         raise
  73     if rv:
  74         add_error('%r: returned %d' % (args, rv))
  75
  76     args = [path.exe(), 'bloom', '--dir', objdir]
  77     try:
  78         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  79     except OSError, e:
  80         # make sure 'args' gets printed to help with debugging
  81         add_error('%r: exception: %s' % (args, e))
  82         raise
  83     if rv:
  84         add_error('%r: returned %d' % (args, rv))
  85
  86
  87 def mangle_name(name, mode, gitmode):
  88     """Mangle a file name to present an abstract name for segmented files.
  89     Mangled file names will have the ".bup" extension added to them. If a
  90     file's name already ends with ".bup", a ".bupl" extension is added to
  91     disambiguate normal files from semgmented ones.
  92     """
  93     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
  94         return name + '.bup'
  95     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
  96         return name + '.bupl'
  97     else:
  98         return name
  99
 100
 101 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 102 def demangle_name(name):
 103     """Remove name mangling from a file name, if necessary.
 104
 105     The return value is a tuple (demangled_filename,mode), where mode is one of
 106     the following:
 107
 108     * BUP_NORMAL  : files that should be read as-is from the repository
 109     * BUP_CHUNKED : files that were chunked and need to be assembled
 110
 111     For more information on the name mangling algorythm, see mangle_name()
 112     """
 113     if name.endswith('.bupl'):
 114         return (name[:-5], BUP_NORMAL)
 115     elif name.endswith('.bup'):
 116         return (name[:-4], BUP_CHUNKED)
 117     else:
 118         return (name, BUP_NORMAL)
 119
 120
 121 def calc_hash(type, content):
 122     """Calculate some content's hash in the Git fashion."""
 123     header = '%s %d\0' % (type, len(content))
 124     sum = Sha1(header)
 125     sum.update(content)
 126     return sum.digest()
 127
 128
 129 def shalist_item_sort_key(ent):
 130     (mode, name, id) = ent
 131     assert(mode+0 == mode)
 132     if stat.S_ISDIR(mode):
 133         return name + '/'
 134     else:
 135         return name
 136
 137
 138 def tree_encode(shalist):
 139     """Generate a git tree object from (mode,name,hash) tuples."""
 140     shalist = sorted(shalist, key = shalist_item_sort_key)
 141     l = []
 142     for (mode,name,bin) in shalist:
 143         assert(mode)
 144         assert(mode+0 == mode)
 145         assert(name)
 146         assert(len(bin) == 20)
 147         s = '%o %s\0%s' % (mode,name,bin)
 148         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 149         l.append(s)
 150     return ''.join(l)
 151
 152
 153 def tree_decode(buf):
 154     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 155     ofs = 0
 156     while ofs < len(buf):
 157         z = buf.find('\0', ofs)
 158         assert(z > ofs)
 159         spl = buf[ofs:z].split(' ', 1)
 160         assert(len(spl) == 2)
 161         mode,name = spl
 162         sha = buf[z+1:z+1+20]
 163         ofs = z+1+20
 164         yield (int(mode, 8), name, sha)
 165
 166
 167 def _encode_packobj(type, content, compression_level=1):
 168     szout = ''
 169     sz = len(content)
 170     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 171     sz >>= 4
 172     while 1:
 173         if sz: szbits |= 0x80
 174         szout += chr(szbits)
 175         if not sz:
 176             break
 177         szbits = sz & 0x7f
 178         sz >>= 7
 179     if compression_level > 9:
 180         compression_level = 9
 181     elif compression_level < 0:
 182         compression_level = 0
 183     z = zlib.compressobj(compression_level)
 184     yield szout
 185     yield z.compress(content)
 186     yield z.flush()
 187
 188
 189 def _encode_looseobj(type, content, compression_level=1):
 190     z = zlib.compressobj(compression_level)
 191     yield z.compress('%s %d\0' % (type, len(content)))
 192     yield z.compress(content)
 193     yield z.flush()
 194
 195
 196 def _decode_looseobj(buf):
 197     assert(buf);
 198     s = zlib.decompress(buf)
 199     i = s.find('\0')
 200     assert(i > 0)
 201     l = s[:i].split(' ')
 202     type = l[0]
 203     sz = int(l[1])
 204     content = s[i+1:]
 205     assert(type in _typemap)
 206     assert(sz == len(content))
 207     return (type, content)
 208
 209
 210 def _decode_packobj(buf):
 211     assert(buf)
 212     c = ord(buf[0])
 213     type = _typermap[(c & 0x70) >> 4]
 214     sz = c & 0x0f
 215     shift = 4
 216     i = 0
 217     while c & 0x80:
 218         i += 1
 219         c = ord(buf[i])
 220         sz |= (c & 0x7f) << shift
 221         shift += 7
 222         if not (c & 0x80):
 223             break
 224     return (type, zlib.decompress(buf[i+1:]))
 225
 226
 227 class PackIdx:
 228     def __init__(self):
 229         assert(0)
 230
 231     def find_offset(self, hash):
 232         """Get the offset of an object inside the index file."""
 233         idx = self._idx_from_hash(hash)
 234         if idx != None:
 235             return self._ofs_from_idx(idx)
 236         return None
 237
 238     def exists(self, hash, want_source=False):
 239         """Return nonempty if the object exists in this index."""
 240         if hash and (self._idx_from_hash(hash) != None):
 241             return want_source and os.path.basename(self.name) or True
 242         return None
 243
 244     def __len__(self):
 245         return int(self.fanout[255])
 246
 247     def _idx_from_hash(self, hash):
 248         global _total_searches, _total_steps
 249         _total_searches += 1
 250         assert(len(hash) == 20)
 251         b1 = ord(hash[0])
 252         start = self.fanout[b1-1] # range -1..254
 253         end = self.fanout[b1] # range 0..255
 254         want = str(hash)
 255         _total_steps += 1  # lookup table is a step
 256         while start < end:
 257             _total_steps += 1
 258             mid = start + (end-start)/2
 259             v = self._idx_to_hash(mid)
 260             if v < want:
 261                 start = mid+1
 262             elif v > want:
 263                 end = mid
 264             else: # got it!
 265                 return mid
 266         return None
 267
 268
 269 class PackIdxV1(PackIdx):
 270     """Object representation of a Git pack index (version 1) file."""
 271     def __init__(self, filename, f):
 272         self.name = filename
 273         self.idxnames = [self.name]
 274         self.map = mmap_read(f)
 275         self.fanout = list(struct.unpack('!256I',
 276                                          str(buffer(self.map, 0, 256*4))))
 277         self.fanout.append(0)  # entry "-1"
 278         nsha = self.fanout[255]
 279         self.sha_ofs = 256*4
 280         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 281
 282     def _ofs_from_idx(self, idx):
 283         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 284
 285     def _idx_to_hash(self, idx):
 286         return str(self.shatable[idx*24+4 : idx*24+24])
 287
 288     def __iter__(self):
 289         for i in xrange(self.fanout[255]):
 290             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 291
 292
 293 class PackIdxV2(PackIdx):
 294     """Object representation of a Git pack index (version 2) file."""
 295     def __init__(self, filename, f):
 296         self.name = filename
 297         self.idxnames = [self.name]
 298         self.map = mmap_read(f)
 299         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 300         self.fanout = list(struct.unpack('!256I',
 301                                          str(buffer(self.map, 8, 256*4))))
 302         self.fanout.append(0)  # entry "-1"
 303         nsha = self.fanout[255]
 304         self.sha_ofs = 8 + 256*4
 305         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 306         self.ofstable = buffer(self.map,
 307                                self.sha_ofs + nsha*20 + nsha*4,
 308                                nsha*4)
 309         self.ofs64table = buffer(self.map,
 310                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 311
 312     def _ofs_from_idx(self, idx):
 313         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 314         if ofs & 0x80000000:
 315             idx64 = ofs & 0x7fffffff
 316             ofs = struct.unpack('!Q',
 317                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 318         return ofs
 319
 320     def _idx_to_hash(self, idx):
 321         return str(self.shatable[idx*20:(idx+1)*20])
 322
 323     def __iter__(self):
 324         for i in xrange(self.fanout[255]):
 325             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 326
 327
# Number of PackIdxList instances currently alive; the constructor asserts
# that it is 0 before creating a new one.
_mpi_count = 0
class PackIdxList:
    """Treat all the indexes found in one directory as a single index.

    refresh() fills self.packs from the directory's '*.midx' and '*.idx'
    files and opens the 'bup.bloom' file when there is one.  Hashes
    registered with add() are remembered in the 'also' set and reported as
    existing as well.
    """
    def __init__(self, dir):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the merged contents of all indexes (see idxmerge())."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all loaded indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Hashes registered with add() yield True.  Otherwise the result is
        whatever the matching index's exists() returns (see want_source
        there), or None when no index has the object.
        """
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # possible hit: stop consulting the bloom until a full search
                # of the indexes comes back empty again
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # not found in any index: consult the bloom first on the next call
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file name to the index object that covers it; already
        # loaded indexes are kept (minus the midxes when skipping them)
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # every idx named by an already loaded midx is covered by it
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # open the midx files not known yet; a midx that refers to a
                # missing idx file is deleted from disk
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # consider the biggest midxes first, newest first among
                # equally sized ones
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        unlink(ix.name)
            # open every .idx file that nothing in d covers yet
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError, e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # several names may map to the same index object: deduplicate,
            # then order the indexes from largest to smallest
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # only use a bloom that is valid and at least as long as the
            # indexes put together
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 455
 456
 457 def open_idx(filename):
 458     if filename.endswith('.idx'):
 459         f = open(filename, 'rb')
 460         header = f.read(8)
 461         if header[0:4] == '\377tOc':
 462             version = struct.unpack('!I', header[4:8])[0]
 463             if version == 2:
 464                 return PackIdxV2(filename, f)
 465             else:
 466                 raise GitError('%s: expected idx file version 2, got %d'
 467                                % (filename, version))
 468         elif len(header) == 8 and header[0:4] < '\377tOc':
 469             return PackIdxV1(filename, f)
 470         else:
 471             raise GitError('%s: unrecognized idx file header' % filename)
 472     elif filename.endswith('.midx'):
 473         return midx.PackMidx(filename)
 474     else:
 475         raise GitError('idx filenames must end with .idx or .midx')
 476
 477
 478 def idxmerge(idxlist, final_progress=True):
 479     """Generate a list of all the objects reachable in a PackIdxList."""
 480     def pfunc(count, total):
 481         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 482                   % (count*100.0/total, count, total))
 483     def pfinal(count, total):
 484         if final_progress:
 485             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 486                      % (100, total, total))
 487     return merge_iter(idxlist, 10024, pfunc, pfinal)
 488
 489
 490 def _make_objcache():
 491     return PackIdxList(repo('objects/pack'))
 492
class PackWriter:
    """Writes Git objects inside a pack file.

    Objects are appended to a temporary '.pack' file created under
    repo('objects').  When the pack is ended, a version 2 index is written
    next to it and both files are renamed into 'objects/pack'.
    """
    def __init__(self, objcache_maker=_make_objcache, compression_level=1):
        self.count = 0        # objects written to the current pack
        self.outbytes = 0     # bytes written to the current pack
        self.filename = None  # temporary pack path without the '.pack' suffix
        self.file = None      # open temporary pack file, if any
        self.idx = None       # 256 lists of (sha, crc, offset), by first byte
        self.objcache_maker = objcache_maker
        self.objcache = None  # created on demand by _require_objcache()
        self.compression_level = compression_level

    def __del__(self):
        self.close()

    def _open(self):
        """Create the temporary pack file, unless one is already open."""
        if not self.file:
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            # 12-byte header; the last 4 bytes (the object count) are
            # rewritten by _end()
            self.file.write('PACK\0\0\0\2\0\0\0\0')
            self.idx = list(list() for i in xrange(256))

    def _raw_write(self, datalist, sha):
        """Append the joined datalist to the pack and index it under sha.

        Returns (bytes_written, crc32).  An IOError raised by the write is
        re-raised as GitError with the original traceback.
        """
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError, e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        """Record (sha, crc, offset) for the 'size' bytes just written."""
        assert(sha)
        if self.idx:
            # the object starts 'size' bytes before the current position
            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        """Encode and append one object; return its sha.

        The sha is computed with calc_hash() when it is not given.  The
        current pack is ended once max_pack_size or max_pack_objects is
        reached.
        """
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """End the current pack and reset the byte and object counts.

        Returns what _end() returns: the final path prefix of the finished
        pack, or None when no pack file was open.
        """
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        """Make sure self.objcache is set, creating it if possible.

        Raises GitError when there is neither an objcache nor a maker for
        one.
        """
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            # _write() may have ended the pack, which drops the objcache
            self._require_objcache()
            self.objcache.add(sha)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write('tree', content)

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        """Build the commit text from the given fields and write it.

        'tree' and 'parent' are hex-encoded here; header lines whose value
        is empty are left out.  Returns the id of the commit object.
        """
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack.

        The current user (full name, user name and host name) is used as
        both author and committer, with 'date' for both timestamps.
        """
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
                                  msg)
        return commit

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            self.idx = None
            self.file = None
            f.close()
            os.unlink(self.filename + '.pack')

    def _end(self, run_midx=True):
        """Finish the current pack and move it into 'objects/pack'.

        Returns the final path prefix (without extension) of the pack, or
        None when no pack file was open.  With run_midx, auto_midx() is run
        on 'objects/pack' afterwards.
        """
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None
        idx = self.idx
        self.idx = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        for b in chunkyreader(f):
            sum.update(b)
        packbin = sum.digest()
        f.write(packbin)
        f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)

        # the pack is named after the hash of its sorted object list
        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        # NOTE(review): nothing in this class creates a '.map' file;
        # presumably left behind by some other tool -- confirm.
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        if run_midx:
            auto_midx(repo('objects/pack'))
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write the version 2 index for the pack to 'filename'.

        'idx' is the per-first-byte list of (sha, crc, offset) entries and
        'packbin' the pack checksum.  Returns the hex SHA-1 of the index's
        hash table, which the caller uses to name the pack.
        """
        # offsets of 2**31 and above need an entry in the 64-bit table
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            # size the file first so that it can be mapped and filled in
            idx_f.truncate(index_len)
            idx_map = mmap_readwrite(idx_f, close=False)
            count = _helpers.write_idx(filename, idx_map, idx, self.count)
            assert(count == self.count)
        finally:
            if idx_map: idx_map.close()
            idx_f.close()

        # second pass: append the pack checksum, then the checksum of the
        # whole index, hashing the hash table separately on the way
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            obj_list_sum = Sha1()
            # presumably chunkyreader() yields the next 20*count bytes (the
            # hash table) in pieces -- confirm against bup.helpers
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            return namebase
        finally:
            idx_f.close()
 696
 697
 698 def _git_date(date):
 699     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 700
 701
 702 def _gitenv():
 703     os.environ['GIT_DIR'] = os.path.abspath(repo())
 704
 705
 706 def list_refs(refname = None):
 707     """Generate a list of tuples in the form (refname,hash).
 708     If a ref name is specified, list only this particular ref.
 709     """
 710     argv = ['git', 'show-ref', '--']
 711     if refname:
 712         argv += [refname]
 713     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 714     out = p.stdout.read().strip()
 715     rv = p.wait()  # not fatal
 716     if rv:
 717         assert(not out)
 718     if out:
 719         for d in out.split('\n'):
 720             (sha, name) = d.split(' ', 1)
 721             yield (name, sha.decode('hex'))
 722
 723
 724 def read_ref(refname):
 725     """Get the commit id of the most recent commit made on a given ref."""
 726     l = list(list_refs(refname))
 727     if l:
 728         assert(len(l) == 1)
 729         return l[0][1]
 730     else:
 731         return None
 732
 733
 734 def rev_list(ref, count=None):
 735     """Generate a list of reachable commits in reverse chronological order.
 736
 737     This generator walks through commits, from child to parent, that are
 738     reachable via the specified ref and yields a series of tuples of the form
 739     (date,hash).
 740
 741     If count is a non-zero integer, limit the number of commits to "count"
 742     objects.
 743     """
 744     assert(not ref.startswith('-'))
 745     opts = []
 746     if count:
 747         opts += ['-n', str(atoi(count))]
 748     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 749     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 750     commit = None
 751     for row in p.stdout:
 752         s = row.strip()
 753         if s.startswith('commit '):
 754             commit = s[7:].decode('hex')
 755         else:
 756             date = int(s)
 757             yield (date, commit)
 758     rv = p.wait()  # not fatal
 759     if rv:
 760         raise GitError, 'git rev-list returned error %d' % rv
 761
 762
 763 def rev_get_date(ref):
 764     """Get the date of the latest commit on the specified ref."""
 765     for (date, commit) in rev_list(ref, count=1):
 766         return date
 767     raise GitError, 'no such commit %r' % ref
 768
 769
 770 def rev_parse(committish):
 771     """Resolve the full hash for 'committish', if it exists.
 772
 773     Should be roughly equivalent to 'git rev-parse'.
 774
 775     Returns the hex value of the hash if it is found, None if 'committish' does
 776     not correspond to anything.
 777     """
 778     head = read_ref(committish)
 779     if head:
 780         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 781         return head
 782
 783     pL = PackIdxList(repo('objects/pack'))
 784
 785     if len(committish) == 40:
 786         try:
 787             hash = committish.decode('hex')
 788         except TypeError:
 789             return None
 790
 791         if pL.exists(hash):
 792             return hash
 793
 794     return None
 795
 796
 797 def update_ref(refname, newval, oldval):
 798     """Change the commit pointed to by a branch."""
 799     if not oldval:
 800         oldval = ''
 801     assert(refname.startswith('refs/heads/'))
 802     p = subprocess.Popen(['git', 'update-ref', refname,
 803                           newval.encode('hex'), oldval.encode('hex')],
 804                          preexec_fn = _gitenv)
 805     _git_wait('git update-ref', p)
 806
 807
 808 def guess_repo(path=None):
 809     """Set the path value in the global variable "repodir".
 810     This makes bup look for an existing bup repository, but not fail if a
 811     repository doesn't exist. Usually, if you are interacting with a bup
 812     repository, you would not be calling this function but using
 813     check_repo_or_die().
 814     """
 815     global repodir
 816     if path:
 817         repodir = path
 818     if not repodir:
 819         repodir = os.environ.get('BUP_DIR')
 820         if not repodir:
 821             repodir = os.path.expanduser('~/.bup')
 822
 823
 824 def init_repo(path=None):
 825     """Create the Git bare repository for bup in a given path."""
 826     guess_repo(path)
 827     d = repo()  # appends a / to the path
 828     parent = os.path.dirname(os.path.dirname(d))
 829     if parent and not os.path.exists(parent):
 830         raise GitError('parent directory "%s" does not exist\n' % parent)
 831     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 832         raise GitError('"%s" exists but is not a directory\n' % d)
 833     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 834                          preexec_fn = _gitenv)
 835     _git_wait('git init', p)
 836     # Force the index version configuration in order to ensure bup works
 837     # regardless of the version of the installed Git binary.
 838     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 839                          stdout=sys.stderr, preexec_fn = _gitenv)
 840     _git_wait('git config', p)
 841     # Enable the reflog
 842     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 843                          stdout=sys.stderr, preexec_fn = _gitenv)
 844     _git_wait('git config', p)
 845
 846
 847 def check_repo_or_die(path=None):
 848     """Make sure a bup repository exists, and abort if not.
 849     If the path to a particular repository was not specified, this function
 850     initializes the default repository automatically.
 851     """
 852     guess_repo(path)
 853     try:
 854         os.stat(repo('objects/pack/.'))
 855     except OSError, e:
 856         if e.errno == errno.ENOENT:
 857             log('error: %r is not a bup repository; run "bup init"\n'
 858                 % repo())
 859             sys.exit(15)
 860         else:
 861             log('error: %s\n' % e)
 862             sys.exit(14)
 863
 864
 865 _ver = None
 866 def ver():
 867     """Get Git's version and ensure a usable version is installed.
 868
 869     The returned version is formatted as an ordered tuple with each position
 870     representing a digit in the version tag. For example, the following tuple
 871     would represent version 1.6.6.9:
 872
 873         ('1', '6', '6', '9')
 874     """
 875     global _ver
 876     if not _ver:
 877         p = subprocess.Popen(['git', '--version'],
 878                              stdout=subprocess.PIPE)
 879         gvs = p.stdout.read()
 880         _git_wait('git --version', p)
 881         m = re.match(r'git version (\S+.\S+)', gvs)
 882         if not m:
 883             raise GitError('git --version weird output: %r' % gvs)
 884         _ver = tuple(m.group(1).split('.'))
 885     needed = ('1','5', '3', '1')
 886     if _ver < needed:
 887         raise GitError('git version %s or higher is required; you have %s'
 888                        % ('.'.join(needed), '.'.join(_ver)))
 889     return _ver
 890
 891
 892 def _git_wait(cmd, p):
 893     rv = p.wait()
 894     if rv != 0:
 895         raise GitError('%s returned %d' % (cmd, rv))
 896
 897
 898 def _git_capture(argv):
 899     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 900     r = p.stdout.read()
 901     _git_wait(repr(argv), p)
 902     return r
 903
 904
 905 class _AbortableIter:
 906     def __init__(self, it, onabort = None):
 907         self.it = it
 908         self.onabort = onabort
 909         self.done = None
 910
 911     def __iter__(self):
 912         return self
 913
 914     def next(self):
 915         try:
 916             return self.it.next()
 917         except StopIteration, e:
 918             self.done = True
 919             raise
 920         except:
 921             self.abort()
 922             raise
 923
 924     def abort(self):
 925         """Abort iteration and call the abortion callback, if needed."""
 926         if not self.done:
 927             self.done = True
 928             if self.onabort:
 929                 self.onabort()
 930
 931     def __del__(self):
 932         self.abort()
 933
 934
 935 _ver_warned = 0
 936 class CatPipe:
 937     """Link to 'git cat-file' that is used to retrieve blob data."""
 938     def __init__(self):
 939         global _ver_warned
 940         wanted = ('1','5','6')
 941         if ver() < wanted:
 942             if not _ver_warned:
 943                 log('warning: git version < %s; bup will be slow.\n'
 944                     % '.'.join(wanted))
 945                 _ver_warned = 1
 946             self.get = self._slow_get
 947         else:
 948             self.p = self.inprogress = None
 949             self.get = self._fast_get
 950
 951     def _abort(self):
 952         if self.p:
 953             self.p.stdout.close()
 954             self.p.stdin.close()
 955         self.p = None
 956         self.inprogress = None
 957
 958     def _restart(self):
 959         self._abort()
 960         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
 961                                   stdin=subprocess.PIPE,
 962                                   stdout=subprocess.PIPE,
 963                                   close_fds = True,
 964                                   bufsize = 4096,
 965                                   preexec_fn = _gitenv)
 966
 967     def _fast_get(self, id):
 968         if not self.p or self.p.poll() != None:
 969             self._restart()
 970         assert(self.p)
 971         poll_result = self.p.poll()
 972         assert(poll_result == None)
 973         if self.inprogress:
 974             log('_fast_get: opening %r while %r is open\n'
 975                 % (id, self.inprogress))
 976         assert(not self.inprogress)
 977         assert(id.find('\n') < 0)
 978         assert(id.find('\r') < 0)
 979         assert(not id.startswith('-'))
 980         self.inprogress = id
 981         self.p.stdin.write('%s\n' % id)
 982         self.p.stdin.flush()
 983         hdr = self.p.stdout.readline()
 984         if hdr.endswith(' missing\n'):
 985             self.inprogress = None
 986             raise KeyError('blob %r is missing' % id)
 987         spl = hdr.split(' ')
 988         if len(spl) != 3 or len(spl[0]) != 40:
 989             raise GitError('expected blob, got %r' % spl)
 990         (hex, type, size) = spl
 991
 992         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
 993                            onabort = self._abort)
 994         try:
 995             yield type
 996             for blob in it:
 997                 yield blob
 998             readline_result = self.p.stdout.readline()
 999             assert(readline_result == '\n')
1000             self.inprogress = None
1001         except Exception, e:
1002             it.abort()
1003             raise
1004
1005     def _slow_get(self, id):
1006         assert(id.find('\n') < 0)
1007         assert(id.find('\r') < 0)
1008         assert(id[0] != '-')
1009         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1010         yield type
1011
1012         p = subprocess.Popen(['git', 'cat-file', type, id],
1013                              stdout=subprocess.PIPE,
1014                              preexec_fn = _gitenv)
1015         for blob in chunkyreader(p.stdout):
1016             yield blob
1017         _git_wait('git cat-file', p)
1018
1019     def _join(self, it):
1020         type = it.next()
1021         if type == 'blob':
1022             for blob in it:
1023                 yield blob
1024         elif type == 'tree':
1025             treefile = ''.join(it)
1026             for (mode, name, sha) in tree_decode(treefile):
1027                 for blob in self.join(sha.encode('hex')):
1028                     yield blob
1029         elif type == 'commit':
1030             treeline = ''.join(it).split('\n')[0]
1031             assert(treeline.startswith('tree '))
1032             for blob in self.join(treeline[5:]):
1033                 yield blob
1034         else:
1035             raise GitError('invalid object type %r: expected blob/tree/commit'
1036                            % type)
1037
1038     def join(self, id):
1039         """Generate a list of the content of all blobs that can be reached
1040         from an object.  The hash given in 'id' must point to a blob, a tree
1041         or a commit. The content of all blobs that can be seen from trees or
1042         commits will be added to the list.
1043         """
1044         try:
1045             for d in self._join(self.get(id)):
1046                 yield d
1047         except StopIteration:
1048             log('booger!\n')
1049
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Every ref reported by list_refs() whose name starts with 'refs/tags/' is
    included, with that prefix removed from the name.  Refs outside
    'refs/tags/' are ignored.
    """
    prefix = 'refs/tags/'
    names_by_hash = {}
    for (refname, target) in list_refs():
        if not refname.startswith(prefix):
            continue
        # more than one tag can point at the same target
        names_by_hash.setdefault(target, []).append(refname[len(prefix):])
    return names_by_hash