lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, heapq
   6 from bup.helpers import *
   7 from bup import _helpers, path
   8
   9 MIDX_VERSION = 2
  10
  11 verbose = 0
  12 ignore_midx = 0
  13 home_repodir = os.path.expanduser('~/.bup')
  14 repodir = None
  15
  16 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  17 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  18
  19 _total_searches = 0
  20 _total_steps = 0
  21
  22
  23 class GitError(Exception):
  24     pass
  25
  26
  27 def repo(sub = ''):
  28     """Get the path to the git repository or one of its subdirectories."""
  29     global repodir
  30     if not repodir:
  31         raise GitError('You should call check_repo_or_die()')
  32
  33     # If there's a .git subdirectory, then the actual repo is in there.
  34     gd = os.path.join(repodir, '.git')
  35     if os.path.exists(gd):
  36         repodir = gd
  37
  38     return os.path.join(repodir, sub)
  39
  40
  41 def auto_midx(objdir):
  42     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
  43     try:
  44         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  45     except OSError, e:
  46         # make sure 'args' gets printed to help with debugging
  47         add_error('%r: exception: %s' % (args, e))
  48         raise
  49     if rv:
  50         add_error('%r: returned %d' % (args, rv))
  51
  52
  53 def mangle_name(name, mode, gitmode):
  54     """Mangle a file name to present an abstract name for segmented files.
  55     Mangled file names will have the ".bup" extension added to them. If a
  56     file's name already ends with ".bup", a ".bupl" extension is added to
  57     disambiguate normal files from semgmented ones.
  58     """
  59     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
  60         return name + '.bup'
  61     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
  62         return name + '.bupl'
  63     else:
  64         return name
  65
  66
  67 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
  68 def demangle_name(name):
  69     """Remove name mangling from a file name, if necessary.
  70
  71     The return value is a tuple (demangled_filename,mode), where mode is one of
  72     the following:
  73
  74     * BUP_NORMAL  : files that should be read as-is from the repository
  75     * BUP_CHUNKED : files that were chunked and need to be assembled
  76
  77     For more information on the name mangling algorythm, see mangle_name()
  78     """
  79     if name.endswith('.bupl'):
  80         return (name[:-5], BUP_NORMAL)
  81     elif name.endswith('.bup'):
  82         return (name[:-4], BUP_CHUNKED)
  83     else:
  84         return (name, BUP_NORMAL)
  85
  86
  87 def _encode_packobj(type, content):
  88     szout = ''
  89     sz = len(content)
  90     szbits = (sz & 0x0f) | (_typemap[type]<<4)
  91     sz >>= 4
  92     while 1:
  93         if sz: szbits |= 0x80
  94         szout += chr(szbits)
  95         if not sz:
  96             break
  97         szbits = sz & 0x7f
  98         sz >>= 7
  99     z = zlib.compressobj(1)
 100     yield szout
 101     yield z.compress(content)
 102     yield z.flush()
 103
 104
 105 def _encode_looseobj(type, content):
 106     z = zlib.compressobj(1)
 107     yield z.compress('%s %d\0' % (type, len(content)))
 108     yield z.compress(content)
 109     yield z.flush()
 110
 111
 112 def _decode_looseobj(buf):
 113     assert(buf);
 114     s = zlib.decompress(buf)
 115     i = s.find('\0')
 116     assert(i > 0)
 117     l = s[:i].split(' ')
 118     type = l[0]
 119     sz = int(l[1])
 120     content = s[i+1:]
 121     assert(type in _typemap)
 122     assert(sz == len(content))
 123     return (type, content)
 124
 125
 126 def _decode_packobj(buf):
 127     assert(buf)
 128     c = ord(buf[0])
 129     type = _typermap[(c & 0x70) >> 4]
 130     sz = c & 0x0f
 131     shift = 4
 132     i = 0
 133     while c & 0x80:
 134         i += 1
 135         c = ord(buf[i])
 136         sz |= (c & 0x7f) << shift
 137         shift += 7
 138         if not (c & 0x80):
 139             break
 140     return (type, zlib.decompress(buf[i+1:]))
 141
 142
 143 class PackIdx:
 144     def __init__(self):
 145         assert(0)
 146
 147     def find_offset(self, hash):
 148         """Get the offset of an object inside the index file."""
 149         idx = self._idx_from_hash(hash)
 150         if idx != None:
 151             return self._ofs_from_idx(idx)
 152         return None
 153
 154     def exists(self, hash):
 155         """Return nonempty if the object exists in this index."""
 156         return hash and (self._idx_from_hash(hash) != None) and True or None
 157
 158     def __len__(self):
 159         return int(self.fanout[255])
 160
 161     def _idx_from_hash(self, hash):
 162         global _total_searches, _total_steps
 163         _total_searches += 1
 164         assert(len(hash) == 20)
 165         b1 = ord(hash[0])
 166         start = self.fanout[b1-1] # range -1..254
 167         end = self.fanout[b1] # range 0..255
 168         want = str(hash)
 169         _total_steps += 1  # lookup table is a step
 170         while start < end:
 171             _total_steps += 1
 172             mid = start + (end-start)/2
 173             v = self._idx_to_hash(mid)
 174             if v < want:
 175                 start = mid+1
 176             elif v > want:
 177                 end = mid
 178             else: # got it!
 179                 return mid
 180         return None
 181
 182
 183 class PackIdxV1(PackIdx):
 184     """Object representation of a Git pack index (version 1) file."""
 185     def __init__(self, filename, f):
 186         self.name = filename
 187         self.idxnames = [self.name]
 188         self.map = mmap_read(f)
 189         self.fanout = list(struct.unpack('!256I',
 190                                          str(buffer(self.map, 0, 256*4))))
 191         self.fanout.append(0)  # entry "-1"
 192         nsha = self.fanout[255]
 193         self.shatable = buffer(self.map, 256*4, nsha*24)
 194
 195     def _ofs_from_idx(self, idx):
 196         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 197
 198     def _idx_to_hash(self, idx):
 199         return str(self.shatable[idx*24+4 : idx*24+24])
 200
 201     def __iter__(self):
 202         for i in xrange(self.fanout[255]):
 203             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 204
 205
 206 class PackIdxV2(PackIdx):
 207     """Object representation of a Git pack index (version 2) file."""
 208     def __init__(self, filename, f):
 209         self.name = filename
 210         self.idxnames = [self.name]
 211         self.map = mmap_read(f)
 212         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 213         self.fanout = list(struct.unpack('!256I',
 214                                          str(buffer(self.map, 8, 256*4))))
 215         self.fanout.append(0)  # entry "-1"
 216         nsha = self.fanout[255]
 217         self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
 218         self.ofstable = buffer(self.map,
 219                                8 + 256*4 + nsha*20 + nsha*4,
 220                                nsha*4)
 221         self.ofs64table = buffer(self.map,
 222                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 223
 224     def _ofs_from_idx(self, idx):
 225         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 226         if ofs & 0x80000000:
 227             idx64 = ofs & 0x7fffffff
 228             ofs = struct.unpack('!Q',
 229                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 230         return ofs
 231
 232     def _idx_to_hash(self, idx):
 233         return str(self.shatable[idx*20:(idx+1)*20])
 234
 235     def __iter__(self):
 236         for i in xrange(self.fanout[255]):
 237             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 238
 239
 240 extract_bits = _helpers.extract_bits
 241
 242
 243 class PackMidx:
 244     """Wrapper which contains data from multiple index files.
 245     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 246     and make it possible for bup to expand Git's indexing capabilities to vast
 247     amounts of files.
 248     """
 249     def __init__(self, filename):
 250         self.name = filename
 251         self.force_keep = False
 252         assert(filename.endswith('.midx'))
 253         self.map = mmap_read(open(filename))
 254         if str(self.map[0:4]) != 'MIDX':
 255             log('Warning: skipping: invalid MIDX header in %r\n' % filename)
 256             self.force_keep = True
 257             return self._init_failed()
 258         ver = struct.unpack('!I', self.map[4:8])[0]
 259         if ver < MIDX_VERSION:
 260             log('Warning: ignoring old-style (v%d) midx %r\n'
 261                 % (ver, filename))
 262             self.force_keep = False  # old stuff is boring
 263             return self._init_failed()
 264         if ver > MIDX_VERSION:
 265             log('Warning: ignoring too-new (v%d) midx %r\n'
 266                 % (ver, filename))
 267             self.force_keep = True  # new stuff is exciting
 268             return self._init_failed()
 269
 270         self.bits = _helpers.firstword(self.map[8:12])
 271         self.entries = 2**self.bits
 272         self.fanout = buffer(self.map, 12, self.entries*4)
 273         shaofs = 12 + self.entries*4
 274         nsha = self._fanget(self.entries-1)
 275         self.shalist = buffer(self.map, shaofs, nsha*20)
 276         self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
 277
 278     def _init_failed(self):
 279         self.bits = 0
 280         self.entries = 1
 281         self.fanout = buffer('\0\0\0\0')
 282         self.shalist = buffer('\0'*20)
 283         self.idxnames = []
 284
 285     def _fanget(self, i):
 286         start = i*4
 287         s = self.fanout[start:start+4]
 288         return _helpers.firstword(s)
 289
 290     def _get(self, i):
 291         return str(self.shalist[i*20:(i+1)*20])
 292
 293     def exists(self, hash):
 294         """Return nonempty if the object exists in the index files."""
 295         global _total_searches, _total_steps
 296         _total_searches += 1
 297         want = str(hash)
 298         el = extract_bits(want, self.bits)
 299         if el:
 300             start = self._fanget(el-1)
 301             startv = el << (32-self.bits)
 302         else:
 303             start = 0
 304             startv = 0
 305         end = self._fanget(el)
 306         endv = (el+1) << (32-self.bits)
 307         _total_steps += 1   # lookup table is a step
 308         hashv = _helpers.firstword(hash)
 309         #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
 310         while start < end:
 311             _total_steps += 1
 312             #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
 313             mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
 314             #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
 315             v = self._get(mid)
 316             #print '    %08x' % self._num(v)
 317             if v < want:
 318                 start = mid+1
 319                 startv = _helpers.firstword(v)
 320             elif v > want:
 321                 end = mid
 322                 endv = _helpers.firstword(v)
 323             else: # got it!
 324                 return True
 325         return None
 326
 327     def __iter__(self):
 328         for i in xrange(self._fanget(self.entries-1)):
 329             yield buffer(self.shalist, i*20, 20)
 330
 331     def __len__(self):
 332         return int(self._fanget(self.entries-1))
 333
 334
 335 _mpi_count = 0
 336 class PackIdxList:
 337     def __init__(self, dir):
 338         global _mpi_count
 339         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 340         _mpi_count += 1
 341         self.dir = dir
 342         self.also = {}
 343         self.packs = []
 344         self.refresh()
 345
 346     def __del__(self):
 347         global _mpi_count
 348         _mpi_count -= 1
 349         assert(_mpi_count == 0)
 350
 351     def __iter__(self):
 352         return iter(idxmerge(self.packs))
 353
 354     def __len__(self):
 355         return sum(len(pack) for pack in self.packs)
 356
 357     def exists(self, hash):
 358         """Return nonempty if the object exists in the index files."""
 359         global _total_searches
 360         _total_searches += 1
 361         if hash in self.also:
 362             return True
 363         for i in range(len(self.packs)):
 364             p = self.packs[i]
 365             _total_searches -= 1  # will be incremented by sub-pack
 366             if p.exists(hash):
 367                 # reorder so most recently used packs are searched first
 368                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 369                 return p.name
 370         return None
 371
 372     def refresh(self, skip_midx = False):
 373         """Refresh the index list.
 374         This method verifies if .midx files were superseded (e.g. all of its
 375         contents are in another, bigger .midx file) and removes the superseded
 376         files.
 377
 378         If skip_midx is True, all work on .midx files will be skipped and .midx
 379         files will be removed from the list.
 380
 381         The module-global variable 'ignore_midx' can force this function to
 382         always act as if skip_midx was True.
 383         """
 384         skip_midx = skip_midx or ignore_midx
 385         d = dict((p.name, p) for p in self.packs
 386                  if not skip_midx or not isinstance(p, PackMidx))
 387         if os.path.exists(self.dir):
 388             if not skip_midx:
 389                 midxl = []
 390                 for ix in self.packs:
 391                     if isinstance(ix, PackMidx):
 392                         for name in ix.idxnames:
 393                             d[os.path.join(self.dir, name)] = ix
 394                 for f in os.listdir(self.dir):
 395                     full = os.path.join(self.dir, f)
 396                     if f.endswith('.midx') and not d.get(full):
 397                         mx = PackMidx(full)
 398                         (mxd, mxf) = os.path.split(mx.name)
 399                         broken = 0
 400                         for n in mx.idxnames:
 401                             if not os.path.exists(os.path.join(mxd, n)):
 402                                 log(('warning: index %s missing\n' +
 403                                     '  used by %s\n') % (n, mxf))
 404                                 broken += 1
 405                         if broken:
 406                             del mx
 407                             unlink(full)
 408                         else:
 409                             midxl.append(mx)
 410                 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
 411                 for ix in midxl:
 412                     any = 0
 413                     for sub in ix.idxnames:
 414                         found = d.get(os.path.join(self.dir, sub))
 415                         if not found or isinstance(found, PackIdx):
 416                             # doesn't exist, or exists but not in a midx
 417                             d[ix.name] = ix
 418                             for name in ix.idxnames:
 419                                 d[os.path.join(self.dir, name)] = ix
 420                             any += 1
 421                             break
 422                     if not any and not ix.force_keep:
 423                         debug1('midx: removing redundant: %s\n'
 424                                % os.path.basename(ix.name))
 425                         unlink(ix.name)
 426             for f in os.listdir(self.dir):
 427                 full = os.path.join(self.dir, f)
 428                 if f.endswith('.idx') and not d.get(full):
 429                     try:
 430                         ix = open_idx(full)
 431                     except GitError, e:
 432                         add_error(e)
 433                         continue
 434                     d[full] = ix
 435             self.packs = list(set(d.values()))
 436         debug1('PackIdxList: using %d index%s.\n'
 437             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 438
 439     def packname_containing(self, hash):
 440         # figure out which pack contains a given hash.
 441         # FIXME: if the midx file format would just *store* this information,
 442         # we could calculate it a lot more efficiently.  But it's not needed
 443         # often, so let's do it like this.
 444         for f in os.listdir(self.dir):
 445             if f.endswith('.idx'):
 446                 full = os.path.join(self.dir, f)
 447                 try:
 448                     ix = open_idx(full)
 449                 except GitError, e:
 450                     add_error(e)
 451                     continue
 452                 if ix.exists(hash):
 453                     return full
 454
 455     def add(self, hash):
 456         """Insert an additional object in the list."""
 457         self.also[hash] = 1
 458
 459     def zap_also(self):
 460         """Remove all additional objects from the list."""
 461         self.also = {}
 462
 463
 464 def calc_hash(type, content):
 465     """Calculate some content's hash in the Git fashion."""
 466     header = '%s %d\0' % (type, len(content))
 467     sum = Sha1(header)
 468     sum.update(content)
 469     return sum.digest()
 470
 471
 472 def _shalist_sort_key(ent):
 473     (mode, name, id) = ent
 474     if stat.S_ISDIR(int(mode, 8)):
 475         return name + '/'
 476     else:
 477         return name
 478
 479
 480 def open_idx(filename):
 481     if filename.endswith('.idx'):
 482         f = open(filename, 'rb')
 483         header = f.read(8)
 484         if header[0:4] == '\377tOc':
 485             version = struct.unpack('!I', header[4:8])[0]
 486             if version == 2:
 487                 return PackIdxV2(filename, f)
 488             else:
 489                 raise GitError('%s: expected idx file version 2, got %d'
 490                                % (filename, version))
 491         elif len(header) == 8 and header[0:4] < '\377tOc':
 492             return PackIdxV1(filename, f)
 493         else:
 494             raise GitError('%s: unrecognized idx file header' % filename)
 495     elif filename.endswith('.midx'):
 496         return PackMidx(filename)
 497     else:
 498         raise GitError('idx filenames must end with .idx or .midx')
 499
 500
 501 def idxmerge(idxlist, final_progress=True):
 502     """Generate a list of all the objects reachable in a PackIdxList."""
 503     total = sum(len(i) for i in idxlist)
 504     iters = (iter(i) for i in idxlist)
 505     heap = [(next(it), it) for it in iters]
 506     heapq.heapify(heap)
 507     count = 0
 508     last = None
 509     while heap:
 510         if (count % 10024) == 0:
 511             progress('Reading indexes: %.2f%% (%d/%d)\r'
 512                      % (count*100.0/total, count, total))
 513         (e, it) = heap[0]
 514         if e != last:
 515             yield e
 516             last = e
 517         count += 1
 518         e = next(it)
 519         if e:
 520             heapq.heapreplace(heap, (e, it))
 521         else:
 522             heapq.heappop(heap)
 523     if final_progress:
 524         log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
 525
 526
 527 def _make_objcache():
 528     return PackIdxList(repo('objects/pack'))
 529
 530 class PackWriter:
 531     """Writes Git objects insid a pack file."""
 532     def __init__(self, objcache_maker=_make_objcache):
 533         self.count = 0
 534         self.outbytes = 0
 535         self.filename = None
 536         self.file = None
 537         self.idx = None
 538         self.objcache_maker = objcache_maker
 539         self.objcache = None
 540
 541     def __del__(self):
 542         self.close()
 543
 544     def _open(self):
 545         if not self.file:
 546             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 547             self.file = os.fdopen(fd, 'w+b')
 548             assert(name.endswith('.pack'))
 549             self.filename = name[:-5]
 550             self.file.write('PACK\0\0\0\2\0\0\0\0')
 551             self.idx = list(list() for i in xrange(256))
 552
 553     # the 'sha' parameter is used in client.py's _raw_write(), but not needed
 554     # in this basic version.
 555     def _raw_write(self, datalist, sha):
 556         self._open()
 557         f = self.file
 558         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 559         # the file never has a *partial* blob.  So let's make sure it's
 560         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 561         # to our hashsplit algorithm.)  f.write() does its own buffering,
 562         # but that's okay because we'll flush it in _end().
 563         oneblob = ''.join(datalist)
 564         try:
 565             f.write(oneblob)
 566         except IOError, e:
 567             raise GitError, e, sys.exc_info()[2]
 568         nw = len(oneblob)
 569         crc = zlib.crc32(oneblob) & 0xffffffff
 570         self._update_idx(sha, crc, nw)
 571         self.outbytes += nw
 572         self.count += 1
 573         return nw, crc
 574
 575     def _update_idx(self, sha, crc, size):
 576         assert(sha)
 577         if self.idx:
 578             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 579
 580     def _write(self, sha, type, content):
 581         if verbose:
 582             log('>')
 583         if not sha:
 584             sha = calc_hash(type, content)
 585         size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
 586         return sha
 587
 588     def breakpoint(self):
 589         """Clear byte and object counts and return the last processed id."""
 590         id = self._end()
 591         self.outbytes = self.count = 0
 592         return id
 593
 594     def write(self, type, content):
 595         """Write an object in this pack file."""
 596         return self._write(calc_hash(type, content), type, content)
 597
 598     def _require_objcache(self):
 599         if self.objcache is None and self.objcache_maker:
 600             self.objcache = self.objcache_maker()
 601         if self.objcache is None:
 602             raise GitError(
 603                     "PackWriter not opened or can't check exists w/o objcache")
 604
 605     def exists(self, id):
 606         """Return non-empty if an object is found in the object cache."""
 607         self._require_objcache()
 608         return self.objcache.exists(id)
 609
 610     def maybe_write(self, type, content):
 611         """Write an object to the pack file if not present and return its id."""
 612         self._require_objcache()
 613         sha = calc_hash(type, content)
 614         if not self.exists(sha):
 615             self._write(sha, type, content)
 616             self.objcache.add(sha)
 617         return sha
 618
 619     def new_blob(self, blob):
 620         """Create a blob object in the pack with the supplied content."""
 621         return self.maybe_write('blob', blob)
 622
 623     def new_tree(self, shalist):
 624         """Create a tree object in the pack."""
 625         shalist = sorted(shalist, key = _shalist_sort_key)
 626         l = []
 627         for (mode,name,bin) in shalist:
 628             assert(mode)
 629             assert(mode != '0')
 630             assert(mode[0] != '0')
 631             assert(name)
 632             assert(len(bin) == 20)
 633             l.append('%s %s\0%s' % (mode,name,bin))
 634         return self.maybe_write('tree', ''.join(l))
 635
 636     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 637         l = []
 638         if tree: l.append('tree %s' % tree.encode('hex'))
 639         if parent: l.append('parent %s' % parent.encode('hex'))
 640         if author: l.append('author %s %s' % (author, _git_date(adate)))
 641         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 642         l.append('')
 643         l.append(msg)
 644         return self.maybe_write('commit', '\n'.join(l))
 645
 646     def new_commit(self, parent, tree, date, msg):
 647         """Create a commit object in the pack."""
 648         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 649         commit = self._new_commit(tree, parent,
 650                                   userline, date, userline, date,
 651                                   msg)
 652         return commit
 653
 654     def abort(self):
 655         """Remove the pack file from disk."""
 656         f = self.file
 657         if f:
 658             self.idx = None
 659             self.file = None
 660             f.close()
 661             os.unlink(self.filename + '.pack')
 662
 663     def _end(self, run_midx=True):
 664         f = self.file
 665         if not f: return None
 666         self.file = None
 667         self.objcache = None
 668         idx = self.idx
 669         self.idx = None
 670
 671         # update object count
 672         f.seek(8)
 673         cp = struct.pack('!i', self.count)
 674         assert(len(cp) == 4)
 675         f.write(cp)
 676
 677         # calculate the pack sha1sum
 678         f.seek(0)
 679         sum = Sha1()
 680         for b in chunkyreader(f):
 681             sum.update(b)
 682         packbin = sum.digest()
 683         f.write(packbin)
 684         f.close()
 685
 686         idx_f = open(self.filename + '.idx', 'wb')
 687         obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
 688         idx_f.close()
 689
 690         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 691         if os.path.exists(self.filename + '.map'):
 692             os.unlink(self.filename + '.map')
 693         os.rename(self.filename + '.pack', nameprefix + '.pack')
 694         os.rename(self.filename + '.idx', nameprefix + '.idx')
 695
 696         if run_midx:
 697             auto_midx(repo('objects/pack'))
 698         return nameprefix
 699
 700     def close(self, run_midx=True):
 701         """Close the pack file and move it to its definitive path."""
 702         return self._end(run_midx=run_midx)
 703
 704     def _write_pack_idx_v2(self, file, idx, packbin):
 705         sum = Sha1()
 706
 707         def write(data):
 708             file.write(data)
 709             sum.update(data)
 710
 711         write('\377tOc\0\0\0\2')
 712
 713         n = 0
 714         for part in idx:
 715             n += len(part)
 716             write(struct.pack('!i', n))
 717             part.sort(key=lambda x: x[0])
 718
 719         obj_list_sum = Sha1()
 720         for part in idx:
 721             for entry in part:
 722                 write(entry[0])
 723                 obj_list_sum.update(entry[0])
 724         for part in idx:
 725             for entry in part:
 726                 write(struct.pack('!I', entry[1]))
 727         ofs64_list = []
 728         for part in idx:
 729             for entry in part:
 730                 if entry[2] & 0x80000000:
 731                     write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
 732                     ofs64_list.append(struct.pack('!Q', entry[2]))
 733                 else:
 734                     write(struct.pack('!i', entry[2]))
 735         for ofs64 in ofs64_list:
 736             write(ofs64)
 737
 738         write(packbin)
 739         file.write(sum.digest())
 740         return obj_list_sum.hexdigest()
 741
 742
 743 def _git_date(date):
 744     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 745
 746
 747 def _gitenv():
 748     os.environ['GIT_DIR'] = os.path.abspath(repo())
 749
 750
 751 def list_refs(refname = None):
 752     """Generate a list of tuples in the form (refname,hash).
 753     If a ref name is specified, list only this particular ref.
 754     """
 755     argv = ['git', 'show-ref', '--']
 756     if refname:
 757         argv += [refname]
 758     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 759     out = p.stdout.read().strip()
 760     rv = p.wait()  # not fatal
 761     if rv:
 762         assert(not out)
 763     if out:
 764         for d in out.split('\n'):
 765             (sha, name) = d.split(' ', 1)
 766             yield (name, sha.decode('hex'))
 767
 768
 769 def read_ref(refname):
 770     """Get the commit id of the most recent commit made on a given ref."""
 771     l = list(list_refs(refname))
 772     if l:
 773         assert(len(l) == 1)
 774         return l[0][1]
 775     else:
 776         return None
 777
 778
 779 def rev_list(ref, count=None):
 780     """Generate a list of reachable commits in reverse chronological order.
 781
 782     This generator walks through commits, from child to parent, that are
 783     reachable via the specified ref and yields a series of tuples of the form
 784     (date,hash).
 785
 786     If count is a non-zero integer, limit the number of commits to "count"
 787     objects.
 788     """
 789     assert(not ref.startswith('-'))
 790     opts = []
 791     if count:
 792         opts += ['-n', str(atoi(count))]
 793     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 794     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 795     commit = None
 796     for row in p.stdout:
 797         s = row.strip()
 798         if s.startswith('commit '):
 799             commit = s[7:].decode('hex')
 800         else:
 801             date = int(s)
 802             yield (date, commit)
 803     rv = p.wait()  # not fatal
 804     if rv:
 805         raise GitError, 'git rev-list returned error %d' % rv
 806
 807
 808 def rev_get_date(ref):
 809     """Get the date of the latest commit on the specified ref."""
 810     for (date, commit) in rev_list(ref, count=1):
 811         return date
 812     raise GitError, 'no such commit %r' % ref
 813
 814
 815 def rev_parse(committish):
 816     """Resolve the full hash for 'committish', if it exists.
 817
 818     Should be roughly equivalent to 'git rev-parse'.
 819
 820     Returns the hex value of the hash if it is found, None if 'committish' does
 821     not correspond to anything.
 822     """
 823     head = read_ref(committish)
 824     if head:
 825         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 826         return head
 827
 828     pL = PackIdxList(repo('objects/pack'))
 829
 830     if len(committish) == 40:
 831         try:
 832             hash = committish.decode('hex')
 833         except TypeError:
 834             return None
 835
 836         if pL.exists(hash):
 837             return hash
 838
 839     return None
 840
 841
 842 def update_ref(refname, newval, oldval):
 843     """Change the commit pointed to by a branch."""
 844     if not oldval:
 845         oldval = ''
 846     assert(refname.startswith('refs/heads/'))
 847     p = subprocess.Popen(['git', 'update-ref', refname,
 848                           newval.encode('hex'), oldval.encode('hex')],
 849                          preexec_fn = _gitenv)
 850     _git_wait('git update-ref', p)
 851
 852
 853 def guess_repo(path=None):
 854     """Set the path value in the global variable "repodir".
 855     This makes bup look for an existing bup repository, but not fail if a
 856     repository doesn't exist. Usually, if you are interacting with a bup
 857     repository, you would not be calling this function but using
 858     check_repo_or_die().
 859     """
 860     global repodir
 861     if path:
 862         repodir = path
 863     if not repodir:
 864         repodir = os.environ.get('BUP_DIR')
 865         if not repodir:
 866             repodir = os.path.expanduser('~/.bup')
 867
 868
 869 def init_repo(path=None):
 870     """Create the Git bare repository for bup in a given path."""
 871     guess_repo(path)
 872     d = repo()  # appends a / to the path
 873     parent = os.path.dirname(os.path.dirname(d))
 874     if parent and not os.path.exists(parent):
 875         raise GitError('parent directory "%s" does not exist\n' % parent)
 876     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 877         raise GitError('"%d" exists but is not a directory\n' % d)
 878     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 879                          preexec_fn = _gitenv)
 880     _git_wait('git init', p)
 881     # Force the index version configuration in order to ensure bup works
 882     # regardless of the version of the installed Git binary.
 883     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 884                          stdout=sys.stderr, preexec_fn = _gitenv)
 885     _git_wait('git config', p)
 886
 887
 888 def check_repo_or_die(path=None):
 889     """Make sure a bup repository exists, and abort if not.
 890     If the path to a particular repository was not specified, this function
 891     initializes the default repository automatically.
 892     """
 893     guess_repo(path)
 894     if not os.path.isdir(repo('objects/pack/.')):
 895         if repodir == home_repodir:
 896             init_repo()
 897         else:
 898             log('error: %r is not a bup/git repository\n' % repo())
 899             sys.exit(15)
 900
 901
 902 def treeparse(buf):
 903     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
 904     ofs = 0
 905     while ofs < len(buf):
 906         z = buf[ofs:].find('\0')
 907         assert(z > 0)
 908         spl = buf[ofs:ofs+z].split(' ', 1)
 909         assert(len(spl) == 2)
 910         sha = buf[ofs+z+1:ofs+z+1+20]
 911         ofs += z+1+20
 912         yield (spl[0], spl[1], sha)
 913
 914
 915 _ver = None
 916 def ver():
 917     """Get Git's version and ensure a usable version is installed.
 918
 919     The returned version is formatted as an ordered tuple with each position
 920     representing a digit in the version tag. For example, the following tuple
 921     would represent version 1.6.6.9:
 922
 923         ('1', '6', '6', '9')
 924     """
 925     global _ver
 926     if not _ver:
 927         p = subprocess.Popen(['git', '--version'],
 928                              stdout=subprocess.PIPE)
 929         gvs = p.stdout.read()
 930         _git_wait('git --version', p)
 931         m = re.match(r'git version (\S+.\S+)', gvs)
 932         if not m:
 933             raise GitError('git --version weird output: %r' % gvs)
 934         _ver = tuple(m.group(1).split('.'))
 935     needed = ('1','5', '3', '1')
 936     if _ver < needed:
 937         raise GitError('git version %s or higher is required; you have %s'
 938                        % ('.'.join(needed), '.'.join(_ver)))
 939     return _ver
 940
 941
 942 def _git_wait(cmd, p):
 943     rv = p.wait()
 944     if rv != 0:
 945         raise GitError('%s returned %d' % (cmd, rv))
 946
 947
 948 def _git_capture(argv):
 949     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 950     r = p.stdout.read()
 951     _git_wait(repr(argv), p)
 952     return r
 953
 954
 955 class _AbortableIter:
 956     def __init__(self, it, onabort = None):
 957         self.it = it
 958         self.onabort = onabort
 959         self.done = None
 960
 961     def __iter__(self):
 962         return self
 963
 964     def next(self):
 965         try:
 966             return self.it.next()
 967         except StopIteration, e:
 968             self.done = True
 969             raise
 970         except:
 971             self.abort()
 972             raise
 973
 974     def abort(self):
 975         """Abort iteration and call the abortion callback, if needed."""
 976         if not self.done:
 977             self.done = True
 978             if self.onabort:
 979                 self.onabort()
 980
 981     def __del__(self):
 982         self.abort()
 983
 984
 985 _ver_warned = 0
 986 class CatPipe:
 987     """Link to 'git cat-file' that is used to retrieve blob data."""
 988     def __init__(self):
 989         global _ver_warned
 990         wanted = ('1','5','6')
 991         if ver() < wanted:
 992             if not _ver_warned:
 993                 log('warning: git version < %s; bup will be slow.\n'
 994                     % '.'.join(wanted))
 995                 _ver_warned = 1
 996             self.get = self._slow_get
 997         else:
 998             self.p = self.inprogress = None
 999             self.get = self._fast_get
1000
1001     def _abort(self):
1002         if self.p:
1003             self.p.stdout.close()
1004             self.p.stdin.close()
1005         self.p = None
1006         self.inprogress = None
1007
1008     def _restart(self):
1009         self._abort()
1010         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1011                                   stdin=subprocess.PIPE,
1012                                   stdout=subprocess.PIPE,
1013                                   close_fds = True,
1014                                   bufsize = 4096,
1015                                   preexec_fn = _gitenv)
1016
1017     def _fast_get(self, id):
1018         if not self.p or self.p.poll() != None:
1019             self._restart()
1020         assert(self.p)
1021         assert(self.p.poll() == None)
1022         if self.inprogress:
1023             log('_fast_get: opening %r while %r is open'
1024                 % (id, self.inprogress))
1025         assert(not self.inprogress)
1026         assert(id.find('\n') < 0)
1027         assert(id.find('\r') < 0)
1028         assert(not id.startswith('-'))
1029         self.inprogress = id
1030         self.p.stdin.write('%s\n' % id)
1031         self.p.stdin.flush()
1032         hdr = self.p.stdout.readline()
1033         if hdr.endswith(' missing\n'):
1034             self.inprogress = None
1035             raise KeyError('blob %r is missing' % id)
1036         spl = hdr.split(' ')
1037         if len(spl) != 3 or len(spl[0]) != 40:
1038             raise GitError('expected blob, got %r' % spl)
1039         (hex, type, size) = spl
1040
1041         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1042                            onabort = self._abort)
1043         try:
1044             yield type
1045             for blob in it:
1046                 yield blob
1047             assert(self.p.stdout.readline() == '\n')
1048             self.inprogress = None
1049         except Exception, e:
1050             it.abort()
1051             raise
1052
1053     def _slow_get(self, id):
1054         assert(id.find('\n') < 0)
1055         assert(id.find('\r') < 0)
1056         assert(id[0] != '-')
1057         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1058         yield type
1059
1060         p = subprocess.Popen(['git', 'cat-file', type, id],
1061                              stdout=subprocess.PIPE,
1062                              preexec_fn = _gitenv)
1063         for blob in chunkyreader(p.stdout):
1064             yield blob
1065         _git_wait('git cat-file', p)
1066
1067     def _join(self, it):
1068         type = it.next()
1069         if type == 'blob':
1070             for blob in it:
1071                 yield blob
1072         elif type == 'tree':
1073             treefile = ''.join(it)
1074             for (mode, name, sha) in treeparse(treefile):
1075                 for blob in self.join(sha.encode('hex')):
1076                     yield blob
1077         elif type == 'commit':
1078             treeline = ''.join(it).split('\n')[0]
1079             assert(treeline.startswith('tree '))
1080             for blob in self.join(treeline[5:]):
1081                 yield blob
1082         else:
1083             raise GitError('invalid object type %r: expected blob/tree/commit'
1084                            % type)
1085
1086     def join(self, id):
1087         """Generate a list of the content of all blobs that can be reached
1088         from an object.  The hash given in 'id' must point to a blob, a tree
1089         or a commit. The content of all blobs that can be seen from trees or
1090         commits will be added to the list.
1091         """
1092         try:
1093             for d in self._join(self.get(id)):
1094                 yield d
1095         except StopIteration:
1096             log('booger!\n')
1097
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Only the refs from list_refs() whose name starts with refs/tags/ are
    considered; that prefix is stripped from the names stored in the lists.
    """
    prefix = 'refs/tags/'
    by_commit = {}
    for (refname, sha) in list_refs():
        if not refname.startswith(prefix):
            continue
        # more than one tag can point at the same hash
        by_commit.setdefault(sha, []).append(refname[len(prefix):])
    return by_commit