lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path, bloom
   8
# Version of the .midx file format understood by this module; PackMidx
# ignores files whose header carries an older or newer version.
MIDX_VERSION = 4
SEEK_END=2  # os.SEEK_END is not defined in python 2.4

# Nonzero makes PackWriter._write() log a '>' for every object it writes.
verbose = 0
# Nonzero forces PackIdxList.refresh() to act as if skip_midx were True.
ignore_midx = 0
home_repodir = os.path.expanduser('~/.bup')
# Path of the active repository; set by guess_repo() and read by repo().
repodir = None

# Object type name <-> numeric type code stored in a pack object header
# (see _encode_packobj() / _decode_packobj()).
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Statistics counters incremented by the index lookup code.
_total_searches = 0
_total_steps = 0
  22
  23
  24 class GitError(Exception):
  25     pass
  26
  27
  28 def repo(sub = ''):
  29     """Get the path to the git repository or one of its subdirectories."""
  30     global repodir
  31     if not repodir:
  32         raise GitError('You should call check_repo_or_die()')
  33
  34     # If there's a .git subdirectory, then the actual repo is in there.
  35     gd = os.path.join(repodir, '.git')
  36     if os.path.exists(gd):
  37         repodir = gd
  38
  39     return os.path.join(repodir, sub)
  40
  41
  42 def repo_rel(path):
  43     full = os.path.abspath(path)
  44     fullrepo = os.path.abspath(repo(''))
  45     if not fullrepo.endswith('/'):
  46         fullrepo += '/'
  47     if full.startswith(fullrepo):
  48         path = full[len(fullrepo):]
  49     if path.startswith('index-cache/'):
  50         path = path[len('index-cache/'):]
  51     return path
  52
  53
  54 def all_packdirs():
  55     paths = [repo('objects/pack')]
  56     paths += glob.glob(repo('index-cache/*/.'))
  57     return paths
  58
  59
  60 def auto_midx(objdir):
  61     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
  62     try:
  63         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  64     except OSError, e:
  65         # make sure 'args' gets printed to help with debugging
  66         add_error('%r: exception: %s' % (args, e))
  67         raise
  68     if rv:
  69         add_error('%r: returned %d' % (args, rv))
  70
  71     args = [path.exe(), 'bloom', '--dir', objdir]
  72     try:
  73         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
  74     except OSError, e:
  75         # make sure 'args' gets printed to help with debugging
  76         add_error('%r: exception: %s' % (args, e))
  77         raise
  78     if rv:
  79         add_error('%r: returned %d' % (args, rv))
  80
  81
  82 def mangle_name(name, mode, gitmode):
  83     """Mangle a file name to present an abstract name for segmented files.
  84     Mangled file names will have the ".bup" extension added to them. If a
  85     file's name already ends with ".bup", a ".bupl" extension is added to
  86     disambiguate normal files from semgmented ones.
  87     """
  88     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
  89         return name + '.bup'
  90     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
  91         return name + '.bupl'
  92     else:
  93         return name
  94
  95
  96 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
  97 def demangle_name(name):
  98     """Remove name mangling from a file name, if necessary.
  99
 100     The return value is a tuple (demangled_filename,mode), where mode is one of
 101     the following:
 102
 103     * BUP_NORMAL  : files that should be read as-is from the repository
 104     * BUP_CHUNKED : files that were chunked and need to be assembled
 105
 106     For more information on the name mangling algorythm, see mangle_name()
 107     """
 108     if name.endswith('.bupl'):
 109         return (name[:-5], BUP_NORMAL)
 110     elif name.endswith('.bup'):
 111         return (name[:-4], BUP_CHUNKED)
 112     else:
 113         return (name, BUP_NORMAL)
 114
 115
 116 def _encode_packobj(type, content):
 117     szout = ''
 118     sz = len(content)
 119     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 120     sz >>= 4
 121     while 1:
 122         if sz: szbits |= 0x80
 123         szout += chr(szbits)
 124         if not sz:
 125             break
 126         szbits = sz & 0x7f
 127         sz >>= 7
 128     z = zlib.compressobj(1)
 129     yield szout
 130     yield z.compress(content)
 131     yield z.flush()
 132
 133
 134 def _encode_looseobj(type, content):
 135     z = zlib.compressobj(1)
 136     yield z.compress('%s %d\0' % (type, len(content)))
 137     yield z.compress(content)
 138     yield z.flush()
 139
 140
 141 def _decode_looseobj(buf):
 142     assert(buf);
 143     s = zlib.decompress(buf)
 144     i = s.find('\0')
 145     assert(i > 0)
 146     l = s[:i].split(' ')
 147     type = l[0]
 148     sz = int(l[1])
 149     content = s[i+1:]
 150     assert(type in _typemap)
 151     assert(sz == len(content))
 152     return (type, content)
 153
 154
 155 def _decode_packobj(buf):
 156     assert(buf)
 157     c = ord(buf[0])
 158     type = _typermap[(c & 0x70) >> 4]
 159     sz = c & 0x0f
 160     shift = 4
 161     i = 0
 162     while c & 0x80:
 163         i += 1
 164         c = ord(buf[i])
 165         sz |= (c & 0x7f) << shift
 166         shift += 7
 167         if not (c & 0x80):
 168             break
 169     return (type, zlib.decompress(buf[i+1:]))
 170
 171
 172 class PackIdx:
 173     def __init__(self):
 174         assert(0)
 175
 176     def find_offset(self, hash):
 177         """Get the offset of an object inside the index file."""
 178         idx = self._idx_from_hash(hash)
 179         if idx != None:
 180             return self._ofs_from_idx(idx)
 181         return None
 182
 183     def exists(self, hash, want_source=False):
 184         """Return nonempty if the object exists in this index."""
 185         if hash and (self._idx_from_hash(hash) != None):
 186             return want_source and os.path.basename(self.name) or True
 187         return None
 188
 189     def __len__(self):
 190         return int(self.fanout[255])
 191
 192     def _idx_from_hash(self, hash):
 193         global _total_searches, _total_steps
 194         _total_searches += 1
 195         assert(len(hash) == 20)
 196         b1 = ord(hash[0])
 197         start = self.fanout[b1-1] # range -1..254
 198         end = self.fanout[b1] # range 0..255
 199         want = str(hash)
 200         _total_steps += 1  # lookup table is a step
 201         while start < end:
 202             _total_steps += 1
 203             mid = start + (end-start)/2
 204             v = self._idx_to_hash(mid)
 205             if v < want:
 206                 start = mid+1
 207             elif v > want:
 208                 end = mid
 209             else: # got it!
 210                 return mid
 211         return None
 212
 213
 214 class PackIdxV1(PackIdx):
 215     """Object representation of a Git pack index (version 1) file."""
 216     def __init__(self, filename, f):
 217         self.name = filename
 218         self.idxnames = [self.name]
 219         self.map = mmap_read(f)
 220         self.fanout = list(struct.unpack('!256I',
 221                                          str(buffer(self.map, 0, 256*4))))
 222         self.fanout.append(0)  # entry "-1"
 223         nsha = self.fanout[255]
 224         self.sha_ofs = 256*4
 225         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 226
 227     def _ofs_from_idx(self, idx):
 228         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 229
 230     def _idx_to_hash(self, idx):
 231         return str(self.shatable[idx*24+4 : idx*24+24])
 232
 233     def __iter__(self):
 234         for i in xrange(self.fanout[255]):
 235             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 236
 237
 238 class PackIdxV2(PackIdx):
 239     """Object representation of a Git pack index (version 2) file."""
 240     def __init__(self, filename, f):
 241         self.name = filename
 242         self.idxnames = [self.name]
 243         self.map = mmap_read(f)
 244         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 245         self.fanout = list(struct.unpack('!256I',
 246                                          str(buffer(self.map, 8, 256*4))))
 247         self.fanout.append(0)  # entry "-1"
 248         nsha = self.fanout[255]
 249         self.sha_ofs = 8 + 256*4
 250         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 251         self.ofstable = buffer(self.map,
 252                                self.sha_ofs + nsha*20 + nsha*4,
 253                                nsha*4)
 254         self.ofs64table = buffer(self.map,
 255                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 256
 257     def _ofs_from_idx(self, idx):
 258         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 259         if ofs & 0x80000000:
 260             idx64 = ofs & 0x7fffffff
 261             ofs = struct.unpack('!Q',
 262                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 263         return ofs
 264
 265     def _idx_to_hash(self, idx):
 266         return str(self.shatable[idx*20:(idx+1)*20])
 267
 268     def __iter__(self):
 269         for i in xrange(self.fanout[255]):
 270             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 271
 272
# Module-level alias for the _helpers function; PackMidx.exists() uses it to
# pick the fanout slot for a hash.
extract_bits = _helpers.extract_bits

class PackMidx:
    """Wrapper which contains data from multiple index files.
    Multiple index (.midx) files constitute a wrapper around index (.idx) files
    and make it possible for bup to expand Git's indexing capabilities to vast
    amounts of files.
    """
    def __init__(self, filename):
        """Map the .midx file 'filename' and locate its tables.

        A file with a bad magic or a version other than MIDX_VERSION is
        logged and loaded as an empty index (see _init_failed()).
        'force_keep' is left True for bad-magic and too-new files and False
        for old-version files.
        """
        self.name = filename
        self.force_keep = False
        assert(filename.endswith('.midx'))
        self.map = mmap_read(open(filename))
        if str(self.map[0:4]) != 'MIDX':
            log('Warning: skipping: invalid MIDX header in %r\n' % filename)
            self.force_keep = True
            return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < MIDX_VERSION:
            log('Warning: ignoring old-style (v%d) midx %r\n'
                % (ver, filename))
            self.force_keep = False  # old stuff is boring
            return self._init_failed()
        if ver > MIDX_VERSION:
            log('Warning: ignoring too-new (v%d) midx %r\n'
                % (ver, filename))
            self.force_keep = True  # new stuff is exciting
            return self._init_failed()

        # Layout after the 8-byte magic+version: a 4-byte 'bits' value, a
        # fanout table of 2**bits 4-byte entries, the 20-byte shas, one
        # 4-byte idx number per sha, then the NUL-separated idx file names.
        self.bits = _helpers.firstword(self.map[8:12])
        self.entries = 2**self.bits
        self.fanout = buffer(self.map, 12, self.entries*4)
        self.sha_ofs = 12 + self.entries*4
        self.nsha = nsha = self._fanget(self.entries-1)
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.which_ofs = self.sha_ofs + 20*nsha
        self.whichlist = buffer(self.map, self.which_ofs, nsha*4)
        self.idxnames = str(self.map[self.which_ofs + 4*nsha:]).split('\0')

    def _init_failed(self):
        """Set up the attributes of an index that has no idx names."""
        self.bits = 0
        self.entries = 1
        self.fanout = buffer('\0\0\0\0')
        self.shatable = buffer('\0'*20)
        self.idxnames = []

    def _fanget(self, i):
        """Return fanout entry 'i', decoded by _helpers.firstword()."""
        start = i*4
        s = self.fanout[start:start+4]
        return _helpers.firstword(s)

    def _get(self, i):
        """Return the 20-byte sha stored at position 'i' of the sha table."""
        return str(self.shatable[i*20:(i+1)*20])

    def _get_idx_i(self, i):
        """Return the idx number recorded for the sha at position 'i'."""
        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]

    def _get_idxname(self, i):
        """Return the idx file name recorded for the sha at position 'i'."""
        return self.idxnames[self._get_idx_i(i)]

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches, _total_steps
        _total_searches += 1
        want = str(hash)
        # NOTE(review): extract_bits/firstword are C helpers; this presumably
        # takes the top 'bits' bits / first 32 bits of the hash -- confirm.
        el = extract_bits(want, self.bits)
        if el:
            start = self._fanget(el-1)
            startv = el << (32-self.bits)
        else:
            start = 0
            startv = 0
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1   # lookup table is a step
        hashv = _helpers.firstword(hash)
        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
        while start < end:
            _total_steps += 1
            #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
            # Guess the position by interpolating hashv between the values
            # known at the two ends of the remaining range.
            mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
            #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
            v = self._get(mid)
            #print '    %08x' % self._num(v)
            if v < want:
                start = mid+1
                startv = _helpers.firstword(v)
            elif v > want:
                end = mid
                endv = _helpers.firstword(v)
            else: # got it!
                return want_source and self._get_idxname(mid) or True
        return None

    def __iter__(self):
        """Yield a 20-byte buffer for each sha in the sha table, in order."""
        for i in xrange(self._fanget(self.entries-1)):
            yield buffer(self.shatable, i*20, 20)

    def __len__(self):
        """Return the number of shas, taken from the last fanout entry."""
        return int(self._fanget(self.entries-1))
 373
 374
# Number of live PackIdxList instances; the constructor asserts it is 0.
_mpi_count = 0
class PackIdxList:
    """Searchable collection of the .idx/.midx indexes found in a directory.

    Objects registered with add() are also reported as existing.  When a
    valid 'bup.bloom' file is available it is used to answer negative
    lookups without searching the indexes.
    """
    def __init__(self, dir):
        """Load the indexes found in 'dir' (see refresh())."""
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the result of idxmerge() on the loaded indexes."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all loaded indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom is not None:
            _total_searches -= 1  # will be incremented by bloom
            if self.bloom.exists(hash):
                # Possible hit: stop consulting the bloom filter until a
                # lookup misses in the indexes again (see end of method).
                self.do_bloom = False
            else:
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file path to the index object that covers it.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # Every idx named by an already-loaded midx is covered by it.
                for ix in self.packs:
                    if isinstance(ix, PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open midx files not loaded yet; one that names a missing
                # idx file is deleted instead of being used.
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the largest midx files first.
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        unlink(ix.name)
            # Open every .idx file that no loaded index covers yet.
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError, e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # Several paths may map to the same index object; keep each once,
            # largest first.
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # Use the bloom filter only if it is valid and holds at least as
            # many entries as the indexes do.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 501
 502
 503 def calc_hash(type, content):
 504     """Calculate some content's hash in the Git fashion."""
 505     header = '%s %d\0' % (type, len(content))
 506     sum = Sha1(header)
 507     sum.update(content)
 508     return sum.digest()
 509
 510
 511 def _shalist_sort_key(ent):
 512     (mode, name, id) = ent
 513     if stat.S_ISDIR(int(mode, 8)):
 514         return name + '/'
 515     else:
 516         return name
 517
 518
 519 def open_idx(filename):
 520     if filename.endswith('.idx'):
 521         f = open(filename, 'rb')
 522         header = f.read(8)
 523         if header[0:4] == '\377tOc':
 524             version = struct.unpack('!I', header[4:8])[0]
 525             if version == 2:
 526                 return PackIdxV2(filename, f)
 527             else:
 528                 raise GitError('%s: expected idx file version 2, got %d'
 529                                % (filename, version))
 530         elif len(header) == 8 and header[0:4] < '\377tOc':
 531             return PackIdxV1(filename, f)
 532         else:
 533             raise GitError('%s: unrecognized idx file header' % filename)
 534     elif filename.endswith('.midx'):
 535         return PackMidx(filename)
 536     else:
 537         raise GitError('idx filenames must end with .idx or .midx')
 538
 539
 540 def idxmerge(idxlist, final_progress=True):
 541     """Generate a list of all the objects reachable in a PackIdxList."""
 542     def pfunc(count, total):
 543         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 544                   % (count*100.0/total, count, total))
 545     def pfinal(count, total):
 546         if final_progress:
 547             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 548                      % (100, total, total))
 549     return merge_iter(idxlist, 10024, pfunc, pfinal)
 550
 551
 552 def _make_objcache():
 553     return PackIdxList(repo('objects/pack'))
 554
class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache):
        """Set up an idle writer; the pack file is created on first write.

        'objcache_maker' is called with no arguments, when first needed, to
        build the object cache used by exists() and maybe_write().
        """
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.file = None
        self.idx = None
        self.objcache_maker = objcache_maker
        self.objcache = None

    def __del__(self):
        self.close()

    def _open(self):
        """Create the temporary pack file if it is not open yet."""
        if not self.file:
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            # Header with an object count of 0; _end() rewrites the count.
            self.file.write('PACK\0\0\0\2\0\0\0\0')
            # One list of (sha, crc, offset) entries per first sha byte.
            self.idx = list(list() for i in xrange(256))

    def _raw_write(self, datalist, sha):
        """Write the joined 'datalist' as one object and index it as 'sha'.

        Returns (bytes written, crc32 of those bytes).  An IOError from the
        write is re-raised as GitError with the original traceback.
        """
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError, e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        """Record (sha, crc, start offset) for the object just written."""
        assert(sha)
        if self.idx:
            # The object ends at the current file position.
            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        """Write the object unconditionally and return its sha.

        The sha is computed with calc_hash() when 'sha' is empty.
        """
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        """Create the object cache if needed; raise GitError if impossible."""
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        self._require_objcache()
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            self.objcache.add(sha)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        shalist = sorted(shalist, key = _shalist_sort_key)
        l = []
        for (mode,name,bin) in shalist:
            assert(mode)
            assert(mode != '0')
            assert(mode[0] != '0')
            assert(name)
            assert(len(bin) == 20)
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        """Build a commit object from the given fields and write it.

        'tree' and 'parent' are binary ids; empty fields are left out.
        """
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        # The current user is recorded as both author and committer.
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
                                  msg)
        return commit

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            self.idx = None
            self.file = None
            f.close()
            os.unlink(self.filename + '.pack')

    def _end(self, run_midx=True):
        """Finish the pack, write its .idx and move both into objects/pack.

        Returns the final path prefix (without extension), or None when no
        pack file was open.  Runs auto_midx() afterwards if run_midx is true.
        """
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None
        idx = self.idx
        self.idx = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        for b in chunkyreader(f):
            sum.update(b)
        packbin = sum.digest()
        f.write(packbin)
        f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)

        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        if run_midx:
            auto_midx(repo('objects/pack'))
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a version 2 .idx file for the pack to 'filename'.

        'idx' is the per-first-byte entry list built by _update_idx() and
        'packbin' the pack's sha1 digest.  Returns the hex sha1 of the sha
        table, which _end() uses to name the pack.
        """
        idx_f = open(filename, 'w+b')
        idx_f.write('\377tOc\0\0\0\2')

        # 8-byte header, 256 4-byte fanout entries, then 28 bytes per object.
        ofs64_ofs = 8 + 4*256 + 28*self.count
        idx_f.truncate(ofs64_ofs)
        idx_f.seek(0)
        idx_map = mmap_readwrite(idx_f, close=False)
        idx_f.seek(0, SEEK_END)
        count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
        assert(count == self.count)
        idx_map.close()
        idx_f.write(packbin)

        idx_f.seek(0)
        idx_sum = Sha1()
        b = idx_f.read(8 + 4*256)
        idx_sum.update(b)

        # The sha table feeds both the idx checksum and the pack name.
        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            idx_sum.update(b)
            obj_list_sum.update(b)
        namebase = obj_list_sum.hexdigest()

        for b in chunkyreader(idx_f):
            idx_sum.update(b)
        idx_f.write(idx_sum.digest())
        idx_f.close()

        return namebase
 752
 753
 754 def _git_date(date):
 755     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 756
 757
 758 def _gitenv():
 759     os.environ['GIT_DIR'] = os.path.abspath(repo())
 760
 761
 762 def list_refs(refname = None):
 763     """Generate a list of tuples in the form (refname,hash).
 764     If a ref name is specified, list only this particular ref.
 765     """
 766     argv = ['git', 'show-ref', '--']
 767     if refname:
 768         argv += [refname]
 769     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 770     out = p.stdout.read().strip()
 771     rv = p.wait()  # not fatal
 772     if rv:
 773         assert(not out)
 774     if out:
 775         for d in out.split('\n'):
 776             (sha, name) = d.split(' ', 1)
 777             yield (name, sha.decode('hex'))
 778
 779
 780 def read_ref(refname):
 781     """Get the commit id of the most recent commit made on a given ref."""
 782     l = list(list_refs(refname))
 783     if l:
 784         assert(len(l) == 1)
 785         return l[0][1]
 786     else:
 787         return None
 788
 789
 790 def rev_list(ref, count=None):
 791     """Generate a list of reachable commits in reverse chronological order.
 792
 793     This generator walks through commits, from child to parent, that are
 794     reachable via the specified ref and yields a series of tuples of the form
 795     (date,hash).
 796
 797     If count is a non-zero integer, limit the number of commits to "count"
 798     objects.
 799     """
 800     assert(not ref.startswith('-'))
 801     opts = []
 802     if count:
 803         opts += ['-n', str(atoi(count))]
 804     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 805     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 806     commit = None
 807     for row in p.stdout:
 808         s = row.strip()
 809         if s.startswith('commit '):
 810             commit = s[7:].decode('hex')
 811         else:
 812             date = int(s)
 813             yield (date, commit)
 814     rv = p.wait()  # not fatal
 815     if rv:
 816         raise GitError, 'git rev-list returned error %d' % rv
 817
 818
 819 def rev_get_date(ref):
 820     """Get the date of the latest commit on the specified ref."""
 821     for (date, commit) in rev_list(ref, count=1):
 822         return date
 823     raise GitError, 'no such commit %r' % ref
 824
 825
 826 def rev_parse(committish):
 827     """Resolve the full hash for 'committish', if it exists.
 828
 829     Should be roughly equivalent to 'git rev-parse'.
 830
 831     Returns the hex value of the hash if it is found, None if 'committish' does
 832     not correspond to anything.
 833     """
 834     head = read_ref(committish)
 835     if head:
 836         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 837         return head
 838
 839     pL = PackIdxList(repo('objects/pack'))
 840
 841     if len(committish) == 40:
 842         try:
 843             hash = committish.decode('hex')
 844         except TypeError:
 845             return None
 846
 847         if pL.exists(hash):
 848             return hash
 849
 850     return None
 851
 852
 853 def update_ref(refname, newval, oldval):
 854     """Change the commit pointed to by a branch."""
 855     if not oldval:
 856         oldval = ''
 857     assert(refname.startswith('refs/heads/'))
 858     p = subprocess.Popen(['git', 'update-ref', refname,
 859                           newval.encode('hex'), oldval.encode('hex')],
 860                          preexec_fn = _gitenv)
 861     _git_wait('git update-ref', p)
 862
 863
 864 def guess_repo(path=None):
 865     """Set the path value in the global variable "repodir".
 866     This makes bup look for an existing bup repository, but not fail if a
 867     repository doesn't exist. Usually, if you are interacting with a bup
 868     repository, you would not be calling this function but using
 869     check_repo_or_die().
 870     """
 871     global repodir
 872     if path:
 873         repodir = path
 874     if not repodir:
 875         repodir = os.environ.get('BUP_DIR')
 876         if not repodir:
 877             repodir = os.path.expanduser('~/.bup')
 878
 879
 880 def init_repo(path=None):
 881     """Create the Git bare repository for bup in a given path."""
 882     guess_repo(path)
 883     d = repo()  # appends a / to the path
 884     parent = os.path.dirname(os.path.dirname(d))
 885     if parent and not os.path.exists(parent):
 886         raise GitError('parent directory "%s" does not exist\n' % parent)
 887     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 888         raise GitError('"%d" exists but is not a directory\n' % d)
 889     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 890                          preexec_fn = _gitenv)
 891     _git_wait('git init', p)
 892     # Force the index version configuration in order to ensure bup works
 893     # regardless of the version of the installed Git binary.
 894     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 895                          stdout=sys.stderr, preexec_fn = _gitenv)
 896     _git_wait('git config', p)
 897
 898
 899 def check_repo_or_die(path=None):
 900     """Make sure a bup repository exists, and abort if not.
 901     If the path to a particular repository was not specified, this function
 902     initializes the default repository automatically.
 903     """
 904     guess_repo(path)
 905     if not os.path.isdir(repo('objects/pack/.')):
 906         if repodir == home_repodir:
 907             init_repo()
 908         else:
 909             log('error: %r is not a bup/git repository\n' % repo())
 910             sys.exit(15)
 911
 912
 913 def treeparse(buf):
 914     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
 915     ofs = 0
 916     while ofs < len(buf):
 917         z = buf[ofs:].find('\0')
 918         assert(z > 0)
 919         spl = buf[ofs:ofs+z].split(' ', 1)
 920         assert(len(spl) == 2)
 921         sha = buf[ofs+z+1:ofs+z+1+20]
 922         ofs += z+1+20
 923         yield (spl[0], spl[1], sha)
 924
 925
 926 _ver = None
 927 def ver():
 928     """Get Git's version and ensure a usable version is installed.
 929
 930     The returned version is formatted as an ordered tuple with each position
 931     representing a digit in the version tag. For example, the following tuple
 932     would represent version 1.6.6.9:
 933
 934         ('1', '6', '6', '9')
 935     """
 936     global _ver
 937     if not _ver:
 938         p = subprocess.Popen(['git', '--version'],
 939                              stdout=subprocess.PIPE)
 940         gvs = p.stdout.read()
 941         _git_wait('git --version', p)
 942         m = re.match(r'git version (\S+.\S+)', gvs)
 943         if not m:
 944             raise GitError('git --version weird output: %r' % gvs)
 945         _ver = tuple(m.group(1).split('.'))
 946     needed = ('1','5', '3', '1')
 947     if _ver < needed:
 948         raise GitError('git version %s or higher is required; you have %s'
 949                        % ('.'.join(needed), '.'.join(_ver)))
 950     return _ver
 951
 952
 953 def _git_wait(cmd, p):
 954     rv = p.wait()
 955     if rv != 0:
 956         raise GitError('%s returned %d' % (cmd, rv))
 957
 958
 959 def _git_capture(argv):
 960     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 961     r = p.stdout.read()
 962     _git_wait(repr(argv), p)
 963     return r
 964
 965
 966 class _AbortableIter:
 967     def __init__(self, it, onabort = None):
 968         self.it = it
 969         self.onabort = onabort
 970         self.done = None
 971
 972     def __iter__(self):
 973         return self
 974
 975     def next(self):
 976         try:
 977             return self.it.next()
 978         except StopIteration, e:
 979             self.done = True
 980             raise
 981         except:
 982             self.abort()
 983             raise
 984
 985     def abort(self):
 986         """Abort iteration and call the abortion callback, if needed."""
 987         if not self.done:
 988             self.done = True
 989             if self.onabort:
 990                 self.onabort()
 991
 992     def __del__(self):
 993         self.abort()
 994
 995
 996 _ver_warned = 0
 997 class CatPipe:
 998     """Link to 'git cat-file' that is used to retrieve blob data."""
 999     def __init__(self):
1000         global _ver_warned
1001         wanted = ('1','5','6')
1002         if ver() < wanted:
1003             if not _ver_warned:
1004                 log('warning: git version < %s; bup will be slow.\n'
1005                     % '.'.join(wanted))
1006                 _ver_warned = 1
1007             self.get = self._slow_get
1008         else:
1009             self.p = self.inprogress = None
1010             self.get = self._fast_get
1011
1012     def _abort(self):
1013         if self.p:
1014             self.p.stdout.close()
1015             self.p.stdin.close()
1016         self.p = None
1017         self.inprogress = None
1018
1019     def _restart(self):
1020         self._abort()
1021         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1022                                   stdin=subprocess.PIPE,
1023                                   stdout=subprocess.PIPE,
1024                                   close_fds = True,
1025                                   bufsize = 4096,
1026                                   preexec_fn = _gitenv)
1027
1028     def _fast_get(self, id):
1029         if not self.p or self.p.poll() != None:
1030             self._restart()
1031         assert(self.p)
1032         assert(self.p.poll() == None)
1033         if self.inprogress:
1034             log('_fast_get: opening %r while %r is open'
1035                 % (id, self.inprogress))
1036         assert(not self.inprogress)
1037         assert(id.find('\n') < 0)
1038         assert(id.find('\r') < 0)
1039         assert(not id.startswith('-'))
1040         self.inprogress = id
1041         self.p.stdin.write('%s\n' % id)
1042         self.p.stdin.flush()
1043         hdr = self.p.stdout.readline()
1044         if hdr.endswith(' missing\n'):
1045             self.inprogress = None
1046             raise KeyError('blob %r is missing' % id)
1047         spl = hdr.split(' ')
1048         if len(spl) != 3 or len(spl[0]) != 40:
1049             raise GitError('expected blob, got %r' % spl)
1050         (hex, type, size) = spl
1051
1052         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1053                            onabort = self._abort)
1054         try:
1055             yield type
1056             for blob in it:
1057                 yield blob
1058             assert(self.p.stdout.readline() == '\n')
1059             self.inprogress = None
1060         except Exception, e:
1061             it.abort()
1062             raise
1063
1064     def _slow_get(self, id):
1065         assert(id.find('\n') < 0)
1066         assert(id.find('\r') < 0)
1067         assert(id[0] != '-')
1068         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1069         yield type
1070
1071         p = subprocess.Popen(['git', 'cat-file', type, id],
1072                              stdout=subprocess.PIPE,
1073                              preexec_fn = _gitenv)
1074         for blob in chunkyreader(p.stdout):
1075             yield blob
1076         _git_wait('git cat-file', p)
1077
1078     def _join(self, it):
1079         type = it.next()
1080         if type == 'blob':
1081             for blob in it:
1082                 yield blob
1083         elif type == 'tree':
1084             treefile = ''.join(it)
1085             for (mode, name, sha) in treeparse(treefile):
1086                 for blob in self.join(sha.encode('hex')):
1087                     yield blob
1088         elif type == 'commit':
1089             treeline = ''.join(it).split('\n')[0]
1090             assert(treeline.startswith('tree '))
1091             for blob in self.join(treeline[5:]):
1092                 yield blob
1093         else:
1094             raise GitError('invalid object type %r: expected blob/tree/commit'
1095                            % type)
1096
1097     def join(self, id):
1098         """Generate a list of the content of all blobs that can be reached
1099         from an object.  The hash given in 'id' must point to a blob, a tree
1100         or a commit. The content of all blobs that can be seen from trees or
1101         commits will be added to the list.
1102         """
1103         try:
1104             for d in self._join(self.get(id)):
1105                 yield d
1106         except StopIteration:
1107             log('booger!\n')
1108
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Only refs under refs/tags/ (as reported by list_refs()) are included,
    with that prefix stripped from the names.
    """
    prefix = 'refs/tags/'
    by_commit = {}
    for (refname, commit) in list_refs():
        if not refname.startswith(prefix):
            continue
        if commit not in by_commit:
            by_commit[commit] = []
        # several tags may share one target, hence a list per hash
        by_commit[commit].append(refname[len(prefix):])
    return by_commit