lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from collections import namedtuple
   7
   8 from bup.helpers import *
   9 from bup import _helpers, path, midx, bloom, xstat
  10
# PackWriter._write() starts a new pack (via breakpoint()) once the bytes
# written to the current pack reach this limit.
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
# PackWriter._write() also starts a new pack once this many objects have
# been written to the current one.
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object

# When true, PackWriter._write() logs a '>' for each object it writes.
verbose = 0
# When true, PackIdxList.refresh() always behaves as if skip_midx were True.
ignore_midx = 0
# Path of the repository; repo() raises GitError while this is unset.
repodir = None

# Object type name -> numeric type code stored in pack object headers.
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
# Inverse of _typemap: numeric type code -> object type name.
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Statistics counters updated by PackIdx._idx_from_hash() and
# PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  23
  24
  25 class GitError(Exception):
  26     pass
  27
  28
  29 def parse_tz_offset(s):
  30     """UTC offset in seconds."""
  31     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  32     if s[0] == '-':
  33         return - tz_off
  34     return tz_off
  35
  36
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
# Characters allowed as the first and last character of a name/mail string.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
# Characters allowed in the interior of a name/mail string.
_content_char = r'[^\0\n<>]'
# Either one or two start/end characters, or a start character, any number
# of content characters, and an end character.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Sign, two hour digits, two minute digits (minutes limited to 00-59).
_tz_rx = r'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Matches a whole commit object: the tree line, zero or more parent lines,
# the author and committer lines, a blank line, then the message.  The
# literal newlines inside the pattern are significant.
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Used by parse_commit() to pull each parent hash out of the 'parents' group.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')


# Result type of parse_commit().
# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
# The *_offset fields are produced by parse_tz_offset(), i.e. seconds.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
  63
  64 def parse_commit(content):
  65     commit_match = re.match(_commit_rx, content)
  66     if not commit_match:
  67         raise Exception('cannot parse commit %r' % content)
  68     matches = commit_match.groupdict()
  69     return CommitInfo(tree=matches['tree'],
  70                       parents=re.findall(_parent_hash_rx, matches['parents']),
  71                       author_name=matches['author_name'],
  72                       author_mail=matches['author_mail'],
  73                       author_sec=int(matches['asec']),
  74                       author_offset=parse_tz_offset(matches['atz']),
  75                       committer_name=matches['committer_name'],
  76                       committer_mail=matches['committer_mail'],
  77                       committer_sec=int(matches['csec']),
  78                       committer_offset=parse_tz_offset(matches['ctz']),
  79                       message=matches['message'])
  80
  81
  82 def get_commit_items(id, cp):
  83     commit_it = cp.get(id)
  84     assert(commit_it.next() == 'commit')
  85     commit_content = ''.join(commit_it)
  86     return parse_commit(commit_content)
  87
  88
  89 def repo(sub = ''):
  90     """Get the path to the git repository or one of its subdirectories."""
  91     global repodir
  92     if not repodir:
  93         raise GitError('You should call check_repo_or_die()')
  94
  95     # If there's a .git subdirectory, then the actual repo is in there.
  96     gd = os.path.join(repodir, '.git')
  97     if os.path.exists(gd):
  98         repodir = gd
  99
 100     return os.path.join(repodir, sub)
 101
 102
 103 def shorten_hash(s):
 104     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 105                   r'\1\2*\3', s)
 106
 107
 108 def repo_rel(path):
 109     full = os.path.abspath(path)
 110     fullrepo = os.path.abspath(repo(''))
 111     if not fullrepo.endswith('/'):
 112         fullrepo += '/'
 113     if full.startswith(fullrepo):
 114         path = full[len(fullrepo):]
 115     if path.startswith('index-cache/'):
 116         path = path[len('index-cache/'):]
 117     return shorten_hash(path)
 118
 119
 120 def all_packdirs():
 121     paths = [repo('objects/pack')]
 122     paths += glob.glob(repo('index-cache/*/.'))
 123     return paths
 124
 125
 126 def auto_midx(objdir):
 127     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 128     try:
 129         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 130     except OSError, e:
 131         # make sure 'args' gets printed to help with debugging
 132         add_error('%r: exception: %s' % (args, e))
 133         raise
 134     if rv:
 135         add_error('%r: returned %d' % (args, rv))
 136
 137     args = [path.exe(), 'bloom', '--dir', objdir]
 138     try:
 139         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 140     except OSError, e:
 141         # make sure 'args' gets printed to help with debugging
 142         add_error('%r: exception: %s' % (args, e))
 143         raise
 144     if rv:
 145         add_error('%r: returned %d' % (args, rv))
 146
 147
 148 def mangle_name(name, mode, gitmode):
 149     """Mangle a file name to present an abstract name for segmented files.
 150     Mangled file names will have the ".bup" extension added to them. If a
 151     file's name already ends with ".bup", a ".bupl" extension is added to
 152     disambiguate normal files from semgmented ones.
 153     """
 154     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 155         return name + '.bup'
 156     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 157         return name + '.bupl'
 158     else:
 159         return name
 160
 161
 162 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 163 def demangle_name(name):
 164     """Remove name mangling from a file name, if necessary.
 165
 166     The return value is a tuple (demangled_filename,mode), where mode is one of
 167     the following:
 168
 169     * BUP_NORMAL  : files that should be read as-is from the repository
 170     * BUP_CHUNKED : files that were chunked and need to be assembled
 171
 172     For more information on the name mangling algorythm, see mangle_name()
 173     """
 174     if name.endswith('.bupl'):
 175         return (name[:-5], BUP_NORMAL)
 176     elif name.endswith('.bup'):
 177         return (name[:-4], BUP_CHUNKED)
 178     else:
 179         return (name, BUP_NORMAL)
 180
 181
 182 def calc_hash(type, content):
 183     """Calculate some content's hash in the Git fashion."""
 184     header = '%s %d\0' % (type, len(content))
 185     sum = Sha1(header)
 186     sum.update(content)
 187     return sum.digest()
 188
 189
 190 def shalist_item_sort_key(ent):
 191     (mode, name, id) = ent
 192     assert(mode+0 == mode)
 193     if stat.S_ISDIR(mode):
 194         return name + '/'
 195     else:
 196         return name
 197
 198
 199 def tree_encode(shalist):
 200     """Generate a git tree object from (mode,name,hash) tuples."""
 201     shalist = sorted(shalist, key = shalist_item_sort_key)
 202     l = []
 203     for (mode,name,bin) in shalist:
 204         assert(mode)
 205         assert(mode+0 == mode)
 206         assert(name)
 207         assert(len(bin) == 20)
 208         s = '%o %s\0%s' % (mode,name,bin)
 209         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 210         l.append(s)
 211     return ''.join(l)
 212
 213
 214 def tree_decode(buf):
 215     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 216     ofs = 0
 217     while ofs < len(buf):
 218         z = buf.find('\0', ofs)
 219         assert(z > ofs)
 220         spl = buf[ofs:z].split(' ', 1)
 221         assert(len(spl) == 2)
 222         mode,name = spl
 223         sha = buf[z+1:z+1+20]
 224         ofs = z+1+20
 225         yield (int(mode, 8), name, sha)
 226
 227
 228 def _encode_packobj(type, content, compression_level=1):
 229     szout = ''
 230     sz = len(content)
 231     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 232     sz >>= 4
 233     while 1:
 234         if sz: szbits |= 0x80
 235         szout += chr(szbits)
 236         if not sz:
 237             break
 238         szbits = sz & 0x7f
 239         sz >>= 7
 240     if compression_level > 9:
 241         compression_level = 9
 242     elif compression_level < 0:
 243         compression_level = 0
 244     z = zlib.compressobj(compression_level)
 245     yield szout
 246     yield z.compress(content)
 247     yield z.flush()
 248
 249
 250 def _encode_looseobj(type, content, compression_level=1):
 251     z = zlib.compressobj(compression_level)
 252     yield z.compress('%s %d\0' % (type, len(content)))
 253     yield z.compress(content)
 254     yield z.flush()
 255
 256
 257 def _decode_looseobj(buf):
 258     assert(buf);
 259     s = zlib.decompress(buf)
 260     i = s.find('\0')
 261     assert(i > 0)
 262     l = s[:i].split(' ')
 263     type = l[0]
 264     sz = int(l[1])
 265     content = s[i+1:]
 266     assert(type in _typemap)
 267     assert(sz == len(content))
 268     return (type, content)
 269
 270
 271 def _decode_packobj(buf):
 272     assert(buf)
 273     c = ord(buf[0])
 274     type = _typermap[(c & 0x70) >> 4]
 275     sz = c & 0x0f
 276     shift = 4
 277     i = 0
 278     while c & 0x80:
 279         i += 1
 280         c = ord(buf[i])
 281         sz |= (c & 0x7f) << shift
 282         shift += 7
 283         if not (c & 0x80):
 284             break
 285     return (type, zlib.decompress(buf[i+1:]))
 286
 287
 288 class PackIdx:
 289     def __init__(self):
 290         assert(0)
 291
 292     def find_offset(self, hash):
 293         """Get the offset of an object inside the index file."""
 294         idx = self._idx_from_hash(hash)
 295         if idx != None:
 296             return self._ofs_from_idx(idx)
 297         return None
 298
 299     def exists(self, hash, want_source=False):
 300         """Return nonempty if the object exists in this index."""
 301         if hash and (self._idx_from_hash(hash) != None):
 302             return want_source and os.path.basename(self.name) or True
 303         return None
 304
 305     def __len__(self):
 306         return int(self.fanout[255])
 307
 308     def _idx_from_hash(self, hash):
 309         global _total_searches, _total_steps
 310         _total_searches += 1
 311         assert(len(hash) == 20)
 312         b1 = ord(hash[0])
 313         start = self.fanout[b1-1] # range -1..254
 314         end = self.fanout[b1] # range 0..255
 315         want = str(hash)
 316         _total_steps += 1  # lookup table is a step
 317         while start < end:
 318             _total_steps += 1
 319             mid = start + (end-start)/2
 320             v = self._idx_to_hash(mid)
 321             if v < want:
 322                 start = mid+1
 323             elif v > want:
 324                 end = mid
 325             else: # got it!
 326                 return mid
 327         return None
 328
 329
 330 class PackIdxV1(PackIdx):
 331     """Object representation of a Git pack index (version 1) file."""
 332     def __init__(self, filename, f):
 333         self.name = filename
 334         self.idxnames = [self.name]
 335         self.map = mmap_read(f)
 336         self.fanout = list(struct.unpack('!256I',
 337                                          str(buffer(self.map, 0, 256*4))))
 338         self.fanout.append(0)  # entry "-1"
 339         nsha = self.fanout[255]
 340         self.sha_ofs = 256*4
 341         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 342
 343     def _ofs_from_idx(self, idx):
 344         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 345
 346     def _idx_to_hash(self, idx):
 347         return str(self.shatable[idx*24+4 : idx*24+24])
 348
 349     def __iter__(self):
 350         for i in xrange(self.fanout[255]):
 351             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 352
 353
 354 class PackIdxV2(PackIdx):
 355     """Object representation of a Git pack index (version 2) file."""
 356     def __init__(self, filename, f):
 357         self.name = filename
 358         self.idxnames = [self.name]
 359         self.map = mmap_read(f)
 360         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 361         self.fanout = list(struct.unpack('!256I',
 362                                          str(buffer(self.map, 8, 256*4))))
 363         self.fanout.append(0)  # entry "-1"
 364         nsha = self.fanout[255]
 365         self.sha_ofs = 8 + 256*4
 366         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 367         self.ofstable = buffer(self.map,
 368                                self.sha_ofs + nsha*20 + nsha*4,
 369                                nsha*4)
 370         self.ofs64table = buffer(self.map,
 371                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 372
 373     def _ofs_from_idx(self, idx):
 374         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 375         if ofs & 0x80000000:
 376             idx64 = ofs & 0x7fffffff
 377             ofs = struct.unpack('!Q',
 378                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 379         return ofs
 380
 381     def _idx_to_hash(self, idx):
 382         return str(self.shatable[idx*20:(idx+1)*20])
 383
 384     def __iter__(self):
 385         for i in xrange(self.fanout[255]):
 386             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 387
 388
# Number of live PackIdxList instances; the constructor asserts it is 0.
_mpi_count = 0
class PackIdxList:
    """Searchable collection of the pack indexes found in one directory.

    self.packs holds the opened .idx/.midx objects, self.bloom an optional
    bloom filter, and self.also a set of extra hashes registered with
    add().  The constructor asserts that no other instance is alive.
    """
    def __init__(self, dir):
        """Load the indexes found in the directory 'dir'."""
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.refresh()

    def __del__(self):
        """Release this instance's slot in the _mpi_count bookkeeping."""
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the merged contents of all packs (see idxmerge())."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all packs."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        # Hashes registered through add() count as present.
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # The bloom filter reports a possible hit: search the packs
                # and stop consulting the filter until a search misses.
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # Not found in any pack: consult the bloom filter on the next call.
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps file names to the index object that covers them, starting
        # with the packs already loaded (minus midxes when skipping them).
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # Every idx named by an already-loaded midx is covered by it.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open the midx files not loaded yet; one that refers to a
                # missing idx is closed and deleted.
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the largest midx first, and the newest first among
                # midxes of equal size.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        ix.close()
                        unlink(ix.name)
            # Open every .idx file that nothing in d covers yet; files that
            # open_idx() rejects are reported and skipped.
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError, e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # Several names may map to the same object; keep each one once,
            # largest first.
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # Only use the bloom filter when it is valid and has at least as
            # many entries as the packs do.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 518
 519
 520 def open_idx(filename):
 521     if filename.endswith('.idx'):
 522         f = open(filename, 'rb')
 523         header = f.read(8)
 524         if header[0:4] == '\377tOc':
 525             version = struct.unpack('!I', header[4:8])[0]
 526             if version == 2:
 527                 return PackIdxV2(filename, f)
 528             else:
 529                 raise GitError('%s: expected idx file version 2, got %d'
 530                                % (filename, version))
 531         elif len(header) == 8 and header[0:4] < '\377tOc':
 532             return PackIdxV1(filename, f)
 533         else:
 534             raise GitError('%s: unrecognized idx file header' % filename)
 535     elif filename.endswith('.midx'):
 536         return midx.PackMidx(filename)
 537     else:
 538         raise GitError('idx filenames must end with .idx or .midx')
 539
 540
 541 def idxmerge(idxlist, final_progress=True):
 542     """Generate a list of all the objects reachable in a PackIdxList."""
 543     def pfunc(count, total):
 544         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 545                   % (count*100.0/total, count, total))
 546     def pfinal(count, total):
 547         if final_progress:
 548             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 549                      % (100, total, total))
 550     return merge_iter(idxlist, 10024, pfunc, pfinal)
 551
 552
 553 def _make_objcache():
 554     return PackIdxList(repo('objects/pack'))
 555
class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1):
        """Set up the writer; no file is created until the first write.

        objcache_maker is called with no arguments the first time an object
        cache is needed (see _require_objcache()).  compression_level is
        passed on to _encode_packobj() for every object written.
        """
        self.count = 0      # objects written to the current pack
        self.outbytes = 0   # bytes written to the current pack
        self.filename = None  # temporary pack path without the '.pack' suffix
        self.file = None
        self.idx = None     # 256 lists of (sha, crc, offset), by first sha byte
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level

    def __del__(self):
        """Finish the current pack, if any, when the writer is collected."""
        self.close()

    def _open(self):
        """Create the temporary pack file if none is open yet."""
        if not self.file:
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            # Pack header; the 4-byte object count is left as zero here and
            # filled in by _end().
            self.file.write('PACK\0\0\0\2\0\0\0\0')
            self.idx = list(list() for i in xrange(256))

    def _raw_write(self, datalist, sha):
        """Append the joined pieces of datalist to the pack as one object.

        Records the object in the in-memory index under sha and returns
        (bytes_written, crc32).  An IOError from the write is re-raised as
        GitError with the original traceback.
        """
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError, e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        """Record (sha, crc, offset) for the object that was just written.

        The offset is the current file position minus size, so this must be
        called right after the object's bytes were written.
        """
        assert(sha)
        if self.idx:
            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        """Write an object unconditionally and return its sha.

        The sha is computed with calc_hash() when it is not supplied.  A
        new pack is started once the size or object-count limit is reached.
        """
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        """Make sure self.objcache is set, building it with objcache_maker
        if needed; raise GitError when no cache can be obtained."""
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            # _write() may have finished the pack, which drops the object
            # cache, so make sure there is one before adding to it.
            self._require_objcache()
            self.objcache.add(sha)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write('tree', content)

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        """Build the commit text from the given fields and write it.

        tree and parent are binary hashes (hex-encoded here); any header
        whose value is false is left out.  Returns the id of the commit.
        """
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')  # blank line between the headers and the message
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        # The same identity and date are used for author and committer.
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
                                  msg)
        return commit

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            self.idx = None
            self.file = None
            f.close()
            os.unlink(self.filename + '.pack')

    def _end(self, run_midx=True):
        """Finish the current pack and move it into objects/pack.

        Writes the object count and the trailing checksum, writes the idx
        file, and renames both files to their 'pack-<sha>' names.  Returns
        the new path prefix (without extension), or None when no pack is
        open.  When run_midx is true, auto_midx() is run afterwards.
        """
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None
        idx = self.idx
        self.idx = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        for b in chunkyreader(f):
            sum.update(b)
        packbin = sum.digest()
        f.write(packbin)
        f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)

        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        if run_midx:
            auto_midx(repo('objects/pack'))
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a version 2 pack index for the entries in idx to filename.

        packbin is the pack checksum appended to the index.  Returns the
        hex digest of the index's hash table, which _end() uses to name the
        pack.
        """
        # Count the entries whose offset does not fit in 31 bits.
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            idx_map = mmap_readwrite(idx_f, close=False)
            count = _helpers.write_idx(filename, idx_map, idx, self.count)
            assert(count == self.count)
        finally:
            if idx_map: idx_map.close()
            idx_f.close()

        # Reopen in append mode to add the two trailing checksums.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            # The hash table feeds both the whole-index checksum and the
            # digest that is returned.
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            return namebase
        finally:
            idx_f.close()
 759
 760
 761 def _git_date(date):
 762     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 763
 764
 765 def _gitenv():
 766     os.environ['GIT_DIR'] = os.path.abspath(repo())
 767
 768
 769 def list_refs(refname = None):
 770     """Generate a list of tuples in the form (refname,hash).
 771     If a ref name is specified, list only this particular ref.
 772     """
 773     argv = ['git', 'show-ref', '--']
 774     if refname:
 775         argv += [refname]
 776     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 777     out = p.stdout.read().strip()
 778     rv = p.wait()  # not fatal
 779     if rv:
 780         assert(not out)
 781     if out:
 782         for d in out.split('\n'):
 783             (sha, name) = d.split(' ', 1)
 784             yield (name, sha.decode('hex'))
 785
 786
 787 def read_ref(refname):
 788     """Get the commit id of the most recent commit made on a given ref."""
 789     l = list(list_refs(refname))
 790     if l:
 791         assert(len(l) == 1)
 792         return l[0][1]
 793     else:
 794         return None
 795
 796
 797 def rev_list(ref, count=None):
 798     """Generate a list of reachable commits in reverse chronological order.
 799
 800     This generator walks through commits, from child to parent, that are
 801     reachable via the specified ref and yields a series of tuples of the form
 802     (date,hash).
 803
 804     If count is a non-zero integer, limit the number of commits to "count"
 805     objects.
 806     """
 807     assert(not ref.startswith('-'))
 808     opts = []
 809     if count:
 810         opts += ['-n', str(atoi(count))]
 811     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 812     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 813     commit = None
 814     for row in p.stdout:
 815         s = row.strip()
 816         if s.startswith('commit '):
 817             commit = s[7:].decode('hex')
 818         else:
 819             date = int(s)
 820             yield (date, commit)
 821     rv = p.wait()  # not fatal
 822     if rv:
 823         raise GitError, 'git rev-list returned error %d' % rv
 824
 825
 826 def get_commit_dates(refs):
 827     """Get the dates for the specified commit refs.  For now, every unique
 828        string in refs must resolve to a different commit or this
 829        function will fail."""
 830     result = []
 831     for ref in refs:
 832         commit = get_commit_items(ref, cp())
 833         result.append(commit.author_sec)
 834     return result
 835
 836
 837 def rev_parse(committish):
 838     """Resolve the full hash for 'committish', if it exists.
 839
 840     Should be roughly equivalent to 'git rev-parse'.
 841
 842     Returns the hex value of the hash if it is found, None if 'committish' does
 843     not correspond to anything.
 844     """
 845     head = read_ref(committish)
 846     if head:
 847         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 848         return head
 849
 850     pL = PackIdxList(repo('objects/pack'))
 851
 852     if len(committish) == 40:
 853         try:
 854             hash = committish.decode('hex')
 855         except TypeError:
 856             return None
 857
 858         if pL.exists(hash):
 859             return hash
 860
 861     return None
 862
 863
 864 def update_ref(refname, newval, oldval):
 865     """Change the commit pointed to by a branch."""
 866     if not oldval:
 867         oldval = ''
 868     assert(refname.startswith('refs/heads/'))
 869     p = subprocess.Popen(['git', 'update-ref', refname,
 870                           newval.encode('hex'), oldval.encode('hex')],
 871                          preexec_fn = _gitenv)
 872     _git_wait('git update-ref', p)
 873
 874
 875 def guess_repo(path=None):
 876     """Set the path value in the global variable "repodir".
 877     This makes bup look for an existing bup repository, but not fail if a
 878     repository doesn't exist. Usually, if you are interacting with a bup
 879     repository, you would not be calling this function but using
 880     check_repo_or_die().
 881     """
 882     global repodir
 883     if path:
 884         repodir = path
 885     if not repodir:
 886         repodir = os.environ.get('BUP_DIR')
 887         if not repodir:
 888             repodir = os.path.expanduser('~/.bup')
 889
 890
 891 def init_repo(path=None):
 892     """Create the Git bare repository for bup in a given path."""
 893     guess_repo(path)
 894     d = repo()  # appends a / to the path
 895     parent = os.path.dirname(os.path.dirname(d))
 896     if parent and not os.path.exists(parent):
 897         raise GitError('parent directory "%s" does not exist\n' % parent)
 898     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 899         raise GitError('"%s" exists but is not a directory\n' % d)
 900     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 901                          preexec_fn = _gitenv)
 902     _git_wait('git init', p)
 903     # Force the index version configuration in order to ensure bup works
 904     # regardless of the version of the installed Git binary.
 905     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 906                          stdout=sys.stderr, preexec_fn = _gitenv)
 907     _git_wait('git config', p)
 908     # Enable the reflog
 909     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 910                          stdout=sys.stderr, preexec_fn = _gitenv)
 911     _git_wait('git config', p)
 912
 913
 914 def check_repo_or_die(path=None):
 915     """Make sure a bup repository exists, and abort if not.
 916     If the path to a particular repository was not specified, this function
 917     initializes the default repository automatically.
 918     """
 919     guess_repo(path)
 920     try:
 921         os.stat(repo('objects/pack/.'))
 922     except OSError, e:
 923         if e.errno == errno.ENOENT:
 924             log('error: %r is not a bup repository; run "bup init"\n'
 925                 % repo())
 926             sys.exit(15)
 927         else:
 928             log('error: %s\n' % e)
 929             sys.exit(14)
 930
 931
 932 _ver = None
 933 def ver():
 934     """Get Git's version and ensure a usable version is installed.
 935
 936     The returned version is formatted as an ordered tuple with each position
 937     representing a digit in the version tag. For example, the following tuple
 938     would represent version 1.6.6.9:
 939
 940         ('1', '6', '6', '9')
 941     """
 942     global _ver
 943     if not _ver:
 944         p = subprocess.Popen(['git', '--version'],
 945                              stdout=subprocess.PIPE)
 946         gvs = p.stdout.read()
 947         _git_wait('git --version', p)
 948         m = re.match(r'git version (\S+.\S+)', gvs)
 949         if not m:
 950             raise GitError('git --version weird output: %r' % gvs)
 951         _ver = tuple(m.group(1).split('.'))
 952     needed = ('1','5', '3', '1')
 953     if _ver < needed:
 954         raise GitError('git version %s or higher is required; you have %s'
 955                        % ('.'.join(needed), '.'.join(_ver)))
 956     return _ver
 957
 958
 959 def _git_wait(cmd, p):
 960     rv = p.wait()
 961     if rv != 0:
 962         raise GitError('%s returned %d' % (cmd, rv))
 963
 964
 965 def _git_capture(argv):
 966     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 967     r = p.stdout.read()
 968     _git_wait(repr(argv), p)
 969     return r
 970
 971
 972 class _AbortableIter:
 973     def __init__(self, it, onabort = None):
 974         self.it = it
 975         self.onabort = onabort
 976         self.done = None
 977
 978     def __iter__(self):
 979         return self
 980
 981     def next(self):
 982         try:
 983             return self.it.next()
 984         except StopIteration, e:
 985             self.done = True
 986             raise
 987         except:
 988             self.abort()
 989             raise
 990
 991     def abort(self):
 992         """Abort iteration and call the abortion callback, if needed."""
 993         if not self.done:
 994             self.done = True
 995             if self.onabort:
 996                 self.onabort()
 997
 998     def __del__(self):
 999         self.abort()
1000
1001
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    Depending on the installed git version, self.get is bound either to
    _fast_get (one long-running 'git cat-file --batch' process) or to
    _slow_get (separate 'git cat-file' processes for every object).  Both
    are generators that yield the object type first and the object's data,
    in chunks, afterwards.
    """
    def __init__(self):
        global _ver_warned
        wanted = ('1','5','6')
        if ver() < wanted:
            if not _ver_warned:
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            # p: the running 'git cat-file --batch' process, if any.
            # inprogress: id of the object currently being read, if any.
            self.p = self.inprogress = None
            self.get = self._fast_get

    def _abort(self):
        """Close the pipes of the batch process, if any, and forget it."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        """Drop any current batch process and start a fresh one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv)

    def _fast_get(self, id):
        """Yield the type and then the data of object 'id' via --batch.

        Raises KeyError if git reports the object as missing and GitError
        if the header line returned by git is malformed.  Only one object
        may be read at a time; starting a second read while one is still in
        progress trips an assertion.
        """
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        # The id is written to git's stdin as a single line, so it must not
        # contain line breaks, and must not look like an option.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            # The stream is in an unknown state; drop the process so that
            # the next request starts a fresh one instead of finding this
            # object still marked as in progress.
            self._abort()
            raise GitError('expected blob, got %r' % spl)
        (_oid_hex, objtype, size) = spl

        it = _AbortableIter(chunkyreader(self.p.stdout, int(size)),
                            onabort = self._abort)
        try:
            yield objtype
            for blob in it:
                yield blob
            # The object's data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type and then the data of object 'id'.

        One 'git cat-file' process is run to find the type and another one
        to read the contents.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        objtype = _git_capture(['git', 'cat-file', '-t', id]).strip()
        yield objtype

        p = subprocess.Popen(['git', 'cat-file', objtype, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield the blob data reachable from the object described by 'it'.

        'it' is an iterator as produced by self.get(): the object type
        first, then the object's data.  Trees and commits are followed
        recursively through join().  Raises GitError for any other type.
        """
        objtype = next(it)
        if objtype == 'blob':
            for blob in it:
                yield blob
        elif objtype == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif objtype == 'commit':
            # The first line of a commit names its tree.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % objtype)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1116
1117
_cp = (None, None)

def cp():
    """Return a CatPipe for the current repository.

    The most recently created CatPipe is cached in the global _cp together
    with the real path of the repository it was created for; it is reused
    as long as repo() still resolves to that same path, and replaced by a
    new CatPipe otherwise.
    """
    global _cp
    (cached_dir, pipe) = _cp
    repo_dir = os.path.realpath(repo())
    if repo_dir == cached_dir:
        return pipe
    pipe = CatPipe()
    _cp = (repo_dir, pipe)
    return pipe
1129
1130
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Only refs whose name starts with 'refs/tags/' are considered; that
    prefix is stripped from the names stored in the dictionary.
    """
    prefix = 'refs/tags/'
    by_commit = {}
    for (refname, sha) in list_refs():
        if not refname.startswith(prefix):
            continue
        # more than one tag can point at the same commit
        by_commit.setdefault(sha, []).append(refname[len(prefix):])
    return by_commit