lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from collections import namedtuple
   7
   8 from bup.helpers import *
   9 from bup import _helpers, path, midx, bloom, xstat
  10
# Thresholds checked by PackWriter._write(): once either one is reached the
# current pack is finished and a new one is started.
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object

# When true, PackWriter._write() logs a '>' for every object written.
verbose = 0
# When true, PackIdxList.refresh() always behaves as if skip_midx were True.
ignore_midx = 0
# Default repository directory used by repo() when no repo_dir is passed.
repodir = None

# Object type name <-> numeric type code used in pack object headers
# (see _encode_packobj() / _decode_packobj()).
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Lookup statistics updated by PackIdx._idx_from_hash() and
# PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  23
  24
  25 class GitError(Exception):
  26     pass
  27
  28
  29 def parse_tz_offset(s):
  30     """UTC offset in seconds."""
  31     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  32     if s[0] == '-':
  33         return - tz_off
  34     return tz_off
  35
  36
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Character allowed as the first or last character of a name/mail field.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
# Character allowed anywhere inside a name/mail field.
_content_char = r'[^\0\n<>]'
# A "safe" string: either one or two start/end characters, or a start/end
# character, any run of content characters, and a final start/end character.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone field: sign, two hour digits, two minute digits (minutes 00-59).
_tz_rx = r'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Whole commit object: tree line, zero or more parent lines, author line,
# committer line, a blank line, then the message (everything that remains).
# Used by parse_commit(); the named groups are the fields it extracts.
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Extracts each parent hash from the text captured by the 'parents' group.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  54
  55
  56 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
  57 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
  58                                        'author_name', 'author_mail',
  59                                        'author_sec', 'author_offset',
  60                                        'committer_name', 'committer_mail',
  61                                        'committer_sec', 'committer_offset',
  62                                        'message'])
  63
  64 def parse_commit(content):
  65     commit_match = re.match(_commit_rx, content)
  66     if not commit_match:
  67         raise Exception('cannot parse commit %r' % content)
  68     matches = commit_match.groupdict()
  69     return CommitInfo(tree=matches['tree'],
  70                       parents=re.findall(_parent_hash_rx, matches['parents']),
  71                       author_name=matches['author_name'],
  72                       author_mail=matches['author_mail'],
  73                       author_sec=int(matches['asec']),
  74                       author_offset=parse_tz_offset(matches['atz']),
  75                       committer_name=matches['committer_name'],
  76                       committer_mail=matches['committer_mail'],
  77                       committer_sec=int(matches['csec']),
  78                       committer_offset=parse_tz_offset(matches['ctz']),
  79                       message=matches['message'])
  80
  81
  82 def get_commit_items(id, cp):
  83     commit_it = cp.get(id)
  84     assert(commit_it.next() == 'commit')
  85     commit_content = ''.join(commit_it)
  86     return parse_commit(commit_content)
  87
  88
  89 def repo(sub = '', repo_dir=None):
  90     """Get the path to the git repository or one of its subdirectories."""
  91     global repodir
  92     repo_dir = repo_dir or repodir
  93     if not repo_dir:
  94         raise GitError('You should call check_repo_or_die()')
  95
  96     # If there's a .git subdirectory, then the actual repo is in there.
  97     gd = os.path.join(repo_dir, '.git')
  98     if os.path.exists(gd):
  99         repodir = gd
 100
 101     return os.path.join(repo_dir, sub)
 102
 103
 104 def shorten_hash(s):
 105     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 106                   r'\1\2*\3', s)
 107
 108
 109 def repo_rel(path):
 110     full = os.path.abspath(path)
 111     fullrepo = os.path.abspath(repo(''))
 112     if not fullrepo.endswith('/'):
 113         fullrepo += '/'
 114     if full.startswith(fullrepo):
 115         path = full[len(fullrepo):]
 116     if path.startswith('index-cache/'):
 117         path = path[len('index-cache/'):]
 118     return shorten_hash(path)
 119
 120
 121 def all_packdirs():
 122     paths = [repo('objects/pack')]
 123     paths += glob.glob(repo('index-cache/*/.'))
 124     return paths
 125
 126
 127 def auto_midx(objdir):
 128     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 129     try:
 130         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 131     except OSError, e:
 132         # make sure 'args' gets printed to help with debugging
 133         add_error('%r: exception: %s' % (args, e))
 134         raise
 135     if rv:
 136         add_error('%r: returned %d' % (args, rv))
 137
 138     args = [path.exe(), 'bloom', '--dir', objdir]
 139     try:
 140         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 141     except OSError, e:
 142         # make sure 'args' gets printed to help with debugging
 143         add_error('%r: exception: %s' % (args, e))
 144         raise
 145     if rv:
 146         add_error('%r: returned %d' % (args, rv))
 147
 148
 149 def mangle_name(name, mode, gitmode):
 150     """Mangle a file name to present an abstract name for segmented files.
 151     Mangled file names will have the ".bup" extension added to them. If a
 152     file's name already ends with ".bup", a ".bupl" extension is added to
 153     disambiguate normal files from segmented ones.
 154     """
 155     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 156         assert(stat.S_ISDIR(gitmode))
 157         return name + '.bup'
 158     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 159         return name + '.bupl'
 160     else:
 161         return name
 162
 163
 164 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 165 def demangle_name(name):
 166     """Remove name mangling from a file name, if necessary.
 167
 168     The return value is a tuple (demangled_filename,mode), where mode is one of
 169     the following:
 170
 171     * BUP_NORMAL  : files that should be read as-is from the repository
 172     * BUP_CHUNKED : files that were chunked and need to be reassembled
 173
 174     For more information on the name mangling algorithm, see mangle_name()
 175     """
 176     if name.endswith('.bupl'):
 177         return (name[:-5], BUP_NORMAL)
 178     elif name.endswith('.bup'):
 179         return (name[:-4], BUP_CHUNKED)
 180     else:
 181         return (name, BUP_NORMAL)
 182
 183
 184 def calc_hash(type, content):
 185     """Calculate some content's hash in the Git fashion."""
 186     header = '%s %d\0' % (type, len(content))
 187     sum = Sha1(header)
 188     sum.update(content)
 189     return sum.digest()
 190
 191
 192 def shalist_item_sort_key(ent):
 193     (mode, name, id) = ent
 194     assert(mode+0 == mode)
 195     if stat.S_ISDIR(mode):
 196         return name + '/'
 197     else:
 198         return name
 199
 200
 201 def tree_encode(shalist):
 202     """Generate a git tree object from (mode,name,hash) tuples."""
 203     shalist = sorted(shalist, key = shalist_item_sort_key)
 204     l = []
 205     for (mode,name,bin) in shalist:
 206         assert(mode)
 207         assert(mode+0 == mode)
 208         assert(name)
 209         assert(len(bin) == 20)
 210         s = '%o %s\0%s' % (mode,name,bin)
 211         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 212         l.append(s)
 213     return ''.join(l)
 214
 215
 216 def tree_decode(buf):
 217     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 218     ofs = 0
 219     while ofs < len(buf):
 220         z = buf.find('\0', ofs)
 221         assert(z > ofs)
 222         spl = buf[ofs:z].split(' ', 1)
 223         assert(len(spl) == 2)
 224         mode,name = spl
 225         sha = buf[z+1:z+1+20]
 226         ofs = z+1+20
 227         yield (int(mode, 8), name, sha)
 228
 229
 230 def _encode_packobj(type, content, compression_level=1):
 231     szout = ''
 232     sz = len(content)
 233     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 234     sz >>= 4
 235     while 1:
 236         if sz: szbits |= 0x80
 237         szout += chr(szbits)
 238         if not sz:
 239             break
 240         szbits = sz & 0x7f
 241         sz >>= 7
 242     if compression_level > 9:
 243         compression_level = 9
 244     elif compression_level < 0:
 245         compression_level = 0
 246     z = zlib.compressobj(compression_level)
 247     yield szout
 248     yield z.compress(content)
 249     yield z.flush()
 250
 251
 252 def _encode_looseobj(type, content, compression_level=1):
 253     z = zlib.compressobj(compression_level)
 254     yield z.compress('%s %d\0' % (type, len(content)))
 255     yield z.compress(content)
 256     yield z.flush()
 257
 258
 259 def _decode_looseobj(buf):
 260     assert(buf);
 261     s = zlib.decompress(buf)
 262     i = s.find('\0')
 263     assert(i > 0)
 264     l = s[:i].split(' ')
 265     type = l[0]
 266     sz = int(l[1])
 267     content = s[i+1:]
 268     assert(type in _typemap)
 269     assert(sz == len(content))
 270     return (type, content)
 271
 272
 273 def _decode_packobj(buf):
 274     assert(buf)
 275     c = ord(buf[0])
 276     type = _typermap[(c & 0x70) >> 4]
 277     sz = c & 0x0f
 278     shift = 4
 279     i = 0
 280     while c & 0x80:
 281         i += 1
 282         c = ord(buf[i])
 283         sz |= (c & 0x7f) << shift
 284         shift += 7
 285         if not (c & 0x80):
 286             break
 287     return (type, zlib.decompress(buf[i+1:]))
 288
 289
 290 class PackIdx:
 291     def __init__(self):
 292         assert(0)
 293
 294     def find_offset(self, hash):
 295         """Get the offset of an object inside the index file."""
 296         idx = self._idx_from_hash(hash)
 297         if idx != None:
 298             return self._ofs_from_idx(idx)
 299         return None
 300
 301     def exists(self, hash, want_source=False):
 302         """Return nonempty if the object exists in this index."""
 303         if hash and (self._idx_from_hash(hash) != None):
 304             return want_source and os.path.basename(self.name) or True
 305         return None
 306
 307     def __len__(self):
 308         return int(self.fanout[255])
 309
 310     def _idx_from_hash(self, hash):
 311         global _total_searches, _total_steps
 312         _total_searches += 1
 313         assert(len(hash) == 20)
 314         b1 = ord(hash[0])
 315         start = self.fanout[b1-1] # range -1..254
 316         end = self.fanout[b1] # range 0..255
 317         want = str(hash)
 318         _total_steps += 1  # lookup table is a step
 319         while start < end:
 320             _total_steps += 1
 321             mid = start + (end-start)/2
 322             v = self._idx_to_hash(mid)
 323             if v < want:
 324                 start = mid+1
 325             elif v > want:
 326                 end = mid
 327             else: # got it!
 328                 return mid
 329         return None
 330
 331
 332 class PackIdxV1(PackIdx):
 333     """Object representation of a Git pack index (version 1) file."""
 334     def __init__(self, filename, f):
 335         self.name = filename
 336         self.idxnames = [self.name]
 337         self.map = mmap_read(f)
 338         self.fanout = list(struct.unpack('!256I',
 339                                          str(buffer(self.map, 0, 256*4))))
 340         self.fanout.append(0)  # entry "-1"
 341         nsha = self.fanout[255]
 342         self.sha_ofs = 256*4
 343         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 344
 345     def _ofs_from_idx(self, idx):
 346         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 347
 348     def _idx_to_hash(self, idx):
 349         return str(self.shatable[idx*24+4 : idx*24+24])
 350
 351     def __iter__(self):
 352         for i in xrange(self.fanout[255]):
 353             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 354
 355
 356 class PackIdxV2(PackIdx):
 357     """Object representation of a Git pack index (version 2) file."""
 358     def __init__(self, filename, f):
 359         self.name = filename
 360         self.idxnames = [self.name]
 361         self.map = mmap_read(f)
 362         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 363         self.fanout = list(struct.unpack('!256I',
 364                                          str(buffer(self.map, 8, 256*4))))
 365         self.fanout.append(0)  # entry "-1"
 366         nsha = self.fanout[255]
 367         self.sha_ofs = 8 + 256*4
 368         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 369         self.ofstable = buffer(self.map,
 370                                self.sha_ofs + nsha*20 + nsha*4,
 371                                nsha*4)
 372         self.ofs64table = buffer(self.map,
 373                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 374
 375     def _ofs_from_idx(self, idx):
 376         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 377         if ofs & 0x80000000:
 378             idx64 = ofs & 0x7fffffff
 379             ofs = struct.unpack('!Q',
 380                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 381         return ofs
 382
 383     def _idx_to_hash(self, idx):
 384         return str(self.shatable[idx*20:(idx+1)*20])
 385
 386     def __iter__(self):
 387         for i in xrange(self.fanout[255]):
 388             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 389
 390
# Number of live PackIdxList instances; the constructor and destructor assert
# that at most one exists at a time.
_mpi_count = 0
class PackIdxList:
    """Searchable collection of all the pack indexes found in one directory.

    Combines the .idx and .midx files in 'dir' (plus an optional bup.bloom
    filter) and also remembers hashes added with add().
    """
    def __init__(self, dir):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()      # extra hashes registered through add()
        self.packs = []        # index objects, filled in by refresh()
        self.do_bloom = False  # whether exists() consults the bloom filter
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the merged contents of all indexes (see idxmerge())."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all the indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Hashes registered with add() return True.  Otherwise the result is
        whatever the matching index's exists() returns, or None if no index
        has the hash.
        """
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Bloom says "maybe": search the indexes, and skip the bloom
                # filter until a lookup misses again (see end of method).
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # Not found anywhere: consult the bloom filter first next time.
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file path to the index object that covers it; start from
        # the indexes already loaded (dropping midx objects if skipping them).
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # Mark every .idx named by an already-loaded midx as covered
                # by that midx.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open the .midx files not loaded yet; delete any that refer
                # to an .idx file that no longer exists.
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the biggest (then the newest) midx files first.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        # Everything in this midx is already covered by
                        # another midx, so the file is removed.
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        ix.close()
                        unlink(ix.name)
            # Load every .idx file that nothing above covers; an unreadable
            # index is reported and skipped.
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError, e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # Several paths can map to the same midx object; keep each index
            # once, largest first.
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # Only use the bloom filter if it is valid and holds at least as
            # many entries as the indexes do.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 520
 521
 522 def open_idx(filename):
 523     if filename.endswith('.idx'):
 524         f = open(filename, 'rb')
 525         header = f.read(8)
 526         if header[0:4] == '\377tOc':
 527             version = struct.unpack('!I', header[4:8])[0]
 528             if version == 2:
 529                 return PackIdxV2(filename, f)
 530             else:
 531                 raise GitError('%s: expected idx file version 2, got %d'
 532                                % (filename, version))
 533         elif len(header) == 8 and header[0:4] < '\377tOc':
 534             return PackIdxV1(filename, f)
 535         else:
 536             raise GitError('%s: unrecognized idx file header' % filename)
 537     elif filename.endswith('.midx'):
 538         return midx.PackMidx(filename)
 539     else:
 540         raise GitError('idx filenames must end with .idx or .midx')
 541
 542
 543 def idxmerge(idxlist, final_progress=True):
 544     """Generate a list of all the objects reachable in a PackIdxList."""
 545     def pfunc(count, total):
 546         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 547                   % (count*100.0/total, count, total))
 548     def pfinal(count, total):
 549         if final_progress:
 550             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 551                      % (100, total, total))
 552     return merge_iter(idxlist, 10024, pfunc, pfinal)
 553
 554
 555 def _make_objcache():
 556     return PackIdxList(repo('objects/pack'))
 557
class PackWriter:
    """Writes Git objects inside a pack file.

    Objects are appended to a temporary pack under the repository's
    'objects' directory.  When the pack is finished (see _end()) a version 2
    index is written next to it and both files are renamed into
    'objects/pack'.
    """
    def __init__(self, objcache_maker=_make_objcache, compression_level=1):
        self.count = 0          # objects written to the current pack
        self.outbytes = 0       # bytes written to the current pack
        self.filename = None    # temporary pack path without '.pack'
        self.file = None        # open pack file, or None
        self.idx = None         # 256 lists of (sha, crc, offset) entries
        self.objcache_maker = objcache_maker
        self.objcache = None    # created lazily by _require_objcache()
        self.compression_level = compression_level

    def __del__(self):
        self.close()

    def _open(self):
        """Create the temporary pack file if none is open yet."""
        if not self.file:
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            # 'PACK' magic, version 2, and a zero object count; _end()
            # rewrites the count (at offset 8) once it is known.
            self.file.write('PACK\0\0\0\2\0\0\0\0')
            # One bucket of index entries per possible first hash byte.
            self.idx = list(list() for i in xrange(256))

    def _raw_write(self, datalist, sha):
        """Append the joined datalist to the pack and index it under sha.

        Returns (bytes_written, crc32).  An IOError from the write is
        re-raised as GitError.
        """
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError, e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        """Record (sha, crc, start offset) for the object just written."""
        assert(sha)
        if self.idx:
            # The file position is just past the object, so subtracting its
            # size gives the offset where it starts.
            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        """Write one object unconditionally and return its sha.

        The sha is computed when not supplied.  A new pack is started once
        max_pack_size or max_pack_objects is reached.
        """
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        """Create the object cache if needed; raise GitError if unavailable."""
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            # _write() may have finished the pack, which drops the object
            # cache (see _end()), so make sure one exists before adding.
            self._require_objcache()
            self.objcache.add(sha)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write('tree', content)

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        """Build a commit from the given fields and write it if missing.

        tree and parent are binary hashes; header lines whose value is empty
        are left out.  Returns the commit's id.
        """
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')  # blank line separating the headers from the message
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        # The same identity and date are used for author and committer.
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
                                  msg)
        return commit

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            self.idx = None
            self.file = None
            f.close()
            os.unlink(self.filename + '.pack')

    def _end(self, run_midx=True):
        """Finish the current pack, write its index and move both in place.

        Returns the final path prefix (without extension), or None if no
        pack was open.  The object cache is dropped.  When run_midx is true,
        auto_midx() is run on objects/pack afterwards.
        """
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None
        idx = self.idx
        self.idx = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        for b in chunkyreader(f):
            sum.update(b)
        packbin = sum.digest()
        f.write(packbin)
        f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)

        # The final file names are derived from the hash of the object list.
        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        if run_midx:
            auto_midx(repo('objects/pack'))
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a version 2 index for the pack to filename.

        idx holds the per-first-byte lists of (sha, crc, offset) entries and
        packbin is the pack's SHA-1 digest.  Returns the hex SHA-1 of the
        index's hash table, which _end() uses to name the pack.
        """
        # Offsets of 2**31 or more need an entry in the 64-bit offset table.
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            idx_map = mmap_readwrite(idx_f, close=False)
            count = _helpers.write_idx(filename, idx_map, idx, self.count)
            assert(count == self.count)
        finally:
            if idx_map: idx_map.close()
            idx_f.close()

        # Reopen to append the trailer (pack checksum, then the index's own
        # checksum), hashing what was written above along the way.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            # The next 20*count bytes are the hash table; its SHA-1 becomes
            # the returned name.
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            return namebase
        finally:
            idx_f.close()
 761
 762
 763 def _git_date(date):
 764     return '%d %s' % (date, utc_offset_str(date))
 765
 766
 767 def _gitenv(repo_dir = None):
 768     if not repo_dir:
 769         repo_dir = repo()
 770     def env():
 771         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 772     return env
 773
 774
 775 def list_refs(refname = None, repo_dir = None):
 776     """Generate a list of tuples in the form (refname,hash).
 777     If a ref name is specified, list only this particular ref.
 778     """
 779     argv = ['git', 'show-ref', '--']
 780     if refname:
 781         argv += [refname]
 782     p = subprocess.Popen(argv,
 783                          preexec_fn = _gitenv(repo_dir),
 784                          stdout = subprocess.PIPE)
 785     out = p.stdout.read().strip()
 786     rv = p.wait()  # not fatal
 787     if rv:
 788         assert(not out)
 789     if out:
 790         for d in out.split('\n'):
 791             (sha, name) = d.split(' ', 1)
 792             yield (name, sha.decode('hex'))
 793
 794
 795 def read_ref(refname, repo_dir = None):
 796     """Get the commit id of the most recent commit made on a given ref."""
 797     l = list(list_refs(refname, repo_dir))
 798     if l:
 799         assert(len(l) == 1)
 800         return l[0][1]
 801     else:
 802         return None
 803
 804
 805 def rev_list(ref, count=None, repo_dir=None):
 806     """Generate a list of reachable commits in reverse chronological order.
 807
 808     This generator walks through commits, from child to parent, that are
 809     reachable via the specified ref and yields a series of tuples of the form
 810     (date,hash).
 811
 812     If count is a non-zero integer, limit the number of commits to "count"
 813     objects.
 814     """
 815     assert(not ref.startswith('-'))
 816     opts = []
 817     if count:
 818         opts += ['-n', str(atoi(count))]
 819     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 820     p = subprocess.Popen(argv,
 821                          preexec_fn = _gitenv(repo_dir),
 822                          stdout = subprocess.PIPE)
 823     commit = None
 824     for row in p.stdout:
 825         s = row.strip()
 826         if s.startswith('commit '):
 827             commit = s[7:].decode('hex')
 828         else:
 829             date = int(s)
 830             yield (date, commit)
 831     rv = p.wait()  # not fatal
 832     if rv:
 833         raise GitError, 'git rev-list returned error %d' % rv
 834
 835
 836 def get_commit_dates(refs, repo_dir=None):
 837     """Get the dates for the specified commit refs.  For now, every unique
 838        string in refs must resolve to a different commit or this
 839        function will fail."""
 840     result = []
 841     for ref in refs:
 842         commit = get_commit_items(ref, cp(repo_dir))
 843         result.append(commit.author_sec)
 844     return result
 845
 846
 847 def rev_parse(committish, repo_dir=None):
 848     """Resolve the full hash for 'committish', if it exists.
 849
 850     Should be roughly equivalent to 'git rev-parse'.
 851
 852     Returns the hex value of the hash if it is found, None if 'committish' does
 853     not correspond to anything.
 854     """
 855     head = read_ref(committish, repo_dir=repo_dir)
 856     if head:
 857         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 858         return head
 859
 860     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
 861
 862     if len(committish) == 40:
 863         try:
 864             hash = committish.decode('hex')
 865         except TypeError:
 866             return None
 867
 868         if pL.exists(hash):
 869             return hash
 870
 871     return None
 872
 873
 874 def update_ref(refname, newval, oldval, repo_dir=None):
 875     """Update a repository reference."""
 876     if not oldval:
 877         oldval = ''
 878     assert(refname.startswith('refs/heads/') \
 879            or refname.startswith('refs/tags/'))
 880     p = subprocess.Popen(['git', 'update-ref', refname,
 881                           newval.encode('hex'), oldval.encode('hex')],
 882                          preexec_fn = _gitenv(repo_dir))
 883     _git_wait('git update-ref', p)
 884
 885
 886 def delete_ref(refname):
 887     """Delete a repository reference."""
 888     assert(refname.startswith('refs/'))
 889     p = subprocess.Popen(['git', 'update-ref', '-d', refname],
 890                          preexec_fn = _gitenv())
 891     _git_wait('git update-ref', p)
 892
 893
 894 def guess_repo(path=None):
 895     """Set the path value in the global variable "repodir".
 896     This makes bup look for an existing bup repository, but not fail if a
 897     repository doesn't exist. Usually, if you are interacting with a bup
 898     repository, you would not be calling this function but using
 899     check_repo_or_die().
 900     """
 901     global repodir
 902     if path:
 903         repodir = path
 904     if not repodir:
 905         repodir = os.environ.get('BUP_DIR')
 906         if not repodir:
 907             repodir = os.path.expanduser('~/.bup')
 908
 909
 910 def init_repo(path=None):
 911     """Create the Git bare repository for bup in a given path."""
 912     guess_repo(path)
 913     d = repo()  # appends a / to the path
 914     parent = os.path.dirname(os.path.dirname(d))
 915     if parent and not os.path.exists(parent):
 916         raise GitError('parent directory "%s" does not exist\n' % parent)
 917     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 918         raise GitError('"%s" exists but is not a directory\n' % d)
 919     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 920                          preexec_fn = _gitenv())
 921     _git_wait('git init', p)
 922     # Force the index version configuration in order to ensure bup works
 923     # regardless of the version of the installed Git binary.
 924     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 925                          stdout=sys.stderr, preexec_fn = _gitenv())
 926     _git_wait('git config', p)
 927     # Enable the reflog
 928     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 929                          stdout=sys.stderr, preexec_fn = _gitenv())
 930     _git_wait('git config', p)
 931
 932
 933 def check_repo_or_die(path=None):
 934     """Make sure a bup repository exists, and abort if not.
 935     If the path to a particular repository was not specified, this function
 936     initializes the default repository automatically.
 937     """
 938     guess_repo(path)
 939     try:
 940         os.stat(repo('objects/pack/.'))
 941     except OSError, e:
 942         if e.errno == errno.ENOENT:
 943             log('error: %r is not a bup repository; run "bup init"\n'
 944                 % repo())
 945             sys.exit(15)
 946         else:
 947             log('error: %s\n' % e)
 948             sys.exit(14)
 949
 950
 951 _ver = None
 952 def ver():
 953     """Get Git's version and ensure a usable version is installed.
 954
 955     The returned version is formatted as an ordered tuple with each position
 956     representing a digit in the version tag. For example, the following tuple
 957     would represent version 1.6.6.9:
 958
 959         ('1', '6', '6', '9')
 960     """
 961     global _ver
 962     if not _ver:
 963         p = subprocess.Popen(['git', '--version'],
 964                              stdout=subprocess.PIPE)
 965         gvs = p.stdout.read()
 966         _git_wait('git --version', p)
 967         m = re.match(r'git version (\S+.\S+)', gvs)
 968         if not m:
 969             raise GitError('git --version weird output: %r' % gvs)
 970         _ver = tuple(m.group(1).split('.'))
 971     needed = ('1','5', '3', '1')
 972     if _ver < needed:
 973         raise GitError('git version %s or higher is required; you have %s'
 974                        % ('.'.join(needed), '.'.join(_ver)))
 975     return _ver
 976
 977
 978 def _git_wait(cmd, p):
 979     rv = p.wait()
 980     if rv != 0:
 981         raise GitError('%s returned %d' % (cmd, rv))
 982
 983
 984 def _git_capture(argv):
 985     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
 986     r = p.stdout.read()
 987     _git_wait(repr(argv), p)
 988     return r
 989
 990
 991 class _AbortableIter:
 992     def __init__(self, it, onabort = None):
 993         self.it = it
 994         self.onabort = onabort
 995         self.done = None
 996
 997     def __iter__(self):
 998         return self
 999
1000     def next(self):
1001         try:
1002             return self.it.next()
1003         except StopIteration, e:
1004             self.done = True
1005             raise
1006         except:
1007             self.abort()
1008             raise
1009
1010     def abort(self):
1011         """Abort iteration and call the abortion callback, if needed."""
1012         if not self.done:
1013             self.done = True
1014             if self.onabort:
1015                 self.onabort()
1016
1017     def __del__(self):
1018         self.abort()
1019
1020
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    Depending on the installed git version, self.get is bound either to
    _fast_get (one long-lived 'git cat-file --batch' process) or to
    _slow_get (two short-lived 'git cat-file' processes per object).  Both
    are generators that first yield the object type and then the object's
    content in chunks.
    """
    def __init__(self, repo_dir = None):
        global _ver_warned
        self.repo_dir = repo_dir
        # Always defined, so _abort() is safe whichever getter is selected.
        self.p = self.inprogress = None
        wanted = ('1','5','6')
        if ver() < wanted:
            if not _ver_warned:
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            self.get = self._fast_get

    def _abort(self):
        """Close the pipes to the batch process, if any, and forget it."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        """Drop any existing batch process and start a fresh one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        """Yield the type, then the content chunks, of object 'id'.

        The object is requested from the shared 'git cat-file --batch'
        process, so only one object may be in flight at a time.  Raises
        KeyError if git reports the object as missing and GitError if the
        reply header is not of the form "<40 chars> <type> <size>".
        """
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        # The id is written as one line to git's stdin; keep it from
        # smuggling in extra lines or an option.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            # The stream can no longer be trusted; drop the process so the
            # next request starts a new one instead of tripping the
            # "in progress" assertion forever.
            self._abort()
            raise GitError('expected blob, got %r' % spl)
        objtype = spl[1]

        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                           onabort = self._abort)
        try:
            yield objtype
            for blob in it:
                yield blob
            # The content is expected to be followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type, then the content chunks, of object 'id'.

        Runs 'git cat-file -t' and then 'git cat-file <type>' as separate
        processes, both against self.repo_dir.  Raises GitError if either
        command fails.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # Query the type in the same repository the content is read from;
        # _git_capture() would not pass self.repo_dir to _gitenv().
        argv = ['git', 'cat-file', '-t', id]
        p = subprocess.Popen(argv,
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        objtype = p.stdout.read().strip()
        _git_wait(repr(argv), p)
        yield objtype

        p = subprocess.Popen(['git', 'cat-file', objtype, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield blob content for the object whose get() iterator is 'it'.

        Blobs are passed through; trees and commits are followed
        recursively through join().  Raises GitError for any other type.
        """
        objtype = next(it)
        if objtype == 'blob':
            for blob in it:
                yield blob
        elif objtype == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif objtype == 'commit':
            # Only the first line ("tree <id>") of the commit is needed.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % objtype)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1136
1137
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for 'repo_dir', creating it on first use.

    Instances are cached per absolute repository path; when 'repo_dir' is
    not given the path returned by repo() is used.
    """
    global _cp
    key = os.path.abspath(repo_dir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1151
1152
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Only refs under refs/tags/ are considered, and the prefix is stripped
    from the names.  Hashes are in the form that list_refs() yields.
    """
    prefix = 'refs/tags/'
    by_hash = {}
    for (refname, sha) in list_refs(repo_dir = repo_dir):
        if not refname.startswith(prefix):
            continue
        # Several tags may point at the same hash, hence the list.
        by_hash.setdefault(sha, []).append(refname[len(prefix):])
    return by_hash