lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from collections import namedtuple
   7
   8 from bup.helpers import *
   9 from bup import _helpers, path, midx, bloom, xstat
  10
# Thresholds checked by PackWriter._write(): once either is reached the
# current pack is finished and a new one is started.
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object

# When nonzero, PackWriter._write() logs a '>' for every object written.
verbose = 0
# When nonzero, PackIdxList.refresh() always behaves as if skip_midx=True.
ignore_midx = 0
# Path of the repository in use; repo() raises GitError while this is unset.
repodir = None

# Object type name <-> numeric type code stored in a pack object header.
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Lookup statistics updated by PackIdx._idx_from_hash() and
# PackIdxList.exists().
_total_searches = 0
_total_steps = 0
  23
  24
  25 class GitError(Exception):
  26     pass
  27
  28
  29 def parse_tz_offset(s):
  30     """UTC offset in seconds."""
  31     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  32     if s[0] == '-':
  33         return - tz_off
  34     return tz_off
  35
  36
# Building blocks for the regular expression that parses a commit object.
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
# A character allowed at the start or end of a name/mail string.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
# A character allowed in the interior of a name/mail string.
_content_char = r'[^\0\n<>]'
# Either one or two start/end characters, or start + interior* + end.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Sign, two hour digits, two minute digits (minutes 00-59).
_tz_rx = r'[-+]\d\d[0-5]\d'
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Whole commit: tree line, zero or more parent lines, author line,
# committer line, a blank line, then the message (everything that remains).
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Extracts each 40-digit parent hash from the text matched by 'parents'.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')


# Result type of parse_commit().
# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
# The *_offset fields are UTC offsets in seconds (see parse_tz_offset()).
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
  63
  64 def parse_commit(content):
  65     commit_match = re.match(_commit_rx, content)
  66     if not commit_match:
  67         raise Exception('cannot parse commit %r' % content)
  68     matches = commit_match.groupdict()
  69     return CommitInfo(tree=matches['tree'],
  70                       parents=re.findall(_parent_hash_rx, matches['parents']),
  71                       author_name=matches['author_name'],
  72                       author_mail=matches['author_mail'],
  73                       author_sec=int(matches['asec']),
  74                       author_offset=parse_tz_offset(matches['atz']),
  75                       committer_name=matches['committer_name'],
  76                       committer_mail=matches['committer_mail'],
  77                       committer_sec=int(matches['csec']),
  78                       committer_offset=parse_tz_offset(matches['ctz']),
  79                       message=matches['message'])
  80
  81
  82 def get_commit_items(id, cp):
  83     commit_it = cp.get(id)
  84     assert(commit_it.next() == 'commit')
  85     commit_content = ''.join(commit_it)
  86     return parse_commit(commit_content)
  87
  88
  89 def repo(sub = ''):
  90     """Get the path to the git repository or one of its subdirectories."""
  91     global repodir
  92     if not repodir:
  93         raise GitError('You should call check_repo_or_die()')
  94
  95     # If there's a .git subdirectory, then the actual repo is in there.
  96     gd = os.path.join(repodir, '.git')
  97     if os.path.exists(gd):
  98         repodir = gd
  99
 100     return os.path.join(repodir, sub)
 101
 102
 103 def shorten_hash(s):
 104     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 105                   r'\1\2*\3', s)
 106
 107
 108 def repo_rel(path):
 109     full = os.path.abspath(path)
 110     fullrepo = os.path.abspath(repo(''))
 111     if not fullrepo.endswith('/'):
 112         fullrepo += '/'
 113     if full.startswith(fullrepo):
 114         path = full[len(fullrepo):]
 115     if path.startswith('index-cache/'):
 116         path = path[len('index-cache/'):]
 117     return shorten_hash(path)
 118
 119
 120 def all_packdirs():
 121     paths = [repo('objects/pack')]
 122     paths += glob.glob(repo('index-cache/*/.'))
 123     return paths
 124
 125
 126 def auto_midx(objdir):
 127     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 128     try:
 129         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 130     except OSError, e:
 131         # make sure 'args' gets printed to help with debugging
 132         add_error('%r: exception: %s' % (args, e))
 133         raise
 134     if rv:
 135         add_error('%r: returned %d' % (args, rv))
 136
 137     args = [path.exe(), 'bloom', '--dir', objdir]
 138     try:
 139         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 140     except OSError, e:
 141         # make sure 'args' gets printed to help with debugging
 142         add_error('%r: exception: %s' % (args, e))
 143         raise
 144     if rv:
 145         add_error('%r: returned %d' % (args, rv))
 146
 147
 148 def mangle_name(name, mode, gitmode):
 149     """Mangle a file name to present an abstract name for segmented files.
 150     Mangled file names will have the ".bup" extension added to them. If a
 151     file's name already ends with ".bup", a ".bupl" extension is added to
 152     disambiguate normal files from segmented ones.
 153     """
 154     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 155         assert(stat.S_ISDIR(gitmode))
 156         return name + '.bup'
 157     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 158         return name + '.bupl'
 159     else:
 160         return name
 161
 162
 163 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 164 def demangle_name(name):
 165     """Remove name mangling from a file name, if necessary.
 166
 167     The return value is a tuple (demangled_filename,mode), where mode is one of
 168     the following:
 169
 170     * BUP_NORMAL  : files that should be read as-is from the repository
 171     * BUP_CHUNKED : files that were chunked and need to be reassembled
 172
 173     For more information on the name mangling algorithm, see mangle_name()
 174     """
 175     if name.endswith('.bupl'):
 176         return (name[:-5], BUP_NORMAL)
 177     elif name.endswith('.bup'):
 178         return (name[:-4], BUP_CHUNKED)
 179     else:
 180         return (name, BUP_NORMAL)
 181
 182
 183 def calc_hash(type, content):
 184     """Calculate some content's hash in the Git fashion."""
 185     header = '%s %d\0' % (type, len(content))
 186     sum = Sha1(header)
 187     sum.update(content)
 188     return sum.digest()
 189
 190
 191 def shalist_item_sort_key(ent):
 192     (mode, name, id) = ent
 193     assert(mode+0 == mode)
 194     if stat.S_ISDIR(mode):
 195         return name + '/'
 196     else:
 197         return name
 198
 199
 200 def tree_encode(shalist):
 201     """Generate a git tree object from (mode,name,hash) tuples."""
 202     shalist = sorted(shalist, key = shalist_item_sort_key)
 203     l = []
 204     for (mode,name,bin) in shalist:
 205         assert(mode)
 206         assert(mode+0 == mode)
 207         assert(name)
 208         assert(len(bin) == 20)
 209         s = '%o %s\0%s' % (mode,name,bin)
 210         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 211         l.append(s)
 212     return ''.join(l)
 213
 214
 215 def tree_decode(buf):
 216     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 217     ofs = 0
 218     while ofs < len(buf):
 219         z = buf.find('\0', ofs)
 220         assert(z > ofs)
 221         spl = buf[ofs:z].split(' ', 1)
 222         assert(len(spl) == 2)
 223         mode,name = spl
 224         sha = buf[z+1:z+1+20]
 225         ofs = z+1+20
 226         yield (int(mode, 8), name, sha)
 227
 228
 229 def _encode_packobj(type, content, compression_level=1):
 230     szout = ''
 231     sz = len(content)
 232     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 233     sz >>= 4
 234     while 1:
 235         if sz: szbits |= 0x80
 236         szout += chr(szbits)
 237         if not sz:
 238             break
 239         szbits = sz & 0x7f
 240         sz >>= 7
 241     if compression_level > 9:
 242         compression_level = 9
 243     elif compression_level < 0:
 244         compression_level = 0
 245     z = zlib.compressobj(compression_level)
 246     yield szout
 247     yield z.compress(content)
 248     yield z.flush()
 249
 250
 251 def _encode_looseobj(type, content, compression_level=1):
 252     z = zlib.compressobj(compression_level)
 253     yield z.compress('%s %d\0' % (type, len(content)))
 254     yield z.compress(content)
 255     yield z.flush()
 256
 257
 258 def _decode_looseobj(buf):
 259     assert(buf);
 260     s = zlib.decompress(buf)
 261     i = s.find('\0')
 262     assert(i > 0)
 263     l = s[:i].split(' ')
 264     type = l[0]
 265     sz = int(l[1])
 266     content = s[i+1:]
 267     assert(type in _typemap)
 268     assert(sz == len(content))
 269     return (type, content)
 270
 271
 272 def _decode_packobj(buf):
 273     assert(buf)
 274     c = ord(buf[0])
 275     type = _typermap[(c & 0x70) >> 4]
 276     sz = c & 0x0f
 277     shift = 4
 278     i = 0
 279     while c & 0x80:
 280         i += 1
 281         c = ord(buf[i])
 282         sz |= (c & 0x7f) << shift
 283         shift += 7
 284         if not (c & 0x80):
 285             break
 286     return (type, zlib.decompress(buf[i+1:]))
 287
 288
 289 class PackIdx:
 290     def __init__(self):
 291         assert(0)
 292
 293     def find_offset(self, hash):
 294         """Get the offset of an object inside the index file."""
 295         idx = self._idx_from_hash(hash)
 296         if idx != None:
 297             return self._ofs_from_idx(idx)
 298         return None
 299
 300     def exists(self, hash, want_source=False):
 301         """Return nonempty if the object exists in this index."""
 302         if hash and (self._idx_from_hash(hash) != None):
 303             return want_source and os.path.basename(self.name) or True
 304         return None
 305
 306     def __len__(self):
 307         return int(self.fanout[255])
 308
 309     def _idx_from_hash(self, hash):
 310         global _total_searches, _total_steps
 311         _total_searches += 1
 312         assert(len(hash) == 20)
 313         b1 = ord(hash[0])
 314         start = self.fanout[b1-1] # range -1..254
 315         end = self.fanout[b1] # range 0..255
 316         want = str(hash)
 317         _total_steps += 1  # lookup table is a step
 318         while start < end:
 319             _total_steps += 1
 320             mid = start + (end-start)/2
 321             v = self._idx_to_hash(mid)
 322             if v < want:
 323                 start = mid+1
 324             elif v > want:
 325                 end = mid
 326             else: # got it!
 327                 return mid
 328         return None
 329
 330
 331 class PackIdxV1(PackIdx):
 332     """Object representation of a Git pack index (version 1) file."""
 333     def __init__(self, filename, f):
 334         self.name = filename
 335         self.idxnames = [self.name]
 336         self.map = mmap_read(f)
 337         self.fanout = list(struct.unpack('!256I',
 338                                          str(buffer(self.map, 0, 256*4))))
 339         self.fanout.append(0)  # entry "-1"
 340         nsha = self.fanout[255]
 341         self.sha_ofs = 256*4
 342         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 343
 344     def _ofs_from_idx(self, idx):
 345         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 346
 347     def _idx_to_hash(self, idx):
 348         return str(self.shatable[idx*24+4 : idx*24+24])
 349
 350     def __iter__(self):
 351         for i in xrange(self.fanout[255]):
 352             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 353
 354
 355 class PackIdxV2(PackIdx):
 356     """Object representation of a Git pack index (version 2) file."""
 357     def __init__(self, filename, f):
 358         self.name = filename
 359         self.idxnames = [self.name]
 360         self.map = mmap_read(f)
 361         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 362         self.fanout = list(struct.unpack('!256I',
 363                                          str(buffer(self.map, 8, 256*4))))
 364         self.fanout.append(0)  # entry "-1"
 365         nsha = self.fanout[255]
 366         self.sha_ofs = 8 + 256*4
 367         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 368         self.ofstable = buffer(self.map,
 369                                self.sha_ofs + nsha*20 + nsha*4,
 370                                nsha*4)
 371         self.ofs64table = buffer(self.map,
 372                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 373
 374     def _ofs_from_idx(self, idx):
 375         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 376         if ofs & 0x80000000:
 377             idx64 = ofs & 0x7fffffff
 378             ofs = struct.unpack('!Q',
 379                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 380         return ofs
 381
 382     def _idx_to_hash(self, idx):
 383         return str(self.shatable[idx*20:(idx+1)*20])
 384
 385     def __iter__(self):
 386         for i in xrange(self.fanout[255]):
 387             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 388
 389
 390 _mpi_count = 0
 391 class PackIdxList:
 392     def __init__(self, dir):
 393         global _mpi_count
 394         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 395         _mpi_count += 1
 396         self.dir = dir
 397         self.also = set()
 398         self.packs = []
 399         self.do_bloom = False
 400         self.bloom = None
 401         self.refresh()
 402
 403     def __del__(self):
 404         global _mpi_count
 405         _mpi_count -= 1
 406         assert(_mpi_count == 0)
 407
 408     def __iter__(self):
 409         return iter(idxmerge(self.packs))
 410
 411     def __len__(self):
 412         return sum(len(pack) for pack in self.packs)
 413
 414     def exists(self, hash, want_source=False):
 415         """Return nonempty if the object exists in the index files."""
 416         global _total_searches
 417         _total_searches += 1
 418         if hash in self.also:
 419             return True
 420         if self.do_bloom and self.bloom:
 421             if self.bloom.exists(hash):
 422                 self.do_bloom = False
 423             else:
 424                 _total_searches -= 1  # was counted by bloom
 425                 return None
 426         for i in xrange(len(self.packs)):
 427             p = self.packs[i]
 428             _total_searches -= 1  # will be incremented by sub-pack
 429             ix = p.exists(hash, want_source=want_source)
 430             if ix:
 431                 # reorder so most recently used packs are searched first
 432                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 433                 return ix
 434         self.do_bloom = True
 435         return None
 436
 437     def refresh(self, skip_midx = False):
 438         """Refresh the index list.
 439         This method verifies if .midx files were superseded (e.g. all of its
 440         contents are in another, bigger .midx file) and removes the superseded
 441         files.
 442
 443         If skip_midx is True, all work on .midx files will be skipped and .midx
 444         files will be removed from the list.
 445
 446         The module-global variable 'ignore_midx' can force this function to
 447         always act as if skip_midx was True.
 448         """
 449         self.bloom = None # Always reopen the bloom as it may have been relaced
 450         self.do_bloom = False
 451         skip_midx = skip_midx or ignore_midx
 452         d = dict((p.name, p) for p in self.packs
 453                  if not skip_midx or not isinstance(p, midx.PackMidx))
 454         if os.path.exists(self.dir):
 455             if not skip_midx:
 456                 midxl = []
 457                 for ix in self.packs:
 458                     if isinstance(ix, midx.PackMidx):
 459                         for name in ix.idxnames:
 460                             d[os.path.join(self.dir, name)] = ix
 461                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 462                     if not d.get(full):
 463                         mx = midx.PackMidx(full)
 464                         (mxd, mxf) = os.path.split(mx.name)
 465                         broken = False
 466                         for n in mx.idxnames:
 467                             if not os.path.exists(os.path.join(mxd, n)):
 468                                 log(('warning: index %s missing\n' +
 469                                     '  used by %s\n') % (n, mxf))
 470                                 broken = True
 471                         if broken:
 472                             mx.close()
 473                             del mx
 474                             unlink(full)
 475                         else:
 476                             midxl.append(mx)
 477                 midxl.sort(key=lambda ix:
 478                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 479                 for ix in midxl:
 480                     any_needed = False
 481                     for sub in ix.idxnames:
 482                         found = d.get(os.path.join(self.dir, sub))
 483                         if not found or isinstance(found, PackIdx):
 484                             # doesn't exist, or exists but not in a midx
 485                             any_needed = True
 486                             break
 487                     if any_needed:
 488                         d[ix.name] = ix
 489                         for name in ix.idxnames:
 490                             d[os.path.join(self.dir, name)] = ix
 491                     elif not ix.force_keep:
 492                         debug1('midx: removing redundant: %s\n'
 493                                % os.path.basename(ix.name))
 494                         ix.close()
 495                         unlink(ix.name)
 496             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 497                 if not d.get(full):
 498                     try:
 499                         ix = open_idx(full)
 500                     except GitError, e:
 501                         add_error(e)
 502                         continue
 503                     d[full] = ix
 504             bfull = os.path.join(self.dir, 'bup.bloom')
 505             if self.bloom is None and os.path.exists(bfull):
 506                 self.bloom = bloom.ShaBloom(bfull)
 507             self.packs = list(set(d.values()))
 508             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 509             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 510                 self.do_bloom = True
 511             else:
 512                 self.bloom = None
 513         debug1('PackIdxList: using %d index%s.\n'
 514             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 515
 516     def add(self, hash):
 517         """Insert an additional object in the list."""
 518         self.also.add(hash)
 519
 520
 521 def open_idx(filename):
 522     if filename.endswith('.idx'):
 523         f = open(filename, 'rb')
 524         header = f.read(8)
 525         if header[0:4] == '\377tOc':
 526             version = struct.unpack('!I', header[4:8])[0]
 527             if version == 2:
 528                 return PackIdxV2(filename, f)
 529             else:
 530                 raise GitError('%s: expected idx file version 2, got %d'
 531                                % (filename, version))
 532         elif len(header) == 8 and header[0:4] < '\377tOc':
 533             return PackIdxV1(filename, f)
 534         else:
 535             raise GitError('%s: unrecognized idx file header' % filename)
 536     elif filename.endswith('.midx'):
 537         return midx.PackMidx(filename)
 538     else:
 539         raise GitError('idx filenames must end with .idx or .midx')
 540
 541
 542 def idxmerge(idxlist, final_progress=True):
 543     """Generate a list of all the objects reachable in a PackIdxList."""
 544     def pfunc(count, total):
 545         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 546                   % (count*100.0/total, count, total))
 547     def pfinal(count, total):
 548         if final_progress:
 549             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 550                      % (100, total, total))
 551     return merge_iter(idxlist, 10024, pfunc, pfinal)
 552
 553
 554 def _make_objcache():
 555     return PackIdxList(repo('objects/pack'))
 556
 557 class PackWriter:
 558     """Writes Git objects inside a pack file."""
 559     def __init__(self, objcache_maker=_make_objcache, compression_level=1):
 560         self.count = 0
 561         self.outbytes = 0
 562         self.filename = None
 563         self.file = None
 564         self.idx = None
 565         self.objcache_maker = objcache_maker
 566         self.objcache = None
 567         self.compression_level = compression_level
 568
 569     def __del__(self):
 570         self.close()
 571
 572     def _open(self):
 573         if not self.file:
 574             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 575             self.file = os.fdopen(fd, 'w+b')
 576             assert(name.endswith('.pack'))
 577             self.filename = name[:-5]
 578             self.file.write('PACK\0\0\0\2\0\0\0\0')
 579             self.idx = list(list() for i in xrange(256))
 580
 581     def _raw_write(self, datalist, sha):
 582         self._open()
 583         f = self.file
 584         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 585         # the file never has a *partial* blob.  So let's make sure it's
 586         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 587         # to our hashsplit algorithm.)  f.write() does its own buffering,
 588         # but that's okay because we'll flush it in _end().
 589         oneblob = ''.join(datalist)
 590         try:
 591             f.write(oneblob)
 592         except IOError, e:
 593             raise GitError, e, sys.exc_info()[2]
 594         nw = len(oneblob)
 595         crc = zlib.crc32(oneblob) & 0xffffffff
 596         self._update_idx(sha, crc, nw)
 597         self.outbytes += nw
 598         self.count += 1
 599         return nw, crc
 600
 601     def _update_idx(self, sha, crc, size):
 602         assert(sha)
 603         if self.idx:
 604             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 605
 606     def _write(self, sha, type, content):
 607         if verbose:
 608             log('>')
 609         if not sha:
 610             sha = calc_hash(type, content)
 611         size, crc = self._raw_write(_encode_packobj(type, content,
 612                                                     self.compression_level),
 613                                     sha=sha)
 614         if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
 615             self.breakpoint()
 616         return sha
 617
 618     def breakpoint(self):
 619         """Clear byte and object counts and return the last processed id."""
 620         id = self._end()
 621         self.outbytes = self.count = 0
 622         return id
 623
 624     def _require_objcache(self):
 625         if self.objcache is None and self.objcache_maker:
 626             self.objcache = self.objcache_maker()
 627         if self.objcache is None:
 628             raise GitError(
 629                     "PackWriter not opened or can't check exists w/o objcache")
 630
 631     def exists(self, id, want_source=False):
 632         """Return non-empty if an object is found in the object cache."""
 633         self._require_objcache()
 634         return self.objcache.exists(id, want_source=want_source)
 635
 636     def maybe_write(self, type, content):
 637         """Write an object to the pack file if not present and return its id."""
 638         sha = calc_hash(type, content)
 639         if not self.exists(sha):
 640             self._write(sha, type, content)
 641             self._require_objcache()
 642             self.objcache.add(sha)
 643         return sha
 644
 645     def new_blob(self, blob):
 646         """Create a blob object in the pack with the supplied content."""
 647         return self.maybe_write('blob', blob)
 648
 649     def new_tree(self, shalist):
 650         """Create a tree object in the pack."""
 651         content = tree_encode(shalist)
 652         return self.maybe_write('tree', content)
 653
 654     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 655         l = []
 656         if tree: l.append('tree %s' % tree.encode('hex'))
 657         if parent: l.append('parent %s' % parent.encode('hex'))
 658         if author: l.append('author %s %s' % (author, _git_date(adate)))
 659         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 660         l.append('')
 661         l.append(msg)
 662         return self.maybe_write('commit', '\n'.join(l))
 663
 664     def new_commit(self, parent, tree, date, msg):
 665         """Create a commit object in the pack."""
 666         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 667         commit = self._new_commit(tree, parent,
 668                                   userline, date, userline, date,
 669                                   msg)
 670         return commit
 671
 672     def abort(self):
 673         """Remove the pack file from disk."""
 674         f = self.file
 675         if f:
 676             self.idx = None
 677             self.file = None
 678             f.close()
 679             os.unlink(self.filename + '.pack')
 680
 681     def _end(self, run_midx=True):
 682         f = self.file
 683         if not f: return None
 684         self.file = None
 685         self.objcache = None
 686         idx = self.idx
 687         self.idx = None
 688
 689         # update object count
 690         f.seek(8)
 691         cp = struct.pack('!i', self.count)
 692         assert(len(cp) == 4)
 693         f.write(cp)
 694
 695         # calculate the pack sha1sum
 696         f.seek(0)
 697         sum = Sha1()
 698         for b in chunkyreader(f):
 699             sum.update(b)
 700         packbin = sum.digest()
 701         f.write(packbin)
 702         f.close()
 703
 704         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 705
 706         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 707         if os.path.exists(self.filename + '.map'):
 708             os.unlink(self.filename + '.map')
 709         os.rename(self.filename + '.pack', nameprefix + '.pack')
 710         os.rename(self.filename + '.idx', nameprefix + '.idx')
 711
 712         if run_midx:
 713             auto_midx(repo('objects/pack'))
 714         return nameprefix
 715
 716     def close(self, run_midx=True):
 717         """Close the pack file and move it to its definitive path."""
 718         return self._end(run_midx=run_midx)
 719
 720     def _write_pack_idx_v2(self, filename, idx, packbin):
 721         ofs64_count = 0
 722         for section in idx:
 723             for entry in section:
 724                 if entry[2] >= 2**31:
 725                     ofs64_count += 1
 726
 727         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 728         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 729         idx_map = None
 730         idx_f = open(filename, 'w+b')
 731         try:
 732             idx_f.truncate(index_len)
 733             idx_map = mmap_readwrite(idx_f, close=False)
 734             count = _helpers.write_idx(filename, idx_map, idx, self.count)
 735             assert(count == self.count)
 736         finally:
 737             if idx_map: idx_map.close()
 738             idx_f.close()
 739
 740         idx_f = open(filename, 'a+b')
 741         try:
 742             idx_f.write(packbin)
 743             idx_f.seek(0)
 744             idx_sum = Sha1()
 745             b = idx_f.read(8 + 4*256)
 746             idx_sum.update(b)
 747
 748             obj_list_sum = Sha1()
 749             for b in chunkyreader(idx_f, 20*self.count):
 750                 idx_sum.update(b)
 751                 obj_list_sum.update(b)
 752             namebase = obj_list_sum.hexdigest()
 753
 754             for b in chunkyreader(idx_f):
 755                 idx_sum.update(b)
 756             idx_f.write(idx_sum.digest())
 757             return namebase
 758         finally:
 759             idx_f.close()
 760
 761
 762 def _git_date(date):
 763     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 764
 765
 766 def _gitenv(repo_dir = None):
 767     if not repo_dir:
 768         repo_dir = repo()
 769     def env():
 770         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 771     return env
 772
 773
 774 def list_refs(refname = None):
 775     """Generate a list of tuples in the form (refname,hash).
 776     If a ref name is specified, list only this particular ref.
 777     """
 778     argv = ['git', 'show-ref', '--']
 779     if refname:
 780         argv += [refname]
 781     p = subprocess.Popen(argv, preexec_fn = _gitenv(), stdout = subprocess.PIPE)
 782     out = p.stdout.read().strip()
 783     rv = p.wait()  # not fatal
 784     if rv:
 785         assert(not out)
 786     if out:
 787         for d in out.split('\n'):
 788             (sha, name) = d.split(' ', 1)
 789             yield (name, sha.decode('hex'))
 790
 791
 792 def read_ref(refname):
 793     """Get the commit id of the most recent commit made on a given ref."""
 794     l = list(list_refs(refname))
 795     if l:
 796         assert(len(l) == 1)
 797         return l[0][1]
 798     else:
 799         return None
 800
 801
 802 def rev_list(ref, count=None):
 803     """Generate a list of reachable commits in reverse chronological order.
 804
 805     This generator walks through commits, from child to parent, that are
 806     reachable via the specified ref and yields a series of tuples of the form
 807     (date,hash).
 808
 809     If count is a non-zero integer, limit the number of commits to "count"
 810     objects.
 811     """
 812     assert(not ref.startswith('-'))
 813     opts = []
 814     if count:
 815         opts += ['-n', str(atoi(count))]
 816     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 817     p = subprocess.Popen(argv, preexec_fn = _gitenv(), stdout = subprocess.PIPE)
 818     commit = None
 819     for row in p.stdout:
 820         s = row.strip()
 821         if s.startswith('commit '):
 822             commit = s[7:].decode('hex')
 823         else:
 824             date = int(s)
 825             yield (date, commit)
 826     rv = p.wait()  # not fatal
 827     if rv:
 828         raise GitError, 'git rev-list returned error %d' % rv
 829
 830
 831 def get_commit_dates(refs):
 832     """Get the dates for the specified commit refs.  For now, every unique
 833        string in refs must resolve to a different commit or this
 834        function will fail."""
 835     result = []
 836     for ref in refs:
 837         commit = get_commit_items(ref, cp())
 838         result.append(commit.author_sec)
 839     return result
 840
 841
 842 def rev_parse(committish):
 843     """Resolve the full hash for 'committish', if it exists.
 844
 845     Should be roughly equivalent to 'git rev-parse'.
 846
 847     Returns the hex value of the hash if it is found, None if 'committish' does
 848     not correspond to anything.
 849     """
 850     head = read_ref(committish)
 851     if head:
 852         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 853         return head
 854
 855     pL = PackIdxList(repo('objects/pack'))
 856
 857     if len(committish) == 40:
 858         try:
 859             hash = committish.decode('hex')
 860         except TypeError:
 861             return None
 862
 863         if pL.exists(hash):
 864             return hash
 865
 866     return None
 867
 868
 869 def update_ref(refname, newval, oldval):
 870     """Change the commit pointed to by a branch."""
 871     if not oldval:
 872         oldval = ''
 873     assert(refname.startswith('refs/heads/'))
 874     p = subprocess.Popen(['git', 'update-ref', refname,
 875                           newval.encode('hex'), oldval.encode('hex')],
 876                          preexec_fn = _gitenv())
 877     _git_wait('git update-ref', p)
 878
 879
 880 def guess_repo(path=None):
 881     """Set the path value in the global variable "repodir".
 882     This makes bup look for an existing bup repository, but not fail if a
 883     repository doesn't exist. Usually, if you are interacting with a bup
 884     repository, you would not be calling this function but using
 885     check_repo_or_die().
 886     """
 887     global repodir
 888     if path:
 889         repodir = path
 890     if not repodir:
 891         repodir = os.environ.get('BUP_DIR')
 892         if not repodir:
 893             repodir = os.path.expanduser('~/.bup')
 894
 895
 896 def init_repo(path=None):
 897     """Create the Git bare repository for bup in a given path."""
 898     guess_repo(path)
 899     d = repo()  # appends a / to the path
 900     parent = os.path.dirname(os.path.dirname(d))
 901     if parent and not os.path.exists(parent):
 902         raise GitError('parent directory "%s" does not exist\n' % parent)
 903     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 904         raise GitError('"%s" exists but is not a directory\n' % d)
 905     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 906                          preexec_fn = _gitenv())
 907     _git_wait('git init', p)
 908     # Force the index version configuration in order to ensure bup works
 909     # regardless of the version of the installed Git binary.
 910     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 911                          stdout=sys.stderr, preexec_fn = _gitenv())
 912     _git_wait('git config', p)
 913     # Enable the reflog
 914     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 915                          stdout=sys.stderr, preexec_fn = _gitenv())
 916     _git_wait('git config', p)
 917
 918
 919 def check_repo_or_die(path=None):
 920     """Make sure a bup repository exists, and abort if not.
 921     If the path to a particular repository was not specified, this function
 922     initializes the default repository automatically.
 923     """
 924     guess_repo(path)
 925     try:
 926         os.stat(repo('objects/pack/.'))
 927     except OSError, e:
 928         if e.errno == errno.ENOENT:
 929             log('error: %r is not a bup repository; run "bup init"\n'
 930                 % repo())
 931             sys.exit(15)
 932         else:
 933             log('error: %s\n' % e)
 934             sys.exit(14)
 935
 936
 937 _ver = None
 938 def ver():
 939     """Get Git's version and ensure a usable version is installed.
 940
 941     The returned version is formatted as an ordered tuple with each position
 942     representing a digit in the version tag. For example, the following tuple
 943     would represent version 1.6.6.9:
 944
 945         ('1', '6', '6', '9')
 946     """
 947     global _ver
 948     if not _ver:
 949         p = subprocess.Popen(['git', '--version'],
 950                              stdout=subprocess.PIPE)
 951         gvs = p.stdout.read()
 952         _git_wait('git --version', p)
 953         m = re.match(r'git version (\S+.\S+)', gvs)
 954         if not m:
 955             raise GitError('git --version weird output: %r' % gvs)
 956         _ver = tuple(m.group(1).split('.'))
 957     needed = ('1','5', '3', '1')
 958     if _ver < needed:
 959         raise GitError('git version %s or higher is required; you have %s'
 960                        % ('.'.join(needed), '.'.join(_ver)))
 961     return _ver
 962
 963
 964 def _git_wait(cmd, p):
 965     rv = p.wait()
 966     if rv != 0:
 967         raise GitError('%s returned %d' % (cmd, rv))
 968
 969
 970 def _git_capture(argv):
 971     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
 972     r = p.stdout.read()
 973     _git_wait(repr(argv), p)
 974     return r
 975
 976
 977 class _AbortableIter:
 978     def __init__(self, it, onabort = None):
 979         self.it = it
 980         self.onabort = onabort
 981         self.done = None
 982
 983     def __iter__(self):
 984         return self
 985
 986     def next(self):
 987         try:
 988             return self.it.next()
 989         except StopIteration, e:
 990             self.done = True
 991             raise
 992         except:
 993             self.abort()
 994             raise
 995
 996     def abort(self):
 997         """Abort iteration and call the abortion callback, if needed."""
 998         if not self.done:
 999             self.done = True
1000             if self.onabort:
1001                 self.onabort()
1002
1003     def __del__(self):
1004         self.abort()
1005
1006
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    Depending on the installed git version, self.get is bound to either
    _fast_get (one long-running 'git cat-file --batch' process) or
    _slow_get (new git processes for every object).  Both are generators
    that first yield the object type and then the object's data in chunks.
    """
    def __init__(self, repo_dir = None):
        global _ver_warned
        self.repo_dir = repo_dir
        wanted = ('1','5','6')
        # NOTE: this compares tuples of strings, which orders correctly
        # only while the components involved are single digits.
        if ver() < wanted:
            if not _ver_warned:
                # Warn only once per process.
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            self.p = self.inprogress = None
            self.get = self._fast_get

    def _abort(self):
        """Close the pipes to the batch process, if any, and forget it."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        """Drop any existing batch process and start a fresh one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        """Yield the type, then the data chunks, of object 'id'.

        Uses the 'git cat-file --batch' process, (re)starting it when
        needed.  Only one object may be read at a time.  Raises KeyError
        if git reports the object as missing and GitError if the header
        line is not understood.
        """
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        # The id is written as one line to git, so it must not contain
        # line breaks or look like an option.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        # The header is expected to be "<40 hex chars> <type> <size>\n".
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (_sha_hex, objtype, size) = spl

        # If the data is not read to the end, the pipe is left in an
        # unknown state, so the batch process is dropped via _abort.
        it = _AbortableIter(chunkyreader(self.p.stdout, int(size)),
                           onabort = self._abort)
        try:
            yield objtype
            for blob in it:
                yield blob
            # git terminates the object data with a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type, then the data chunks, of object 'id'.

        Runs one git process to find the type and another to read the
        data, both against self.repo_dir.  Raises GitError (via
        _git_wait) if either git command exits with a non-zero status.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # Query the type in the same repository the data is read from.
        type_argv = ['git', 'cat-file', '-t', id]
        tp = subprocess.Popen(type_argv,
                              stdout=subprocess.PIPE,
                              preexec_fn = _gitenv(self.repo_dir))
        objtype = tp.stdout.read().strip()
        _git_wait(repr(type_argv), tp)
        yield objtype

        p = subprocess.Popen(['git', 'cat-file', objtype, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield the blob data reachable from the object read via 'it'.

        'it' yields an object type followed by that object's data, as
        produced by self.get().  Trees and commits are followed
        recursively through self.join().  Raises GitError for any other
        object type.
        """
        objtype = next(it)
        if objtype == 'blob':
            for blob in it:
                yield blob
        elif objtype == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif objtype == 'commit':
            # Only the first line ("tree <hex id>") of the commit matters.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % objtype)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1122
1123
_cp = (None, None)

def cp():
    """Return a CatPipe for the current repository, reusing the cached one.

    The cache holds a single (directory, CatPipe) pair; a new CatPipe is
    created whenever the real path of repo() differs from the cached
    directory.
    """
    global _cp
    cached_dir, pipe = _cp
    current_dir = os.path.realpath(repo())
    if current_dir == cached_dir:
        return pipe
    pipe = CatPipe()
    _cp = (current_dir, pipe)
    return pipe
1135
1136
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Only refs under 'refs/tags/' are considered, and that prefix is
    stripped from the names.
    """
    prefix = 'refs/tags/'
    by_hash = {}
    for (refname, sha) in list_refs():
        if not refname.startswith(prefix):
            continue
        # More than one tag can point at the same hash.
        by_hash.setdefault(sha, []).append(refname[len(prefix):])
    return by_hash