lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from collections import namedtuple
   7
   8 from bup.helpers import *
   9 from bup import _helpers, path, midx, bloom, xstat
  10
  11 max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
  12 max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
  13
  14 verbose = 0
  15 ignore_midx = 0
  16 repodir = None
  17
  18 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  19 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  20
  21 _total_searches = 0
  22 _total_steps = 0
  23
  24
  25 class GitError(Exception):
  26     pass
  27
  28
  29 def parse_tz_offset(s):
  30     """UTC offset in seconds."""
  31     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  32     if s[0] == '-':
  33         return - tz_off
  34     return tz_off
  35
  36
  37 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
  38 # Make sure that's authoritative.
  39 _start_end_char = r'[^ .,:;<>"\'\0\n]'
  40 _content_char = r'[^\0\n<>]'
  41 _safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
  42     % (_start_end_char,
  43        _start_end_char, _content_char, _start_end_char)
  44 _tz_rx = r'[-+]\d\d[0-5]\d'
  45 _parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
  46 _commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
  47 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
  48 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)
  49
  50 (?P<message>(?:.|\n)*)''' % (_parent_rx,
  51                              _safe_str_rx, _safe_str_rx, _tz_rx,
  52                              _safe_str_rx, _safe_str_rx, _tz_rx))
  53 _parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  54
  55
  56 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
  57 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
  58                                        'author_name', 'author_mail',
  59                                        'author_sec', 'author_offset',
  60                                        'committer_name', 'committer_mail',
  61                                        'committer_sec', 'committer_offset',
  62                                        'message'])
  63
  64 def parse_commit(content):
  65     commit_match = re.match(_commit_rx, content)
  66     if not commit_match:
  67         raise Exception('cannot parse commit %r' % content)
  68     matches = commit_match.groupdict()
  69     return CommitInfo(tree=matches['tree'],
  70                       parents=re.findall(_parent_hash_rx, matches['parents']),
  71                       author_name=matches['author_name'],
  72                       author_mail=matches['author_mail'],
  73                       author_sec=int(matches['asec']),
  74                       author_offset=parse_tz_offset(matches['atz']),
  75                       committer_name=matches['committer_name'],
  76                       committer_mail=matches['committer_mail'],
  77                       committer_sec=int(matches['csec']),
  78                       committer_offset=parse_tz_offset(matches['ctz']),
  79                       message=matches['message'])
  80
  81
  82 def get_commit_items(id, cp):
  83     commit_it = cp.get(id)
  84     assert(commit_it.next() == 'commit')
  85     commit_content = ''.join(commit_it)
  86     return parse_commit(commit_content)
  87
  88
  89 def repo(sub = ''):
  90     """Get the path to the git repository or one of its subdirectories."""
  91     global repodir
  92     if not repodir:
  93         raise GitError('You should call check_repo_or_die()')
  94
  95     # If there's a .git subdirectory, then the actual repo is in there.
  96     gd = os.path.join(repodir, '.git')
  97     if os.path.exists(gd):
  98         repodir = gd
  99
 100     return os.path.join(repodir, sub)
 101
 102
 103 def shorten_hash(s):
 104     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 105                   r'\1\2*\3', s)
 106
 107
 108 def repo_rel(path):
 109     full = os.path.abspath(path)
 110     fullrepo = os.path.abspath(repo(''))
 111     if not fullrepo.endswith('/'):
 112         fullrepo += '/'
 113     if full.startswith(fullrepo):
 114         path = full[len(fullrepo):]
 115     if path.startswith('index-cache/'):
 116         path = path[len('index-cache/'):]
 117     return shorten_hash(path)
 118
 119
 120 def all_packdirs():
 121     paths = [repo('objects/pack')]
 122     paths += glob.glob(repo('index-cache/*/.'))
 123     return paths
 124
 125
 126 def auto_midx(objdir):
 127     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 128     try:
 129         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 130     except OSError, e:
 131         # make sure 'args' gets printed to help with debugging
 132         add_error('%r: exception: %s' % (args, e))
 133         raise
 134     if rv:
 135         add_error('%r: returned %d' % (args, rv))
 136
 137     args = [path.exe(), 'bloom', '--dir', objdir]
 138     try:
 139         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 140     except OSError, e:
 141         # make sure 'args' gets printed to help with debugging
 142         add_error('%r: exception: %s' % (args, e))
 143         raise
 144     if rv:
 145         add_error('%r: returned %d' % (args, rv))
 146
 147
 148 def mangle_name(name, mode, gitmode):
 149     """Mangle a file name to present an abstract name for segmented files.
 150     Mangled file names will have the ".bup" extension added to them. If a
 151     file's name already ends with ".bup", a ".bupl" extension is added to
 152     disambiguate normal files from segmented ones.
 153     """
 154     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 155         assert(stat.S_ISDIR(gitmode))
 156         return name + '.bup'
 157     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 158         return name + '.bupl'
 159     else:
 160         return name
 161
 162
 163 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 164 def demangle_name(name):
 165     """Remove name mangling from a file name, if necessary.
 166
 167     The return value is a tuple (demangled_filename,mode), where mode is one of
 168     the following:
 169
 170     * BUP_NORMAL  : files that should be read as-is from the repository
 171     * BUP_CHUNKED : files that were chunked and need to be reassembled
 172
 173     For more information on the name mangling algorithm, see mangle_name()
 174     """
 175     if name.endswith('.bupl'):
 176         return (name[:-5], BUP_NORMAL)
 177     elif name.endswith('.bup'):
 178         return (name[:-4], BUP_CHUNKED)
 179     else:
 180         return (name, BUP_NORMAL)
 181
 182
 183 def calc_hash(type, content):
 184     """Calculate some content's hash in the Git fashion."""
 185     header = '%s %d\0' % (type, len(content))
 186     sum = Sha1(header)
 187     sum.update(content)
 188     return sum.digest()
 189
 190
 191 def shalist_item_sort_key(ent):
 192     (mode, name, id) = ent
 193     assert(mode+0 == mode)
 194     if stat.S_ISDIR(mode):
 195         return name + '/'
 196     else:
 197         return name
 198
 199
 200 def tree_encode(shalist):
 201     """Generate a git tree object from (mode,name,hash) tuples."""
 202     shalist = sorted(shalist, key = shalist_item_sort_key)
 203     l = []
 204     for (mode,name,bin) in shalist:
 205         assert(mode)
 206         assert(mode+0 == mode)
 207         assert(name)
 208         assert(len(bin) == 20)
 209         s = '%o %s\0%s' % (mode,name,bin)
 210         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 211         l.append(s)
 212     return ''.join(l)
 213
 214
 215 def tree_decode(buf):
 216     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 217     ofs = 0
 218     while ofs < len(buf):
 219         z = buf.find('\0', ofs)
 220         assert(z > ofs)
 221         spl = buf[ofs:z].split(' ', 1)
 222         assert(len(spl) == 2)
 223         mode,name = spl
 224         sha = buf[z+1:z+1+20]
 225         ofs = z+1+20
 226         yield (int(mode, 8), name, sha)
 227
 228
 229 def _encode_packobj(type, content, compression_level=1):
 230     szout = ''
 231     sz = len(content)
 232     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 233     sz >>= 4
 234     while 1:
 235         if sz: szbits |= 0x80
 236         szout += chr(szbits)
 237         if not sz:
 238             break
 239         szbits = sz & 0x7f
 240         sz >>= 7
 241     if compression_level > 9:
 242         compression_level = 9
 243     elif compression_level < 0:
 244         compression_level = 0
 245     z = zlib.compressobj(compression_level)
 246     yield szout
 247     yield z.compress(content)
 248     yield z.flush()
 249
 250
 251 def _encode_looseobj(type, content, compression_level=1):
 252     z = zlib.compressobj(compression_level)
 253     yield z.compress('%s %d\0' % (type, len(content)))
 254     yield z.compress(content)
 255     yield z.flush()
 256
 257
 258 def _decode_looseobj(buf):
 259     assert(buf);
 260     s = zlib.decompress(buf)
 261     i = s.find('\0')
 262     assert(i > 0)
 263     l = s[:i].split(' ')
 264     type = l[0]
 265     sz = int(l[1])
 266     content = s[i+1:]
 267     assert(type in _typemap)
 268     assert(sz == len(content))
 269     return (type, content)
 270
 271
 272 def _decode_packobj(buf):
 273     assert(buf)
 274     c = ord(buf[0])
 275     type = _typermap[(c & 0x70) >> 4]
 276     sz = c & 0x0f
 277     shift = 4
 278     i = 0
 279     while c & 0x80:
 280         i += 1
 281         c = ord(buf[i])
 282         sz |= (c & 0x7f) << shift
 283         shift += 7
 284         if not (c & 0x80):
 285             break
 286     return (type, zlib.decompress(buf[i+1:]))
 287
 288
 289 class PackIdx:
 290     def __init__(self):
 291         assert(0)
 292
 293     def find_offset(self, hash):
 294         """Get the offset of an object inside the index file."""
 295         idx = self._idx_from_hash(hash)
 296         if idx != None:
 297             return self._ofs_from_idx(idx)
 298         return None
 299
 300     def exists(self, hash, want_source=False):
 301         """Return nonempty if the object exists in this index."""
 302         if hash and (self._idx_from_hash(hash) != None):
 303             return want_source and os.path.basename(self.name) or True
 304         return None
 305
 306     def __len__(self):
 307         return int(self.fanout[255])
 308
 309     def _idx_from_hash(self, hash):
 310         global _total_searches, _total_steps
 311         _total_searches += 1
 312         assert(len(hash) == 20)
 313         b1 = ord(hash[0])
 314         start = self.fanout[b1-1] # range -1..254
 315         end = self.fanout[b1] # range 0..255
 316         want = str(hash)
 317         _total_steps += 1  # lookup table is a step
 318         while start < end:
 319             _total_steps += 1
 320             mid = start + (end-start)/2
 321             v = self._idx_to_hash(mid)
 322             if v < want:
 323                 start = mid+1
 324             elif v > want:
 325                 end = mid
 326             else: # got it!
 327                 return mid
 328         return None
 329
 330
 331 class PackIdxV1(PackIdx):
 332     """Object representation of a Git pack index (version 1) file."""
 333     def __init__(self, filename, f):
 334         self.name = filename
 335         self.idxnames = [self.name]
 336         self.map = mmap_read(f)
 337         self.fanout = list(struct.unpack('!256I',
 338                                          str(buffer(self.map, 0, 256*4))))
 339         self.fanout.append(0)  # entry "-1"
 340         nsha = self.fanout[255]
 341         self.sha_ofs = 256*4
 342         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 343
 344     def _ofs_from_idx(self, idx):
 345         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 346
 347     def _idx_to_hash(self, idx):
 348         return str(self.shatable[idx*24+4 : idx*24+24])
 349
 350     def __iter__(self):
 351         for i in xrange(self.fanout[255]):
 352             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 353
 354
 355 class PackIdxV2(PackIdx):
 356     """Object representation of a Git pack index (version 2) file."""
 357     def __init__(self, filename, f):
 358         self.name = filename
 359         self.idxnames = [self.name]
 360         self.map = mmap_read(f)
 361         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 362         self.fanout = list(struct.unpack('!256I',
 363                                          str(buffer(self.map, 8, 256*4))))
 364         self.fanout.append(0)  # entry "-1"
 365         nsha = self.fanout[255]
 366         self.sha_ofs = 8 + 256*4
 367         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 368         self.ofstable = buffer(self.map,
 369                                self.sha_ofs + nsha*20 + nsha*4,
 370                                nsha*4)
 371         self.ofs64table = buffer(self.map,
 372                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 373
 374     def _ofs_from_idx(self, idx):
 375         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 376         if ofs & 0x80000000:
 377             idx64 = ofs & 0x7fffffff
 378             ofs = struct.unpack('!Q',
 379                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 380         return ofs
 381
 382     def _idx_to_hash(self, idx):
 383         return str(self.shatable[idx*20:(idx+1)*20])
 384
 385     def __iter__(self):
 386         for i in xrange(self.fanout[255]):
 387             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 388
 389
 390 _mpi_count = 0
 391 class PackIdxList:
 392     def __init__(self, dir):
 393         global _mpi_count
 394         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 395         _mpi_count += 1
 396         self.dir = dir
 397         self.also = set()
 398         self.packs = []
 399         self.do_bloom = False
 400         self.bloom = None
 401         self.refresh()
 402
 403     def __del__(self):
 404         global _mpi_count
 405         _mpi_count -= 1
 406         assert(_mpi_count == 0)
 407
 408     def __iter__(self):
 409         return iter(idxmerge(self.packs))
 410
 411     def __len__(self):
 412         return sum(len(pack) for pack in self.packs)
 413
 414     def exists(self, hash, want_source=False):
 415         """Return nonempty if the object exists in the index files."""
 416         global _total_searches
 417         _total_searches += 1
 418         if hash in self.also:
 419             return True
 420         if self.do_bloom and self.bloom:
 421             if self.bloom.exists(hash):
 422                 self.do_bloom = False
 423             else:
 424                 _total_searches -= 1  # was counted by bloom
 425                 return None
 426         for i in xrange(len(self.packs)):
 427             p = self.packs[i]
 428             _total_searches -= 1  # will be incremented by sub-pack
 429             ix = p.exists(hash, want_source=want_source)
 430             if ix:
 431                 # reorder so most recently used packs are searched first
 432                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 433                 return ix
 434         self.do_bloom = True
 435         return None
 436
 437     def refresh(self, skip_midx = False):
 438         """Refresh the index list.
 439         This method verifies if .midx files were superseded (e.g. all of its
 440         contents are in another, bigger .midx file) and removes the superseded
 441         files.
 442
 443         If skip_midx is True, all work on .midx files will be skipped and .midx
 444         files will be removed from the list.
 445
 446         The module-global variable 'ignore_midx' can force this function to
 447         always act as if skip_midx was True.
 448         """
 449         self.bloom = None # Always reopen the bloom as it may have been relaced
 450         self.do_bloom = False
 451         skip_midx = skip_midx or ignore_midx
 452         d = dict((p.name, p) for p in self.packs
 453                  if not skip_midx or not isinstance(p, midx.PackMidx))
 454         if os.path.exists(self.dir):
 455             if not skip_midx:
 456                 midxl = []
 457                 for ix in self.packs:
 458                     if isinstance(ix, midx.PackMidx):
 459                         for name in ix.idxnames:
 460                             d[os.path.join(self.dir, name)] = ix
 461                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 462                     if not d.get(full):
 463                         mx = midx.PackMidx(full)
 464                         (mxd, mxf) = os.path.split(mx.name)
 465                         broken = False
 466                         for n in mx.idxnames:
 467                             if not os.path.exists(os.path.join(mxd, n)):
 468                                 log(('warning: index %s missing\n' +
 469                                     '  used by %s\n') % (n, mxf))
 470                                 broken = True
 471                         if broken:
 472                             mx.close()
 473                             del mx
 474                             unlink(full)
 475                         else:
 476                             midxl.append(mx)
 477                 midxl.sort(key=lambda ix:
 478                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 479                 for ix in midxl:
 480                     any_needed = False
 481                     for sub in ix.idxnames:
 482                         found = d.get(os.path.join(self.dir, sub))
 483                         if not found or isinstance(found, PackIdx):
 484                             # doesn't exist, or exists but not in a midx
 485                             any_needed = True
 486                             break
 487                     if any_needed:
 488                         d[ix.name] = ix
 489                         for name in ix.idxnames:
 490                             d[os.path.join(self.dir, name)] = ix
 491                     elif not ix.force_keep:
 492                         debug1('midx: removing redundant: %s\n'
 493                                % os.path.basename(ix.name))
 494                         ix.close()
 495                         unlink(ix.name)
 496             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 497                 if not d.get(full):
 498                     try:
 499                         ix = open_idx(full)
 500                     except GitError, e:
 501                         add_error(e)
 502                         continue
 503                     d[full] = ix
 504             bfull = os.path.join(self.dir, 'bup.bloom')
 505             if self.bloom is None and os.path.exists(bfull):
 506                 self.bloom = bloom.ShaBloom(bfull)
 507             self.packs = list(set(d.values()))
 508             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 509             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 510                 self.do_bloom = True
 511             else:
 512                 self.bloom = None
 513         debug1('PackIdxList: using %d index%s.\n'
 514             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 515
 516     def add(self, hash):
 517         """Insert an additional object in the list."""
 518         self.also.add(hash)
 519
 520
 521 def open_idx(filename):
 522     if filename.endswith('.idx'):
 523         f = open(filename, 'rb')
 524         header = f.read(8)
 525         if header[0:4] == '\377tOc':
 526             version = struct.unpack('!I', header[4:8])[0]
 527             if version == 2:
 528                 return PackIdxV2(filename, f)
 529             else:
 530                 raise GitError('%s: expected idx file version 2, got %d'
 531                                % (filename, version))
 532         elif len(header) == 8 and header[0:4] < '\377tOc':
 533             return PackIdxV1(filename, f)
 534         else:
 535             raise GitError('%s: unrecognized idx file header' % filename)
 536     elif filename.endswith('.midx'):
 537         return midx.PackMidx(filename)
 538     else:
 539         raise GitError('idx filenames must end with .idx or .midx')
 540
 541
 542 def idxmerge(idxlist, final_progress=True):
 543     """Generate a list of all the objects reachable in a PackIdxList."""
 544     def pfunc(count, total):
 545         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 546                   % (count*100.0/total, count, total))
 547     def pfinal(count, total):
 548         if final_progress:
 549             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 550                      % (100, total, total))
 551     return merge_iter(idxlist, 10024, pfunc, pfinal)
 552
 553
 554 def _make_objcache():
 555     return PackIdxList(repo('objects/pack'))
 556
 557 class PackWriter:
 558     """Writes Git objects inside a pack file."""
 559     def __init__(self, objcache_maker=_make_objcache, compression_level=1):
 560         self.count = 0
 561         self.outbytes = 0
 562         self.filename = None
 563         self.file = None
 564         self.idx = None
 565         self.objcache_maker = objcache_maker
 566         self.objcache = None
 567         self.compression_level = compression_level
 568
 569     def __del__(self):
 570         self.close()
 571
 572     def _open(self):
 573         if not self.file:
 574             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 575             self.file = os.fdopen(fd, 'w+b')
 576             assert(name.endswith('.pack'))
 577             self.filename = name[:-5]
 578             self.file.write('PACK\0\0\0\2\0\0\0\0')
 579             self.idx = list(list() for i in xrange(256))
 580
 581     def _raw_write(self, datalist, sha):
 582         self._open()
 583         f = self.file
 584         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 585         # the file never has a *partial* blob.  So let's make sure it's
 586         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 587         # to our hashsplit algorithm.)  f.write() does its own buffering,
 588         # but that's okay because we'll flush it in _end().
 589         oneblob = ''.join(datalist)
 590         try:
 591             f.write(oneblob)
 592         except IOError, e:
 593             raise GitError, e, sys.exc_info()[2]
 594         nw = len(oneblob)
 595         crc = zlib.crc32(oneblob) & 0xffffffff
 596         self._update_idx(sha, crc, nw)
 597         self.outbytes += nw
 598         self.count += 1
 599         return nw, crc
 600
 601     def _update_idx(self, sha, crc, size):
 602         assert(sha)
 603         if self.idx:
 604             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 605
 606     def _write(self, sha, type, content):
 607         if verbose:
 608             log('>')
 609         if not sha:
 610             sha = calc_hash(type, content)
 611         size, crc = self._raw_write(_encode_packobj(type, content,
 612                                                     self.compression_level),
 613                                     sha=sha)
 614         if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
 615             self.breakpoint()
 616         return sha
 617
 618     def breakpoint(self):
 619         """Clear byte and object counts and return the last processed id."""
 620         id = self._end()
 621         self.outbytes = self.count = 0
 622         return id
 623
 624     def _require_objcache(self):
 625         if self.objcache is None and self.objcache_maker:
 626             self.objcache = self.objcache_maker()
 627         if self.objcache is None:
 628             raise GitError(
 629                     "PackWriter not opened or can't check exists w/o objcache")
 630
 631     def exists(self, id, want_source=False):
 632         """Return non-empty if an object is found in the object cache."""
 633         self._require_objcache()
 634         return self.objcache.exists(id, want_source=want_source)
 635
 636     def maybe_write(self, type, content):
 637         """Write an object to the pack file if not present and return its id."""
 638         sha = calc_hash(type, content)
 639         if not self.exists(sha):
 640             self._write(sha, type, content)
 641             self._require_objcache()
 642             self.objcache.add(sha)
 643         return sha
 644
 645     def new_blob(self, blob):
 646         """Create a blob object in the pack with the supplied content."""
 647         return self.maybe_write('blob', blob)
 648
 649     def new_tree(self, shalist):
 650         """Create a tree object in the pack."""
 651         content = tree_encode(shalist)
 652         return self.maybe_write('tree', content)
 653
 654     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 655         l = []
 656         if tree: l.append('tree %s' % tree.encode('hex'))
 657         if parent: l.append('parent %s' % parent.encode('hex'))
 658         if author: l.append('author %s %s' % (author, _git_date(adate)))
 659         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 660         l.append('')
 661         l.append(msg)
 662         return self.maybe_write('commit', '\n'.join(l))
 663
 664     def new_commit(self, parent, tree, date, msg):
 665         """Create a commit object in the pack."""
 666         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 667         commit = self._new_commit(tree, parent,
 668                                   userline, date, userline, date,
 669                                   msg)
 670         return commit
 671
 672     def abort(self):
 673         """Remove the pack file from disk."""
 674         f = self.file
 675         if f:
 676             self.idx = None
 677             self.file = None
 678             f.close()
 679             os.unlink(self.filename + '.pack')
 680
 681     def _end(self, run_midx=True):
 682         f = self.file
 683         if not f: return None
 684         self.file = None
 685         self.objcache = None
 686         idx = self.idx
 687         self.idx = None
 688
 689         # update object count
 690         f.seek(8)
 691         cp = struct.pack('!i', self.count)
 692         assert(len(cp) == 4)
 693         f.write(cp)
 694
 695         # calculate the pack sha1sum
 696         f.seek(0)
 697         sum = Sha1()
 698         for b in chunkyreader(f):
 699             sum.update(b)
 700         packbin = sum.digest()
 701         f.write(packbin)
 702         f.close()
 703
 704         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 705
 706         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 707         if os.path.exists(self.filename + '.map'):
 708             os.unlink(self.filename + '.map')
 709         os.rename(self.filename + '.pack', nameprefix + '.pack')
 710         os.rename(self.filename + '.idx', nameprefix + '.idx')
 711
 712         if run_midx:
 713             auto_midx(repo('objects/pack'))
 714         return nameprefix
 715
 716     def close(self, run_midx=True):
 717         """Close the pack file and move it to its definitive path."""
 718         return self._end(run_midx=run_midx)
 719
 720     def _write_pack_idx_v2(self, filename, idx, packbin):
 721         ofs64_count = 0
 722         for section in idx:
 723             for entry in section:
 724                 if entry[2] >= 2**31:
 725                     ofs64_count += 1
 726
 727         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 728         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 729         idx_map = None
 730         idx_f = open(filename, 'w+b')
 731         try:
 732             idx_f.truncate(index_len)
 733             idx_map = mmap_readwrite(idx_f, close=False)
 734             count = _helpers.write_idx(filename, idx_map, idx, self.count)
 735             assert(count == self.count)
 736         finally:
 737             if idx_map: idx_map.close()
 738             idx_f.close()
 739
 740         idx_f = open(filename, 'a+b')
 741         try:
 742             idx_f.write(packbin)
 743             idx_f.seek(0)
 744             idx_sum = Sha1()
 745             b = idx_f.read(8 + 4*256)
 746             idx_sum.update(b)
 747
 748             obj_list_sum = Sha1()
 749             for b in chunkyreader(idx_f, 20*self.count):
 750                 idx_sum.update(b)
 751                 obj_list_sum.update(b)
 752             namebase = obj_list_sum.hexdigest()
 753
 754             for b in chunkyreader(idx_f):
 755                 idx_sum.update(b)
 756             idx_f.write(idx_sum.digest())
 757             return namebase
 758         finally:
 759             idx_f.close()
 760
 761
 762 def _git_date(date):
 763     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 764
 765
 766 def _gitenv():
 767     os.environ['GIT_DIR'] = os.path.abspath(repo())
 768
 769
 770 def list_refs(refname = None):
 771     """Generate a list of tuples in the form (refname,hash).
 772     If a ref name is specified, list only this particular ref.
 773     """
 774     argv = ['git', 'show-ref', '--']
 775     if refname:
 776         argv += [refname]
 777     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 778     out = p.stdout.read().strip()
 779     rv = p.wait()  # not fatal
 780     if rv:
 781         assert(not out)
 782     if out:
 783         for d in out.split('\n'):
 784             (sha, name) = d.split(' ', 1)
 785             yield (name, sha.decode('hex'))
 786
 787
 788 def read_ref(refname):
 789     """Get the commit id of the most recent commit made on a given ref."""
 790     l = list(list_refs(refname))
 791     if l:
 792         assert(len(l) == 1)
 793         return l[0][1]
 794     else:
 795         return None
 796
 797
 798 def rev_list(ref, count=None):
 799     """Generate a list of reachable commits in reverse chronological order.
 800
 801     This generator walks through commits, from child to parent, that are
 802     reachable via the specified ref and yields a series of tuples of the form
 803     (date,hash).
 804
 805     If count is a non-zero integer, limit the number of commits to "count"
 806     objects.
 807     """
 808     assert(not ref.startswith('-'))
 809     opts = []
 810     if count:
 811         opts += ['-n', str(atoi(count))]
 812     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 813     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 814     commit = None
 815     for row in p.stdout:
 816         s = row.strip()
 817         if s.startswith('commit '):
 818             commit = s[7:].decode('hex')
 819         else:
 820             date = int(s)
 821             yield (date, commit)
 822     rv = p.wait()  # not fatal
 823     if rv:
 824         raise GitError, 'git rev-list returned error %d' % rv
 825
 826
 827 def get_commit_dates(refs):
 828     """Get the dates for the specified commit refs.  For now, every unique
 829        string in refs must resolve to a different commit or this
 830        function will fail."""
 831     result = []
 832     for ref in refs:
 833         commit = get_commit_items(ref, cp())
 834         result.append(commit.author_sec)
 835     return result
 836
 837
 838 def rev_parse(committish):
 839     """Resolve the full hash for 'committish', if it exists.
 840
 841     Should be roughly equivalent to 'git rev-parse'.
 842
 843     Returns the hex value of the hash if it is found, None if 'committish' does
 844     not correspond to anything.
 845     """
 846     head = read_ref(committish)
 847     if head:
 848         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 849         return head
 850
 851     pL = PackIdxList(repo('objects/pack'))
 852
 853     if len(committish) == 40:
 854         try:
 855             hash = committish.decode('hex')
 856         except TypeError:
 857             return None
 858
 859         if pL.exists(hash):
 860             return hash
 861
 862     return None
 863
 864
 865 def update_ref(refname, newval, oldval):
 866     """Change the commit pointed to by a branch."""
 867     if not oldval:
 868         oldval = ''
 869     assert(refname.startswith('refs/heads/'))
 870     p = subprocess.Popen(['git', 'update-ref', refname,
 871                           newval.encode('hex'), oldval.encode('hex')],
 872                          preexec_fn = _gitenv)
 873     _git_wait('git update-ref', p)
 874
 875
 876 def guess_repo(path=None):
 877     """Set the path value in the global variable "repodir".
 878     This makes bup look for an existing bup repository, but not fail if a
 879     repository doesn't exist. Usually, if you are interacting with a bup
 880     repository, you would not be calling this function but using
 881     check_repo_or_die().
 882     """
 883     global repodir
 884     if path:
 885         repodir = path
 886     if not repodir:
 887         repodir = os.environ.get('BUP_DIR')
 888         if not repodir:
 889             repodir = os.path.expanduser('~/.bup')
 890
 891
 892 def init_repo(path=None):
 893     """Create the Git bare repository for bup in a given path."""
 894     guess_repo(path)
 895     d = repo()  # appends a / to the path
 896     parent = os.path.dirname(os.path.dirname(d))
 897     if parent and not os.path.exists(parent):
 898         raise GitError('parent directory "%s" does not exist\n' % parent)
 899     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 900         raise GitError('"%s" exists but is not a directory\n' % d)
 901     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 902                          preexec_fn = _gitenv)
 903     _git_wait('git init', p)
 904     # Force the index version configuration in order to ensure bup works
 905     # regardless of the version of the installed Git binary.
 906     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 907                          stdout=sys.stderr, preexec_fn = _gitenv)
 908     _git_wait('git config', p)
 909     # Enable the reflog
 910     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 911                          stdout=sys.stderr, preexec_fn = _gitenv)
 912     _git_wait('git config', p)
 913
 914
 915 def check_repo_or_die(path=None):
 916     """Make sure a bup repository exists, and abort if not.
 917     If the path to a particular repository was not specified, this function
 918     initializes the default repository automatically.
 919     """
 920     guess_repo(path)
 921     try:
 922         os.stat(repo('objects/pack/.'))
 923     except OSError, e:
 924         if e.errno == errno.ENOENT:
 925             log('error: %r is not a bup repository; run "bup init"\n'
 926                 % repo())
 927             sys.exit(15)
 928         else:
 929             log('error: %s\n' % e)
 930             sys.exit(14)
 931
 932
 933 _ver = None
 934 def ver():
 935     """Get Git's version and ensure a usable version is installed.
 936
 937     The returned version is formatted as an ordered tuple with each position
 938     representing a digit in the version tag. For example, the following tuple
 939     would represent version 1.6.6.9:
 940
 941         ('1', '6', '6', '9')
 942     """
 943     global _ver
 944     if not _ver:
 945         p = subprocess.Popen(['git', '--version'],
 946                              stdout=subprocess.PIPE)
 947         gvs = p.stdout.read()
 948         _git_wait('git --version', p)
 949         m = re.match(r'git version (\S+.\S+)', gvs)
 950         if not m:
 951             raise GitError('git --version weird output: %r' % gvs)
 952         _ver = tuple(m.group(1).split('.'))
 953     needed = ('1','5', '3', '1')
 954     if _ver < needed:
 955         raise GitError('git version %s or higher is required; you have %s'
 956                        % ('.'.join(needed), '.'.join(_ver)))
 957     return _ver
 958
 959
 960 def _git_wait(cmd, p):
 961     rv = p.wait()
 962     if rv != 0:
 963         raise GitError('%s returned %d' % (cmd, rv))
 964
 965
 966 def _git_capture(argv):
 967     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
 968     r = p.stdout.read()
 969     _git_wait(repr(argv), p)
 970     return r
 971
 972
 973 class _AbortableIter:
 974     def __init__(self, it, onabort = None):
 975         self.it = it
 976         self.onabort = onabort
 977         self.done = None
 978
 979     def __iter__(self):
 980         return self
 981
 982     def next(self):
 983         try:
 984             return self.it.next()
 985         except StopIteration, e:
 986             self.done = True
 987             raise
 988         except:
 989             self.abort()
 990             raise
 991
 992     def abort(self):
 993         """Abort iteration and call the abortion callback, if needed."""
 994         if not self.done:
 995             self.done = True
 996             if self.onabort:
 997                 self.onabort()
 998
 999     def __del__(self):
1000         self.abort()
1001
1002
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    Depending on the installed git version, objects are read either through
    one long-running 'git cat-file --batch' process (_fast_get) or through
    separate 'git cat-file' processes per object (_slow_get).  The chosen
    implementation is available as self.get(id); it is a generator that
    yields the object type first and then the object's content in chunks.
    """
    def __init__(self):
        global _ver_warned
        wanted = ('1','5','6')
        # ver() returns a tuple of strings, so this is a component-wise
        # string comparison.
        if ver() < wanted:
            if not _ver_warned:
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            self.p = self.inprogress = None
            self.get = self._fast_get

    def _abort(self):
        """Close the pipes to the batch process, if any, and forget it."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        """Drop any current batch process and start a new one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv)

    def _fast_get(self, id):
        """Yield the type and then the content of 'id' via the batch process.

        Only one object may be read at a time: the previous generator must
        be exhausted before the next one is started.

        Raises KeyError if git reports the object as missing, and GitError
        if the header line does not have the expected three fields.
        """
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        # The id is sent as a single line, so it must not contain line
        # breaks, and it must not look like an option.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        # Header fields are: 40-character id, object type, content size.
        objtype = spl[1]
        size = int(spl[2])

        # If reading stops half-way the pipe still holds unread content,
        # so the batch process is dropped via self._abort in that case.
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                           onabort = self._abort)
        try:
            yield objtype
            for blob in it:
                yield blob
            # The content is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type and then the content of 'id', one process each.

        Runs 'git cat-file -t' to get the type and a second 'git cat-file'
        to stream the content.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        objtype = _git_capture(['git', 'cat-file', '-t', id]).strip()
        yield objtype

        p = subprocess.Popen(['git', 'cat-file', objtype, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield all blob data reachable from the object read through 'it'.

        'it' is an iterator as produced by self.get(): the object type
        followed by the content chunks.  Trees and commits are followed
        recursively through self.join().  Raises GitError for any type
        other than blob, tree or commit.
        """
        objtype = next(it)
        if objtype == 'blob':
            for blob in it:
                yield blob
        elif objtype == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif objtype == 'commit':
            # The first line of a commit names its tree: "tree <hex id>".
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % objtype)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            # Logged instead of being propagated to the caller.
            log('booger!\n')
1117
1118
_cp = (None, None)

def cp():
    """Create a CatPipe object or reuse an already existing one.

    The last CatPipe is cached in the global "_cp" together with the real
    path of the repository it was created for.  It is reused as long as
    os.path.realpath(repo()) stays the same; otherwise a new CatPipe is
    created and cached.
    """
    global _cp
    (cached_dir, pipe) = _cp
    here = os.path.realpath(repo())
    if here == cached_dir:
        return pipe
    pipe = CatPipe()
    _cp = (here, pipe)
    return pipe
1130
1131
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Refs come from list_refs(); only names starting with 'refs/tags/' are
    kept, with that prefix removed.  Several tags may point at the same
    hash, so every value is a list of names.
    """
    prefix = 'refs/tags/'
    by_commit = {}
    for (refname, sha) in list_refs():
        if not refname.startswith(prefix):
            continue
        tagname = refname[len(prefix):]
        if sha in by_commit:
            by_commit[sha].append(tagname)
        else:
            by_commit[sha] = [tagname]
    return by_commit