lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from collections import namedtuple
   7 from itertools import islice
   8
   9 from bup.helpers import *
  10 from bup import _helpers, path, midx, bloom, xstat
  11
  12 max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
  13 max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
  14
  15 verbose = 0
  16 ignore_midx = 0
  17 repodir = None
  18
  19 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  20 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  21
  22 _total_searches = 0
  23 _total_steps = 0
  24
  25
  26 class GitError(Exception):
  27     pass
  28
  29
  30 def parse_tz_offset(s):
  31     """UTC offset in seconds."""
  32     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  33     if s[0] == '-':
  34         return - tz_off
  35     return tz_off
  36
  37
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
#
# Building blocks for _commit_rx, which matches the full text of a commit
# object.  A "safe string" (used for both the name and the mail fields) is
# either one or two _start_end_char characters, or a run of _content_char
# characters that begins and ends with a _start_end_char.  Note that an
# empty name or mail address therefore does not match.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
_content_char = r'[^\0\n<>]'
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone: a sign, two digits, then two digits whose first is 0-5.
_tz_rx = r'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" line, including its newline.
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Layout matched: the tree line, zero or more parent lines, the author and
# committer lines (name, <mail>, seconds, timezone), one blank line, then
# the message, which may itself contain newlines.
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Extracts the individual hashes from the 'parents' group of _commit_rx.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  55
  56
  57 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
  58 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
  59                                        'author_name', 'author_mail',
  60                                        'author_sec', 'author_offset',
  61                                        'committer_name', 'committer_mail',
  62                                        'committer_sec', 'committer_offset',
  63                                        'message'])
  64
  65 def parse_commit(content):
  66     commit_match = re.match(_commit_rx, content)
  67     if not commit_match:
  68         raise Exception('cannot parse commit %r' % content)
  69     matches = commit_match.groupdict()
  70     return CommitInfo(tree=matches['tree'],
  71                       parents=re.findall(_parent_hash_rx, matches['parents']),
  72                       author_name=matches['author_name'],
  73                       author_mail=matches['author_mail'],
  74                       author_sec=int(matches['asec']),
  75                       author_offset=parse_tz_offset(matches['atz']),
  76                       committer_name=matches['committer_name'],
  77                       committer_mail=matches['committer_mail'],
  78                       committer_sec=int(matches['csec']),
  79                       committer_offset=parse_tz_offset(matches['ctz']),
  80                       message=matches['message'])
  81
  82
  83 def get_commit_items(id, cp):
  84     commit_it = cp.get(id)
  85     assert(commit_it.next() == 'commit')
  86     commit_content = ''.join(commit_it)
  87     return parse_commit(commit_content)
  88
  89
  90 def repo(sub = '', repo_dir=None):
  91     """Get the path to the git repository or one of its subdirectories."""
  92     global repodir
  93     repo_dir = repo_dir or repodir
  94     if not repo_dir:
  95         raise GitError('You should call check_repo_or_die()')
  96
  97     # If there's a .git subdirectory, then the actual repo is in there.
  98     gd = os.path.join(repo_dir, '.git')
  99     if os.path.exists(gd):
 100         repodir = gd
 101
 102     return os.path.join(repo_dir, sub)
 103
 104
 105 def shorten_hash(s):
 106     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 107                   r'\1\2*\3', s)
 108
 109
 110 def repo_rel(path):
 111     full = os.path.abspath(path)
 112     fullrepo = os.path.abspath(repo(''))
 113     if not fullrepo.endswith('/'):
 114         fullrepo += '/'
 115     if full.startswith(fullrepo):
 116         path = full[len(fullrepo):]
 117     if path.startswith('index-cache/'):
 118         path = path[len('index-cache/'):]
 119     return shorten_hash(path)
 120
 121
 122 def all_packdirs():
 123     paths = [repo('objects/pack')]
 124     paths += glob.glob(repo('index-cache/*/.'))
 125     return paths
 126
 127
 128 def auto_midx(objdir):
 129     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 130     try:
 131         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 132     except OSError, e:
 133         # make sure 'args' gets printed to help with debugging
 134         add_error('%r: exception: %s' % (args, e))
 135         raise
 136     if rv:
 137         add_error('%r: returned %d' % (args, rv))
 138
 139     args = [path.exe(), 'bloom', '--dir', objdir]
 140     try:
 141         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 142     except OSError, e:
 143         # make sure 'args' gets printed to help with debugging
 144         add_error('%r: exception: %s' % (args, e))
 145         raise
 146     if rv:
 147         add_error('%r: returned %d' % (args, rv))
 148
 149
 150 def mangle_name(name, mode, gitmode):
 151     """Mangle a file name to present an abstract name for segmented files.
 152     Mangled file names will have the ".bup" extension added to them. If a
 153     file's name already ends with ".bup", a ".bupl" extension is added to
 154     disambiguate normal files from segmented ones.
 155     """
 156     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 157         assert(stat.S_ISDIR(gitmode))
 158         return name + '.bup'
 159     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 160         return name + '.bupl'
 161     else:
 162         return name
 163
 164
 165 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 166 def demangle_name(name, mode):
 167     """Remove name mangling from a file name, if necessary.
 168
 169     The return value is a tuple (demangled_filename,mode), where mode is one of
 170     the following:
 171
 172     * BUP_NORMAL  : files that should be read as-is from the repository
 173     * BUP_CHUNKED : files that were chunked and need to be reassembled
 174
 175     For more information on the name mangling algorithm, see mangle_name()
 176     """
 177     if name.endswith('.bupl'):
 178         return (name[:-5], BUP_NORMAL)
 179     elif name.endswith('.bup'):
 180         return (name[:-4], BUP_CHUNKED)
 181     elif name.endswith('.bupm'):
 182         return (name[:-5],
 183                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 184     else:
 185         return (name, BUP_NORMAL)
 186
 187
 188 def calc_hash(type, content):
 189     """Calculate some content's hash in the Git fashion."""
 190     header = '%s %d\0' % (type, len(content))
 191     sum = Sha1(header)
 192     sum.update(content)
 193     return sum.digest()
 194
 195
 196 def shalist_item_sort_key(ent):
 197     (mode, name, id) = ent
 198     assert(mode+0 == mode)
 199     if stat.S_ISDIR(mode):
 200         return name + '/'
 201     else:
 202         return name
 203
 204
 205 def tree_encode(shalist):
 206     """Generate a git tree object from (mode,name,hash) tuples."""
 207     shalist = sorted(shalist, key = shalist_item_sort_key)
 208     l = []
 209     for (mode,name,bin) in shalist:
 210         assert(mode)
 211         assert(mode+0 == mode)
 212         assert(name)
 213         assert(len(bin) == 20)
 214         s = '%o %s\0%s' % (mode,name,bin)
 215         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 216         l.append(s)
 217     return ''.join(l)
 218
 219
 220 def tree_decode(buf):
 221     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 222     ofs = 0
 223     while ofs < len(buf):
 224         z = buf.find('\0', ofs)
 225         assert(z > ofs)
 226         spl = buf[ofs:z].split(' ', 1)
 227         assert(len(spl) == 2)
 228         mode,name = spl
 229         sha = buf[z+1:z+1+20]
 230         ofs = z+1+20
 231         yield (int(mode, 8), name, sha)
 232
 233
 234 def _encode_packobj(type, content, compression_level=1):
 235     szout = ''
 236     sz = len(content)
 237     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 238     sz >>= 4
 239     while 1:
 240         if sz: szbits |= 0x80
 241         szout += chr(szbits)
 242         if not sz:
 243             break
 244         szbits = sz & 0x7f
 245         sz >>= 7
 246     if compression_level > 9:
 247         compression_level = 9
 248     elif compression_level < 0:
 249         compression_level = 0
 250     z = zlib.compressobj(compression_level)
 251     yield szout
 252     yield z.compress(content)
 253     yield z.flush()
 254
 255
 256 def _encode_looseobj(type, content, compression_level=1):
 257     z = zlib.compressobj(compression_level)
 258     yield z.compress('%s %d\0' % (type, len(content)))
 259     yield z.compress(content)
 260     yield z.flush()
 261
 262
 263 def _decode_looseobj(buf):
 264     assert(buf);
 265     s = zlib.decompress(buf)
 266     i = s.find('\0')
 267     assert(i > 0)
 268     l = s[:i].split(' ')
 269     type = l[0]
 270     sz = int(l[1])
 271     content = s[i+1:]
 272     assert(type in _typemap)
 273     assert(sz == len(content))
 274     return (type, content)
 275
 276
 277 def _decode_packobj(buf):
 278     assert(buf)
 279     c = ord(buf[0])
 280     type = _typermap[(c & 0x70) >> 4]
 281     sz = c & 0x0f
 282     shift = 4
 283     i = 0
 284     while c & 0x80:
 285         i += 1
 286         c = ord(buf[i])
 287         sz |= (c & 0x7f) << shift
 288         shift += 7
 289         if not (c & 0x80):
 290             break
 291     return (type, zlib.decompress(buf[i+1:]))
 292
 293
 294 class PackIdx:
 295     def __init__(self):
 296         assert(0)
 297
 298     def find_offset(self, hash):
 299         """Get the offset of an object inside the index file."""
 300         idx = self._idx_from_hash(hash)
 301         if idx != None:
 302             return self._ofs_from_idx(idx)
 303         return None
 304
 305     def exists(self, hash, want_source=False):
 306         """Return nonempty if the object exists in this index."""
 307         if hash and (self._idx_from_hash(hash) != None):
 308             return want_source and os.path.basename(self.name) or True
 309         return None
 310
 311     def __len__(self):
 312         return int(self.fanout[255])
 313
 314     def _idx_from_hash(self, hash):
 315         global _total_searches, _total_steps
 316         _total_searches += 1
 317         assert(len(hash) == 20)
 318         b1 = ord(hash[0])
 319         start = self.fanout[b1-1] # range -1..254
 320         end = self.fanout[b1] # range 0..255
 321         want = str(hash)
 322         _total_steps += 1  # lookup table is a step
 323         while start < end:
 324             _total_steps += 1
 325             mid = start + (end-start)/2
 326             v = self._idx_to_hash(mid)
 327             if v < want:
 328                 start = mid+1
 329             elif v > want:
 330                 end = mid
 331             else: # got it!
 332                 return mid
 333         return None
 334
 335
 336 class PackIdxV1(PackIdx):
 337     """Object representation of a Git pack index (version 1) file."""
 338     def __init__(self, filename, f):
 339         self.name = filename
 340         self.idxnames = [self.name]
 341         self.map = mmap_read(f)
 342         self.fanout = list(struct.unpack('!256I',
 343                                          str(buffer(self.map, 0, 256*4))))
 344         self.fanout.append(0)  # entry "-1"
 345         nsha = self.fanout[255]
 346         self.sha_ofs = 256*4
 347         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 348
 349     def _ofs_from_idx(self, idx):
 350         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 351
 352     def _idx_to_hash(self, idx):
 353         return str(self.shatable[idx*24+4 : idx*24+24])
 354
 355     def __iter__(self):
 356         for i in xrange(self.fanout[255]):
 357             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 358
 359
 360 class PackIdxV2(PackIdx):
 361     """Object representation of a Git pack index (version 2) file."""
 362     def __init__(self, filename, f):
 363         self.name = filename
 364         self.idxnames = [self.name]
 365         self.map = mmap_read(f)
 366         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 367         self.fanout = list(struct.unpack('!256I',
 368                                          str(buffer(self.map, 8, 256*4))))
 369         self.fanout.append(0)  # entry "-1"
 370         nsha = self.fanout[255]
 371         self.sha_ofs = 8 + 256*4
 372         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 373         self.ofstable = buffer(self.map,
 374                                self.sha_ofs + nsha*20 + nsha*4,
 375                                nsha*4)
 376         self.ofs64table = buffer(self.map,
 377                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 378
 379     def _ofs_from_idx(self, idx):
 380         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 381         if ofs & 0x80000000:
 382             idx64 = ofs & 0x7fffffff
 383             ofs = struct.unpack('!Q',
 384                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 385         return ofs
 386
 387     def _idx_to_hash(self, idx):
 388         return str(self.shatable[idx*20:(idx+1)*20])
 389
 390     def __iter__(self):
 391         for i in xrange(self.fanout[255]):
 392             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 393
 394
# Number of live PackIdxList instances; the class asserts that there is
# never more than one at a time.
_mpi_count = 0
class PackIdxList:
    """All the indexes (.idx and .midx) of one directory, searchable as a
    single collection, with an optional bloom filter in front of them."""
    def __init__(self, dir):
        """Load the indexes found in dir (see refresh())."""
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()  # hashes registered through add()
        self.packs = []  # open index objects, reordered by exists()
        self.do_bloom = False  # whether exists() consults the bloom first
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the merged contents of all loaded indexes."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Return the sum of the lengths of all loaded indexes."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Hashes registered with add() yield True.  Otherwise the result is
        whatever the matching index's exists() returns, or None when no
        index has the hash.
        """
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Possible hit: search the indexes, and skip the bloom
                # until a lookup misses again.
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # Miss: let the bloom filter screen the following lookups.
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.

        .midx files that reference a missing index are deleted, .idx files
        that cannot be opened are reported with add_error() and skipped,
        and the bloom filter is reopened on every call.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file path to the index object that covers it; it starts
        # from the currently loaded indexes (minus the midxes when skipping).
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # Every index named by an already loaded midx is covered
                # by that midx.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open the midx files that are not loaded yet; delete the
                # ones that refer to an index that no longer exists.
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the largest midx first; among equal sizes, the
                # most recently modified one.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        ix.close()
                        unlink(ix.name)
            # Open every .idx that is neither loaded nor covered by a midx.
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError, e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # Several paths can map to the same midx object, hence the set.
            # Largest indexes come first.
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # Only use the bloom when it is valid and holds at least as
            # many entries as the indexes do.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 524
 525
 526 def open_idx(filename):
 527     if filename.endswith('.idx'):
 528         f = open(filename, 'rb')
 529         header = f.read(8)
 530         if header[0:4] == '\377tOc':
 531             version = struct.unpack('!I', header[4:8])[0]
 532             if version == 2:
 533                 return PackIdxV2(filename, f)
 534             else:
 535                 raise GitError('%s: expected idx file version 2, got %d'
 536                                % (filename, version))
 537         elif len(header) == 8 and header[0:4] < '\377tOc':
 538             return PackIdxV1(filename, f)
 539         else:
 540             raise GitError('%s: unrecognized idx file header' % filename)
 541     elif filename.endswith('.midx'):
 542         return midx.PackMidx(filename)
 543     else:
 544         raise GitError('idx filenames must end with .idx or .midx')
 545
 546
 547 def idxmerge(idxlist, final_progress=True):
 548     """Generate a list of all the objects reachable in a PackIdxList."""
 549     def pfunc(count, total):
 550         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 551                   % (count*100.0/total, count, total))
 552     def pfinal(count, total):
 553         if final_progress:
 554             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 555                      % (100, total, total))
 556     return merge_iter(idxlist, 10024, pfunc, pfinal)
 557
 558
 559 def _make_objcache():
 560     return PackIdxList(repo('objects/pack'))
 561
class PackWriter:
    """Writes Git objects inside a pack file.

    Objects are appended to a temporary pack under the repository's
    'objects' directory.  When the pack is finished (close(), breakpoint(),
    or automatically once max_pack_size / max_pack_objects is reached), a
    version 2 index is written next to it and both files are renamed into
    objects/pack.
    """
    def __init__(self, objcache_maker=_make_objcache, compression_level=1):
        """objcache_maker is a callable returning the object cache used by
        exists() and maybe_write(); it is only called when first needed."""
        self.count = 0  # objects written to the current pack
        self.outbytes = 0  # bytes written to the current pack
        self.filename = None  # temporary pack path without '.pack'
        self.file = None
        self.idx = None  # 256 lists of (sha, crc, offset), by first sha byte
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level

    def __del__(self):
        self.close()

    def _open(self):
        """Create the temporary pack file, unless one is already open."""
        if not self.file:
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            # Pack header with an object count of 0; _end() overwrites the
            # count at offset 8.
            self.file.write('PACK\0\0\0\2\0\0\0\0')
            self.idx = list(list() for i in xrange(256))

    def _raw_write(self, datalist, sha):
        """Append the joined pieces of datalist to the pack as one object
        and record it in the index.  Returns (bytes written, crc32).

        An IOError from the write is re-raised as GitError.
        """
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError, e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        """Record an object of `size` bytes that was just written."""
        assert(sha)
        if self.idx:
            # The object ends at the current file position, so it starts
            # `size` bytes earlier.
            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        """Write an object unconditionally and return its sha, computing
        the sha first when none is given.  Finishes the current pack once
        a size or object-count limit has been reached."""
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Finish the current pack, reset the byte and object counts, and
        return what _end() returned (the path prefix of the finished pack,
        or None when no pack was open)."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        """Create the object cache if needed; raise GitError when there is
        no way to obtain one."""
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            # _write() may have finished the pack, which drops the object
            # cache; make sure there is one before registering the sha.
            self._require_objcache()
            self.objcache.add(sha)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write('tree', content)

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        """Build the text of a commit from the given fields (skipping the
        ones that are empty) and write it.  tree and parent are binary
        hashes; they are hex-encoded here."""
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack.

        The current user (as reported by userfullname(), username() and
        hostname()) is used as both author and committer, with the same
        date for both.
        """
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
                                  msg)
        return commit

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            self.idx = None
            self.file = None
            f.close()
            os.unlink(self.filename + '.pack')

    def _end(self, run_midx=True):
        """Finish the current pack and return the path prefix (without
        extension) of the resulting files, or None when no pack is open.

        Patches the object count into the header, appends the pack
        checksum, writes the index, renames both files into objects/pack
        and, unless run_midx is false, runs auto_midx() on that directory.
        """
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None
        idx = self.idx
        self.idx = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        for b in chunkyreader(f):
            sum.update(b)
        packbin = sum.digest()
        f.write(packbin)
        f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)

        # The final name is derived from the hash of the sorted object list.
        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        if run_midx:
            auto_midx(repo('objects/pack'))
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a version 2 index for the finished pack to filename.

        idx holds the per-first-byte lists of (sha, crc, offset) entries
        and packbin is the pack's checksum.  Returns the hex SHA-1 of the
        index's hash table, which _end() uses to name the pack.
        """
        # Offsets of 2**31 and above need an entry in the 64-bit table.
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        # (28 bytes per object: 20 for the sha, 4 for the crc, 4 for the
        # 32-bit offset.)
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            idx_map = mmap_readwrite(idx_f, close=False)
            count = _helpers.write_idx(filename, idx_map, idx, self.count)
            assert(count == self.count)
        finally:
            if idx_map: idx_map.close()
            idx_f.close()

        # Reopen to append the two trailing checksums: the pack's, then
        # the index's own.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            # The hash table feeds both the index checksum and the name.
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            return namebase
        finally:
            idx_f.close()
 765
 766
 767 def _git_date(date):
 768     return '%d %s' % (date, utc_offset_str(date))
 769
 770
 771 def _gitenv(repo_dir = None):
 772     if not repo_dir:
 773         repo_dir = repo()
 774     def env():
 775         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 776     return env
 777
 778
 779 def list_refs(refname=None, repo_dir=None,
 780               limit_to_heads=False, limit_to_tags=False):
 781     """Yield (refname, hash) tuples for all repository refs unless a ref
 782     name is specified.  Given a ref name, only include tuples for that
 783     particular ref.  The limits restrict the result items to
 784     refs/heads or refs/tags.  If both limits are specified, items from
 785     both sources will be included.
 786
 787     """
 788     argv = ['git', 'show-ref']
 789     if limit_to_heads:
 790         argv.append('--heads')
 791     if limit_to_tags:
 792         argv.append('--tags')
 793     argv.append('--')
 794     if refname:
 795         argv += [refname]
 796     p = subprocess.Popen(argv,
 797                          preexec_fn = _gitenv(repo_dir),
 798                          stdout = subprocess.PIPE)
 799     out = p.stdout.read().strip()
 800     rv = p.wait()  # not fatal
 801     if rv:
 802         assert(not out)
 803     if out:
 804         for d in out.split('\n'):
 805             (sha, name) = d.split(' ', 1)
 806             yield (name, sha.decode('hex'))
 807
 808
 809 def read_ref(refname, repo_dir = None):
 810     """Get the commit id of the most recent commit made on a given ref."""
 811     refs = list_refs(refname, repo_dir=repo_dir, limit_to_heads=True)
 812     l = tuple(islice(refs, 2))
 813     if l:
 814         assert(len(l) == 1)
 815         return l[0][1]
 816     else:
 817         return None
 818
 819
 820 def rev_list(ref, count=None, repo_dir=None):
 821     """Generate a list of reachable commits in reverse chronological order.
 822
 823     This generator walks through commits, from child to parent, that are
 824     reachable via the specified ref and yields a series of tuples of the form
 825     (date,hash).
 826
 827     If count is a non-zero integer, limit the number of commits to "count"
 828     objects.
 829     """
 830     assert(not ref.startswith('-'))
 831     opts = []
 832     if count:
 833         opts += ['-n', str(atoi(count))]
 834     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 835     p = subprocess.Popen(argv,
 836                          preexec_fn = _gitenv(repo_dir),
 837                          stdout = subprocess.PIPE)
 838     commit = None
 839     for row in p.stdout:
 840         s = row.strip()
 841         if s.startswith('commit '):
 842             commit = s[7:].decode('hex')
 843         else:
 844             date = int(s)
 845             yield (date, commit)
 846     rv = p.wait()  # not fatal
 847     if rv:
 848         raise GitError, 'git rev-list returned error %d' % rv
 849
 850
 851 def get_commit_dates(refs, repo_dir=None):
 852     """Get the dates for the specified commit refs.  For now, every unique
 853        string in refs must resolve to a different commit or this
 854        function will fail."""
 855     result = []
 856     for ref in refs:
 857         commit = get_commit_items(ref, cp(repo_dir))
 858         result.append(commit.author_sec)
 859     return result
 860
 861
 862 def rev_parse(committish, repo_dir=None):
 863     """Resolve the full hash for 'committish', if it exists.
 864
 865     Should be roughly equivalent to 'git rev-parse'.
 866
 867     Returns the hex value of the hash if it is found, None if 'committish' does
 868     not correspond to anything.
 869     """
 870     head = read_ref(committish, repo_dir=repo_dir)
 871     if head:
 872         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 873         return head
 874
 875     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
 876
 877     if len(committish) == 40:
 878         try:
 879             hash = committish.decode('hex')
 880         except TypeError:
 881             return None
 882
 883         if pL.exists(hash):
 884             return hash
 885
 886     return None
 887
 888
 889 def update_ref(refname, newval, oldval, repo_dir=None):
 890     """Update a repository reference."""
 891     if not oldval:
 892         oldval = ''
 893     assert(refname.startswith('refs/heads/') \
 894            or refname.startswith('refs/tags/'))
 895     p = subprocess.Popen(['git', 'update-ref', refname,
 896                           newval.encode('hex'), oldval.encode('hex')],
 897                          preexec_fn = _gitenv(repo_dir))
 898     _git_wait('git update-ref', p)
 899
 900
 901 def delete_ref(refname):
 902     """Delete a repository reference."""
 903     assert(refname.startswith('refs/'))
 904     p = subprocess.Popen(['git', 'update-ref', '-d', refname],
 905                          preexec_fn = _gitenv())
 906     _git_wait('git update-ref', p)
 907
 908
 909 def guess_repo(path=None):
 910     """Set the path value in the global variable "repodir".
 911     This makes bup look for an existing bup repository, but not fail if a
 912     repository doesn't exist. Usually, if you are interacting with a bup
 913     repository, you would not be calling this function but using
 914     check_repo_or_die().
 915     """
 916     global repodir
 917     if path:
 918         repodir = path
 919     if not repodir:
 920         repodir = os.environ.get('BUP_DIR')
 921         if not repodir:
 922             repodir = os.path.expanduser('~/.bup')
 923
 924
 925 def init_repo(path=None):
 926     """Create the Git bare repository for bup in a given path."""
 927     guess_repo(path)
 928     d = repo()  # appends a / to the path
 929     parent = os.path.dirname(os.path.dirname(d))
 930     if parent and not os.path.exists(parent):
 931         raise GitError('parent directory "%s" does not exist\n' % parent)
 932     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 933         raise GitError('"%s" exists but is not a directory\n' % d)
 934     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 935                          preexec_fn = _gitenv())
 936     _git_wait('git init', p)
 937     # Force the index version configuration in order to ensure bup works
 938     # regardless of the version of the installed Git binary.
 939     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 940                          stdout=sys.stderr, preexec_fn = _gitenv())
 941     _git_wait('git config', p)
 942     # Enable the reflog
 943     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 944                          stdout=sys.stderr, preexec_fn = _gitenv())
 945     _git_wait('git config', p)
 946
 947
 948 def check_repo_or_die(path=None):
 949     """Make sure a bup repository exists, and abort if not.
 950     If the path to a particular repository was not specified, this function
 951     initializes the default repository automatically.
 952     """
 953     guess_repo(path)
 954     try:
 955         os.stat(repo('objects/pack/.'))
 956     except OSError, e:
 957         if e.errno == errno.ENOENT:
 958             log('error: %r is not a bup repository; run "bup init"\n'
 959                 % repo())
 960             sys.exit(15)
 961         else:
 962             log('error: %s\n' % e)
 963             sys.exit(14)
 964
 965
 966 _ver = None
 967 def ver():
 968     """Get Git's version and ensure a usable version is installed.
 969
 970     The returned version is formatted as an ordered tuple with each position
 971     representing a digit in the version tag. For example, the following tuple
 972     would represent version 1.6.6.9:
 973
 974         ('1', '6', '6', '9')
 975     """
 976     global _ver
 977     if not _ver:
 978         p = subprocess.Popen(['git', '--version'],
 979                              stdout=subprocess.PIPE)
 980         gvs = p.stdout.read()
 981         _git_wait('git --version', p)
 982         m = re.match(r'git version (\S+.\S+)', gvs)
 983         if not m:
 984             raise GitError('git --version weird output: %r' % gvs)
 985         _ver = tuple(m.group(1).split('.'))
 986     needed = ('1','5', '3', '1')
 987     if _ver < needed:
 988         raise GitError('git version %s or higher is required; you have %s'
 989                        % ('.'.join(needed), '.'.join(_ver)))
 990     return _ver
 991
 992
 993 def _git_wait(cmd, p):
 994     rv = p.wait()
 995     if rv != 0:
 996         raise GitError('%s returned %d' % (cmd, rv))
 997
 998
 999 def _git_capture(argv):
1000     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
1001     r = p.stdout.read()
1002     _git_wait(repr(argv), p)
1003     return r
1004
1005
class _AbortableIter:
    """Iterator wrapper that runs a callback when iteration is cut short.

    Items come from 'it' through its next() method.  Normal exhaustion
    marks the wrapper as done; any other exception from the wrapped
    iterator, an explicit abort(), or deletion of a wrapper that is not
    done calls 'onabort' (when one was given), at most once.
    """
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None  # becomes True after exhaustion or abort

    def __iter__(self):
        return self

    def next(self):
        try:
            item = self.it.next()
        except StopIteration:
            # Clean end of data: no abort callback is wanted any more.
            self.done = True
            raise
        except:
            # Anything else counts as an abort; the error is re-raised.
            self.abort()
            raise
        return item

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if self.done:
            return
        self.done = True
        if self.onabort:
            self.onabort()

    def __del__(self):
        self.abort()
1034
1035
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    The constructor binds self.get to one of two generator methods: with
    git 1.5.6 or newer a single long-lived 'git cat-file --batch' process
    is reused (_fast_get); with older versions git is spawned for every
    object (_slow_get).  Either way get(id) first yields the object type
    and then the object's content in chunks.
    """
    def __init__(self, repo_dir = None):
        global _ver_warned
        self.repo_dir = repo_dir
        wanted = ('1','5','6')
        # ver() returns a tuple of strings, so this compares component
        # strings rather than numbers.
        if ver() < wanted:
            if not _ver_warned:
                # Warn only once per process (module-level flag).
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            self.p = self.inprogress = None
            self.get = self._fast_get

    def _abort(self):
        """Drop the batch process (if any) and forget the pending request."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        """Replace the batch process with a fresh 'git cat-file --batch'."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        """Yield the type, then the content chunks, of object 'id'.

        The request is sent to the shared batch process, so only one
        object may be in flight at a time; starting a second one before
        the first has been read to the end trips an assertion.  Raises
        KeyError if git reports the object as missing and GitError if the
        reply header is malformed.
        """
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        # The id is written verbatim as one request line, so it must not
        # contain line breaks or look like an option.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            # The reply is not a '<sha> <type> <size>' header, so the
            # stream cannot be trusted any more.  Drop the process; this
            # also clears inprogress so the next get() can start over
            # instead of failing its assertion.
            self._abort()
            raise GitError('expected blob, got %r' % spl)
        objtype = spl[1]
        size = int(spl[2])

        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                           onabort = self._abort)
        try:
            yield objtype
            for blob in it:
                yield blob
            # The code expects a single newline after the object's data.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type, then the content chunks, of object 'id'.

        Fallback that starts 'git cat-file' twice per object: once for
        the type and once for the content.  Both commands run against
        self.repo_dir.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        # Query the type in this pipe's own repository; _git_capture()
        # would always use the default one.
        argv = ['git', 'cat-file', '-t', id]
        p = subprocess.Popen(argv,
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        objtype = p.stdout.read().strip()
        _git_wait(repr(argv), p)
        yield objtype

        p = subprocess.Popen(['git', 'cat-file', objtype, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield blob content reachable from the object streamed by 'it'.

        'it' is an iterator as returned by get(): its first item is the
        object type and the rest is the object's content.  Blobs are
        passed through, trees are decoded and each entry is joined in
        turn, and commits are followed to their tree.
        """
        objtype = it.next()
        if objtype == 'blob':
            for blob in it:
                yield blob
        elif objtype == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif objtype == 'commit':
            # Only the first line of the commit is needed: 'tree <hex>'.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % objtype)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1151
1152
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for a repository, creating it on first use.

    Pipes are cached in the module-level _cp dictionary under the
    absolute path of repo_dir, or of repo() when repo_dir is empty.
    """
    key = os.path.abspath(repo_dir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1166
1167
def tags(repo_dir = None):
    """Map each tagged hash to the list of tag names that point at it.

    The result has the form {hash: [tag_name, ...]}, with the
    'refs/tags/' prefix stripped from every name.
    """
    prefix = 'refs/tags/'
    by_hash = {}
    for refname, sha in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(refname.startswith('refs/tags/'))
        # Several tags may point at the same hash, hence a list per hash.
        by_hash.setdefault(sha, []).append(refname[len(prefix):])
    return by_hash