lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   6 from collections import namedtuple
   7 from itertools import islice
   8
   9 from bup.helpers import *
  10 from bup import _helpers, path, midx, bloom, xstat
  11
# Limits checked by PackWriter._write(): once either is reached, the current
# pack is finished and a new one is started.
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object

verbose = 0  # when true, PackWriter._write() logs '>' for each object written
ignore_midx = 0  # when true, PackIdxList.refresh() always behaves as skip_midx
repodir = None  # default repository path used by repo(); must be set before use

# Git object type name -> numeric type code used in pack object headers,
# and the reverse mapping.
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Module-wide counters updated by the index lookup code in this file.
_total_searches = 0
_total_steps = 0
  24
  25
class GitError(Exception):
    """Exception raised by this module for repository and git failures."""
    pass
  28
  29
  30 def parse_tz_offset(s):
  31     """UTC offset in seconds."""
  32     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  33     if s[0] == '-':
  34         return - tz_off
  35     return tz_off
  36
  37
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Character allowed at the start or end of a name/mail string.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
# Character allowed in the interior of a name/mail string.
_content_char = r'[^\0\n<>]'
# A name/mail string: either one or two start/end characters, or a
# start/end character, any number of content characters, and a final
# start/end character.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone: a sign, two hour digits and two minute digits (tens digit 0-5).
_tz_rx = r'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line.
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Whole commit object: tree line, zero or more parent lines, author line,
# committer line, a blank line, then the message (everything that remains).
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Extracts each parent hash from the text captured by the 'parents' group.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  55
  56
# Parsed form of a commit object, as produced by parse_commit().
# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
# author_offset and committer_offset are the values parse_tz_offset() returns
# for the timezone fields; tree and the entries of parents (a list) are the
# hex strings captured from the commit text.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
  64
  65 def parse_commit(content):
  66     commit_match = re.match(_commit_rx, content)
  67     if not commit_match:
  68         raise Exception('cannot parse commit %r' % content)
  69     matches = commit_match.groupdict()
  70     return CommitInfo(tree=matches['tree'],
  71                       parents=re.findall(_parent_hash_rx, matches['parents']),
  72                       author_name=matches['author_name'],
  73                       author_mail=matches['author_mail'],
  74                       author_sec=int(matches['asec']),
  75                       author_offset=parse_tz_offset(matches['atz']),
  76                       committer_name=matches['committer_name'],
  77                       committer_mail=matches['committer_mail'],
  78                       committer_sec=int(matches['csec']),
  79                       committer_offset=parse_tz_offset(matches['ctz']),
  80                       message=matches['message'])
  81
  82
  83 def get_commit_items(id, cp):
  84     commit_it = cp.get(id)
  85     assert(commit_it.next() == 'commit')
  86     commit_content = ''.join(commit_it)
  87     return parse_commit(commit_content)
  88
  89
  90 def repo(sub = '', repo_dir=None):
  91     """Get the path to the git repository or one of its subdirectories."""
  92     global repodir
  93     repo_dir = repo_dir or repodir
  94     if not repo_dir:
  95         raise GitError('You should call check_repo_or_die()')
  96
  97     # If there's a .git subdirectory, then the actual repo is in there.
  98     gd = os.path.join(repo_dir, '.git')
  99     if os.path.exists(gd):
 100         repodir = gd
 101
 102     return os.path.join(repo_dir, sub)
 103
 104
 105 def shorten_hash(s):
 106     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 107                   r'\1\2*\3', s)
 108
 109
 110 def repo_rel(path):
 111     full = os.path.abspath(path)
 112     fullrepo = os.path.abspath(repo(''))
 113     if not fullrepo.endswith('/'):
 114         fullrepo += '/'
 115     if full.startswith(fullrepo):
 116         path = full[len(fullrepo):]
 117     if path.startswith('index-cache/'):
 118         path = path[len('index-cache/'):]
 119     return shorten_hash(path)
 120
 121
 122 def all_packdirs():
 123     paths = [repo('objects/pack')]
 124     paths += glob.glob(repo('index-cache/*/.'))
 125     return paths
 126
 127
 128 def auto_midx(objdir):
 129     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 130     try:
 131         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 132     except OSError, e:
 133         # make sure 'args' gets printed to help with debugging
 134         add_error('%r: exception: %s' % (args, e))
 135         raise
 136     if rv:
 137         add_error('%r: returned %d' % (args, rv))
 138
 139     args = [path.exe(), 'bloom', '--dir', objdir]
 140     try:
 141         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 142     except OSError, e:
 143         # make sure 'args' gets printed to help with debugging
 144         add_error('%r: exception: %s' % (args, e))
 145         raise
 146     if rv:
 147         add_error('%r: returned %d' % (args, rv))
 148
 149
 150 def mangle_name(name, mode, gitmode):
 151     """Mangle a file name to present an abstract name for segmented files.
 152     Mangled file names will have the ".bup" extension added to them. If a
 153     file's name already ends with ".bup", a ".bupl" extension is added to
 154     disambiguate normal files from segmented ones.
 155     """
 156     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 157         assert(stat.S_ISDIR(gitmode))
 158         return name + '.bup'
 159     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 160         return name + '.bupl'
 161     else:
 162         return name
 163
 164
 165 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 166 def demangle_name(name):
 167     """Remove name mangling from a file name, if necessary.
 168
 169     The return value is a tuple (demangled_filename,mode), where mode is one of
 170     the following:
 171
 172     * BUP_NORMAL  : files that should be read as-is from the repository
 173     * BUP_CHUNKED : files that were chunked and need to be reassembled
 174
 175     For more information on the name mangling algorithm, see mangle_name()
 176     """
 177     if name.endswith('.bupl'):
 178         return (name[:-5], BUP_NORMAL)
 179     elif name.endswith('.bup'):
 180         return (name[:-4], BUP_CHUNKED)
 181     else:
 182         return (name, BUP_NORMAL)
 183
 184
 185 def calc_hash(type, content):
 186     """Calculate some content's hash in the Git fashion."""
 187     header = '%s %d\0' % (type, len(content))
 188     sum = Sha1(header)
 189     sum.update(content)
 190     return sum.digest()
 191
 192
 193 def shalist_item_sort_key(ent):
 194     (mode, name, id) = ent
 195     assert(mode+0 == mode)
 196     if stat.S_ISDIR(mode):
 197         return name + '/'
 198     else:
 199         return name
 200
 201
 202 def tree_encode(shalist):
 203     """Generate a git tree object from (mode,name,hash) tuples."""
 204     shalist = sorted(shalist, key = shalist_item_sort_key)
 205     l = []
 206     for (mode,name,bin) in shalist:
 207         assert(mode)
 208         assert(mode+0 == mode)
 209         assert(name)
 210         assert(len(bin) == 20)
 211         s = '%o %s\0%s' % (mode,name,bin)
 212         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 213         l.append(s)
 214     return ''.join(l)
 215
 216
 217 def tree_decode(buf):
 218     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 219     ofs = 0
 220     while ofs < len(buf):
 221         z = buf.find('\0', ofs)
 222         assert(z > ofs)
 223         spl = buf[ofs:z].split(' ', 1)
 224         assert(len(spl) == 2)
 225         mode,name = spl
 226         sha = buf[z+1:z+1+20]
 227         ofs = z+1+20
 228         yield (int(mode, 8), name, sha)
 229
 230
 231 def _encode_packobj(type, content, compression_level=1):
 232     szout = ''
 233     sz = len(content)
 234     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 235     sz >>= 4
 236     while 1:
 237         if sz: szbits |= 0x80
 238         szout += chr(szbits)
 239         if not sz:
 240             break
 241         szbits = sz & 0x7f
 242         sz >>= 7
 243     if compression_level > 9:
 244         compression_level = 9
 245     elif compression_level < 0:
 246         compression_level = 0
 247     z = zlib.compressobj(compression_level)
 248     yield szout
 249     yield z.compress(content)
 250     yield z.flush()
 251
 252
 253 def _encode_looseobj(type, content, compression_level=1):
 254     z = zlib.compressobj(compression_level)
 255     yield z.compress('%s %d\0' % (type, len(content)))
 256     yield z.compress(content)
 257     yield z.flush()
 258
 259
 260 def _decode_looseobj(buf):
 261     assert(buf);
 262     s = zlib.decompress(buf)
 263     i = s.find('\0')
 264     assert(i > 0)
 265     l = s[:i].split(' ')
 266     type = l[0]
 267     sz = int(l[1])
 268     content = s[i+1:]
 269     assert(type in _typemap)
 270     assert(sz == len(content))
 271     return (type, content)
 272
 273
 274 def _decode_packobj(buf):
 275     assert(buf)
 276     c = ord(buf[0])
 277     type = _typermap[(c & 0x70) >> 4]
 278     sz = c & 0x0f
 279     shift = 4
 280     i = 0
 281     while c & 0x80:
 282         i += 1
 283         c = ord(buf[i])
 284         sz |= (c & 0x7f) << shift
 285         shift += 7
 286         if not (c & 0x80):
 287             break
 288     return (type, zlib.decompress(buf[i+1:]))
 289
 290
 291 class PackIdx:
 292     def __init__(self):
 293         assert(0)
 294
 295     def find_offset(self, hash):
 296         """Get the offset of an object inside the index file."""
 297         idx = self._idx_from_hash(hash)
 298         if idx != None:
 299             return self._ofs_from_idx(idx)
 300         return None
 301
 302     def exists(self, hash, want_source=False):
 303         """Return nonempty if the object exists in this index."""
 304         if hash and (self._idx_from_hash(hash) != None):
 305             return want_source and os.path.basename(self.name) or True
 306         return None
 307
 308     def __len__(self):
 309         return int(self.fanout[255])
 310
 311     def _idx_from_hash(self, hash):
 312         global _total_searches, _total_steps
 313         _total_searches += 1
 314         assert(len(hash) == 20)
 315         b1 = ord(hash[0])
 316         start = self.fanout[b1-1] # range -1..254
 317         end = self.fanout[b1] # range 0..255
 318         want = str(hash)
 319         _total_steps += 1  # lookup table is a step
 320         while start < end:
 321             _total_steps += 1
 322             mid = start + (end-start)/2
 323             v = self._idx_to_hash(mid)
 324             if v < want:
 325                 start = mid+1
 326             elif v > want:
 327                 end = mid
 328             else: # got it!
 329                 return mid
 330         return None
 331
 332
 333 class PackIdxV1(PackIdx):
 334     """Object representation of a Git pack index (version 1) file."""
 335     def __init__(self, filename, f):
 336         self.name = filename
 337         self.idxnames = [self.name]
 338         self.map = mmap_read(f)
 339         self.fanout = list(struct.unpack('!256I',
 340                                          str(buffer(self.map, 0, 256*4))))
 341         self.fanout.append(0)  # entry "-1"
 342         nsha = self.fanout[255]
 343         self.sha_ofs = 256*4
 344         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 345
 346     def _ofs_from_idx(self, idx):
 347         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 348
 349     def _idx_to_hash(self, idx):
 350         return str(self.shatable[idx*24+4 : idx*24+24])
 351
 352     def __iter__(self):
 353         for i in xrange(self.fanout[255]):
 354             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 355
 356
 357 class PackIdxV2(PackIdx):
 358     """Object representation of a Git pack index (version 2) file."""
 359     def __init__(self, filename, f):
 360         self.name = filename
 361         self.idxnames = [self.name]
 362         self.map = mmap_read(f)
 363         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 364         self.fanout = list(struct.unpack('!256I',
 365                                          str(buffer(self.map, 8, 256*4))))
 366         self.fanout.append(0)  # entry "-1"
 367         nsha = self.fanout[255]
 368         self.sha_ofs = 8 + 256*4
 369         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 370         self.ofstable = buffer(self.map,
 371                                self.sha_ofs + nsha*20 + nsha*4,
 372                                nsha*4)
 373         self.ofs64table = buffer(self.map,
 374                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 375
 376     def _ofs_from_idx(self, idx):
 377         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 378         if ofs & 0x80000000:
 379             idx64 = ofs & 0x7fffffff
 380             ofs = struct.unpack('!Q',
 381                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 382         return ofs
 383
 384     def _idx_to_hash(self, idx):
 385         return str(self.shatable[idx*20:(idx+1)*20])
 386
 387     def __iter__(self):
 388         for i in xrange(self.fanout[255]):
 389             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 390
 391
 392 _mpi_count = 0
 393 class PackIdxList:
 394     def __init__(self, dir):
 395         global _mpi_count
 396         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 397         _mpi_count += 1
 398         self.dir = dir
 399         self.also = set()
 400         self.packs = []
 401         self.do_bloom = False
 402         self.bloom = None
 403         self.refresh()
 404
 405     def __del__(self):
 406         global _mpi_count
 407         _mpi_count -= 1
 408         assert(_mpi_count == 0)
 409
 410     def __iter__(self):
 411         return iter(idxmerge(self.packs))
 412
 413     def __len__(self):
 414         return sum(len(pack) for pack in self.packs)
 415
 416     def exists(self, hash, want_source=False):
 417         """Return nonempty if the object exists in the index files."""
 418         global _total_searches
 419         _total_searches += 1
 420         if hash in self.also:
 421             return True
 422         if self.do_bloom and self.bloom:
 423             if self.bloom.exists(hash):
 424                 self.do_bloom = False
 425             else:
 426                 _total_searches -= 1  # was counted by bloom
 427                 return None
 428         for i in xrange(len(self.packs)):
 429             p = self.packs[i]
 430             _total_searches -= 1  # will be incremented by sub-pack
 431             ix = p.exists(hash, want_source=want_source)
 432             if ix:
 433                 # reorder so most recently used packs are searched first
 434                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 435                 return ix
 436         self.do_bloom = True
 437         return None
 438
 439     def refresh(self, skip_midx = False):
 440         """Refresh the index list.
 441         This method verifies if .midx files were superseded (e.g. all of its
 442         contents are in another, bigger .midx file) and removes the superseded
 443         files.
 444
 445         If skip_midx is True, all work on .midx files will be skipped and .midx
 446         files will be removed from the list.
 447
 448         The module-global variable 'ignore_midx' can force this function to
 449         always act as if skip_midx was True.
 450         """
 451         self.bloom = None # Always reopen the bloom as it may have been relaced
 452         self.do_bloom = False
 453         skip_midx = skip_midx or ignore_midx
 454         d = dict((p.name, p) for p in self.packs
 455                  if not skip_midx or not isinstance(p, midx.PackMidx))
 456         if os.path.exists(self.dir):
 457             if not skip_midx:
 458                 midxl = []
 459                 for ix in self.packs:
 460                     if isinstance(ix, midx.PackMidx):
 461                         for name in ix.idxnames:
 462                             d[os.path.join(self.dir, name)] = ix
 463                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 464                     if not d.get(full):
 465                         mx = midx.PackMidx(full)
 466                         (mxd, mxf) = os.path.split(mx.name)
 467                         broken = False
 468                         for n in mx.idxnames:
 469                             if not os.path.exists(os.path.join(mxd, n)):
 470                                 log(('warning: index %s missing\n' +
 471                                     '  used by %s\n') % (n, mxf))
 472                                 broken = True
 473                         if broken:
 474                             mx.close()
 475                             del mx
 476                             unlink(full)
 477                         else:
 478                             midxl.append(mx)
 479                 midxl.sort(key=lambda ix:
 480                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 481                 for ix in midxl:
 482                     any_needed = False
 483                     for sub in ix.idxnames:
 484                         found = d.get(os.path.join(self.dir, sub))
 485                         if not found or isinstance(found, PackIdx):
 486                             # doesn't exist, or exists but not in a midx
 487                             any_needed = True
 488                             break
 489                     if any_needed:
 490                         d[ix.name] = ix
 491                         for name in ix.idxnames:
 492                             d[os.path.join(self.dir, name)] = ix
 493                     elif not ix.force_keep:
 494                         debug1('midx: removing redundant: %s\n'
 495                                % os.path.basename(ix.name))
 496                         ix.close()
 497                         unlink(ix.name)
 498             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 499                 if not d.get(full):
 500                     try:
 501                         ix = open_idx(full)
 502                     except GitError, e:
 503                         add_error(e)
 504                         continue
 505                     d[full] = ix
 506             bfull = os.path.join(self.dir, 'bup.bloom')
 507             if self.bloom is None and os.path.exists(bfull):
 508                 self.bloom = bloom.ShaBloom(bfull)
 509             self.packs = list(set(d.values()))
 510             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 511             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 512                 self.do_bloom = True
 513             else:
 514                 self.bloom = None
 515         debug1('PackIdxList: using %d index%s.\n'
 516             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 517
 518     def add(self, hash):
 519         """Insert an additional object in the list."""
 520         self.also.add(hash)
 521
 522
 523 def open_idx(filename):
 524     if filename.endswith('.idx'):
 525         f = open(filename, 'rb')
 526         header = f.read(8)
 527         if header[0:4] == '\377tOc':
 528             version = struct.unpack('!I', header[4:8])[0]
 529             if version == 2:
 530                 return PackIdxV2(filename, f)
 531             else:
 532                 raise GitError('%s: expected idx file version 2, got %d'
 533                                % (filename, version))
 534         elif len(header) == 8 and header[0:4] < '\377tOc':
 535             return PackIdxV1(filename, f)
 536         else:
 537             raise GitError('%s: unrecognized idx file header' % filename)
 538     elif filename.endswith('.midx'):
 539         return midx.PackMidx(filename)
 540     else:
 541         raise GitError('idx filenames must end with .idx or .midx')
 542
 543
 544 def idxmerge(idxlist, final_progress=True):
 545     """Generate a list of all the objects reachable in a PackIdxList."""
 546     def pfunc(count, total):
 547         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 548                   % (count*100.0/total, count, total))
 549     def pfinal(count, total):
 550         if final_progress:
 551             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 552                      % (100, total, total))
 553     return merge_iter(idxlist, 10024, pfunc, pfinal)
 554
 555
 556 def _make_objcache():
 557     return PackIdxList(repo('objects/pack'))
 558
 559 class PackWriter:
 560     """Writes Git objects inside a pack file."""
 561     def __init__(self, objcache_maker=_make_objcache, compression_level=1):
 562         self.count = 0
 563         self.outbytes = 0
 564         self.filename = None
 565         self.file = None
 566         self.idx = None
 567         self.objcache_maker = objcache_maker
 568         self.objcache = None
 569         self.compression_level = compression_level
 570
 571     def __del__(self):
 572         self.close()
 573
 574     def _open(self):
 575         if not self.file:
 576             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 577             self.file = os.fdopen(fd, 'w+b')
 578             assert(name.endswith('.pack'))
 579             self.filename = name[:-5]
 580             self.file.write('PACK\0\0\0\2\0\0\0\0')
 581             self.idx = list(list() for i in xrange(256))
 582
 583     def _raw_write(self, datalist, sha):
 584         self._open()
 585         f = self.file
 586         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 587         # the file never has a *partial* blob.  So let's make sure it's
 588         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 589         # to our hashsplit algorithm.)  f.write() does its own buffering,
 590         # but that's okay because we'll flush it in _end().
 591         oneblob = ''.join(datalist)
 592         try:
 593             f.write(oneblob)
 594         except IOError, e:
 595             raise GitError, e, sys.exc_info()[2]
 596         nw = len(oneblob)
 597         crc = zlib.crc32(oneblob) & 0xffffffff
 598         self._update_idx(sha, crc, nw)
 599         self.outbytes += nw
 600         self.count += 1
 601         return nw, crc
 602
 603     def _update_idx(self, sha, crc, size):
 604         assert(sha)
 605         if self.idx:
 606             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 607
 608     def _write(self, sha, type, content):
 609         if verbose:
 610             log('>')
 611         if not sha:
 612             sha = calc_hash(type, content)
 613         size, crc = self._raw_write(_encode_packobj(type, content,
 614                                                     self.compression_level),
 615                                     sha=sha)
 616         if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
 617             self.breakpoint()
 618         return sha
 619
 620     def breakpoint(self):
 621         """Clear byte and object counts and return the last processed id."""
 622         id = self._end()
 623         self.outbytes = self.count = 0
 624         return id
 625
 626     def _require_objcache(self):
 627         if self.objcache is None and self.objcache_maker:
 628             self.objcache = self.objcache_maker()
 629         if self.objcache is None:
 630             raise GitError(
 631                     "PackWriter not opened or can't check exists w/o objcache")
 632
 633     def exists(self, id, want_source=False):
 634         """Return non-empty if an object is found in the object cache."""
 635         self._require_objcache()
 636         return self.objcache.exists(id, want_source=want_source)
 637
 638     def maybe_write(self, type, content):
 639         """Write an object to the pack file if not present and return its id."""
 640         sha = calc_hash(type, content)
 641         if not self.exists(sha):
 642             self._write(sha, type, content)
 643             self._require_objcache()
 644             self.objcache.add(sha)
 645         return sha
 646
 647     def new_blob(self, blob):
 648         """Create a blob object in the pack with the supplied content."""
 649         return self.maybe_write('blob', blob)
 650
 651     def new_tree(self, shalist):
 652         """Create a tree object in the pack."""
 653         content = tree_encode(shalist)
 654         return self.maybe_write('tree', content)
 655
 656     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 657         l = []
 658         if tree: l.append('tree %s' % tree.encode('hex'))
 659         if parent: l.append('parent %s' % parent.encode('hex'))
 660         if author: l.append('author %s %s' % (author, _git_date(adate)))
 661         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 662         l.append('')
 663         l.append(msg)
 664         return self.maybe_write('commit', '\n'.join(l))
 665
 666     def new_commit(self, parent, tree, date, msg):
 667         """Create a commit object in the pack."""
 668         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 669         commit = self._new_commit(tree, parent,
 670                                   userline, date, userline, date,
 671                                   msg)
 672         return commit
 673
 674     def abort(self):
 675         """Remove the pack file from disk."""
 676         f = self.file
 677         if f:
 678             self.idx = None
 679             self.file = None
 680             f.close()
 681             os.unlink(self.filename + '.pack')
 682
 683     def _end(self, run_midx=True):
 684         f = self.file
 685         if not f: return None
 686         self.file = None
 687         self.objcache = None
 688         idx = self.idx
 689         self.idx = None
 690
 691         # update object count
 692         f.seek(8)
 693         cp = struct.pack('!i', self.count)
 694         assert(len(cp) == 4)
 695         f.write(cp)
 696
 697         # calculate the pack sha1sum
 698         f.seek(0)
 699         sum = Sha1()
 700         for b in chunkyreader(f):
 701             sum.update(b)
 702         packbin = sum.digest()
 703         f.write(packbin)
 704         f.close()
 705
 706         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 707
 708         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 709         if os.path.exists(self.filename + '.map'):
 710             os.unlink(self.filename + '.map')
 711         os.rename(self.filename + '.pack', nameprefix + '.pack')
 712         os.rename(self.filename + '.idx', nameprefix + '.idx')
 713
 714         if run_midx:
 715             auto_midx(repo('objects/pack'))
 716         return nameprefix
 717
 718     def close(self, run_midx=True):
 719         """Close the pack file and move it to its definitive path."""
 720         return self._end(run_midx=run_midx)
 721
 722     def _write_pack_idx_v2(self, filename, idx, packbin):
 723         ofs64_count = 0
 724         for section in idx:
 725             for entry in section:
 726                 if entry[2] >= 2**31:
 727                     ofs64_count += 1
 728
 729         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 730         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 731         idx_map = None
 732         idx_f = open(filename, 'w+b')
 733         try:
 734             idx_f.truncate(index_len)
 735             idx_map = mmap_readwrite(idx_f, close=False)
 736             count = _helpers.write_idx(filename, idx_map, idx, self.count)
 737             assert(count == self.count)
 738         finally:
 739             if idx_map: idx_map.close()
 740             idx_f.close()
 741
 742         idx_f = open(filename, 'a+b')
 743         try:
 744             idx_f.write(packbin)
 745             idx_f.seek(0)
 746             idx_sum = Sha1()
 747             b = idx_f.read(8 + 4*256)
 748             idx_sum.update(b)
 749
 750             obj_list_sum = Sha1()
 751             for b in chunkyreader(idx_f, 20*self.count):
 752                 idx_sum.update(b)
 753                 obj_list_sum.update(b)
 754             namebase = obj_list_sum.hexdigest()
 755
 756             for b in chunkyreader(idx_f):
 757                 idx_sum.update(b)
 758             idx_f.write(idx_sum.digest())
 759             return namebase
 760         finally:
 761             idx_f.close()
 762
 763
 764 def _git_date(date):
 765     return '%d %s' % (date, utc_offset_str(date))
 766
 767
 768 def _gitenv(repo_dir = None):
 769     if not repo_dir:
 770         repo_dir = repo()
 771     def env():
 772         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 773     return env
 774
 775
 776 def list_refs(refname=None, repo_dir=None,
 777               limit_to_heads=False, limit_to_tags=False):
 778     """Yield (refname, hash) tuples for all repository refs unless a ref
 779     name is specified.  Given a ref name, only include tuples for that
 780     particular ref.  The limits restrict the result items to
 781     refs/heads or refs/tags.  If both limits are specified, items from
 782     both sources will be included.
 783
 784     """
 785     argv = ['git', 'show-ref']
 786     if limit_to_heads:
 787         argv.append('--heads')
 788     if limit_to_tags:
 789         argv.append('--tags')
 790     argv.append('--')
 791     if refname:
 792         argv += [refname]
 793     p = subprocess.Popen(argv,
 794                          preexec_fn = _gitenv(repo_dir),
 795                          stdout = subprocess.PIPE)
 796     out = p.stdout.read().strip()
 797     rv = p.wait()  # not fatal
 798     if rv:
 799         assert(not out)
 800     if out:
 801         for d in out.split('\n'):
 802             (sha, name) = d.split(' ', 1)
 803             yield (name, sha.decode('hex'))
 804
 805
 806 def read_ref(refname, repo_dir = None):
 807     """Get the commit id of the most recent commit made on a given ref."""
 808     refs = list_refs(refname, repo_dir=repo_dir, limit_to_heads=True)
 809     l = tuple(islice(refs, 2))
 810     if l:
 811         assert(len(l) == 1)
 812         return l[0][1]
 813     else:
 814         return None
 815
 816
 817 def rev_list(ref, count=None, repo_dir=None):
 818     """Generate a list of reachable commits in reverse chronological order.
 819
 820     This generator walks through commits, from child to parent, that are
 821     reachable via the specified ref and yields a series of tuples of the form
 822     (date,hash).
 823
 824     If count is a non-zero integer, limit the number of commits to "count"
 825     objects.
 826     """
 827     assert(not ref.startswith('-'))
 828     opts = []
 829     if count:
 830         opts += ['-n', str(atoi(count))]
 831     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 832     p = subprocess.Popen(argv,
 833                          preexec_fn = _gitenv(repo_dir),
 834                          stdout = subprocess.PIPE)
 835     commit = None
 836     for row in p.stdout:
 837         s = row.strip()
 838         if s.startswith('commit '):
 839             commit = s[7:].decode('hex')
 840         else:
 841             date = int(s)
 842             yield (date, commit)
 843     rv = p.wait()  # not fatal
 844     if rv:
 845         raise GitError, 'git rev-list returned error %d' % rv
 846
 847
 848 def get_commit_dates(refs, repo_dir=None):
 849     """Get the dates for the specified commit refs.  For now, every unique
 850        string in refs must resolve to a different commit or this
 851        function will fail."""
 852     result = []
 853     for ref in refs:
 854         commit = get_commit_items(ref, cp(repo_dir))
 855         result.append(commit.author_sec)
 856     return result
 857
 858
 859 def rev_parse(committish, repo_dir=None):
 860     """Resolve the full hash for 'committish', if it exists.
 861
 862     Should be roughly equivalent to 'git rev-parse'.
 863
 864     Returns the hex value of the hash if it is found, None if 'committish' does
 865     not correspond to anything.
 866     """
 867     head = read_ref(committish, repo_dir=repo_dir)
 868     if head:
 869         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 870         return head
 871
 872     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
 873
 874     if len(committish) == 40:
 875         try:
 876             hash = committish.decode('hex')
 877         except TypeError:
 878             return None
 879
 880         if pL.exists(hash):
 881             return hash
 882
 883     return None
 884
 885
 886 def update_ref(refname, newval, oldval, repo_dir=None):
 887     """Update a repository reference."""
 888     if not oldval:
 889         oldval = ''
 890     assert(refname.startswith('refs/heads/') \
 891            or refname.startswith('refs/tags/'))
 892     p = subprocess.Popen(['git', 'update-ref', refname,
 893                           newval.encode('hex'), oldval.encode('hex')],
 894                          preexec_fn = _gitenv(repo_dir))
 895     _git_wait('git update-ref', p)
 896
 897
 898 def delete_ref(refname):
 899     """Delete a repository reference."""
 900     assert(refname.startswith('refs/'))
 901     p = subprocess.Popen(['git', 'update-ref', '-d', refname],
 902                          preexec_fn = _gitenv())
 903     _git_wait('git update-ref', p)
 904
 905
 906 def guess_repo(path=None):
 907     """Set the path value in the global variable "repodir".
 908     This makes bup look for an existing bup repository, but not fail if a
 909     repository doesn't exist. Usually, if you are interacting with a bup
 910     repository, you would not be calling this function but using
 911     check_repo_or_die().
 912     """
 913     global repodir
 914     if path:
 915         repodir = path
 916     if not repodir:
 917         repodir = os.environ.get('BUP_DIR')
 918         if not repodir:
 919             repodir = os.path.expanduser('~/.bup')
 920
 921
 922 def init_repo(path=None):
 923     """Create the Git bare repository for bup in a given path."""
 924     guess_repo(path)
 925     d = repo()  # appends a / to the path
 926     parent = os.path.dirname(os.path.dirname(d))
 927     if parent and not os.path.exists(parent):
 928         raise GitError('parent directory "%s" does not exist\n' % parent)
 929     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 930         raise GitError('"%s" exists but is not a directory\n' % d)
 931     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 932                          preexec_fn = _gitenv())
 933     _git_wait('git init', p)
 934     # Force the index version configuration in order to ensure bup works
 935     # regardless of the version of the installed Git binary.
 936     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 937                          stdout=sys.stderr, preexec_fn = _gitenv())
 938     _git_wait('git config', p)
 939     # Enable the reflog
 940     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 941                          stdout=sys.stderr, preexec_fn = _gitenv())
 942     _git_wait('git config', p)
 943
 944
 945 def check_repo_or_die(path=None):
 946     """Make sure a bup repository exists, and abort if not.
 947     If the path to a particular repository was not specified, this function
 948     initializes the default repository automatically.
 949     """
 950     guess_repo(path)
 951     try:
 952         os.stat(repo('objects/pack/.'))
 953     except OSError, e:
 954         if e.errno == errno.ENOENT:
 955             log('error: %r is not a bup repository; run "bup init"\n'
 956                 % repo())
 957             sys.exit(15)
 958         else:
 959             log('error: %s\n' % e)
 960             sys.exit(14)
 961
 962
 963 _ver = None
 964 def ver():
 965     """Get Git's version and ensure a usable version is installed.
 966
 967     The returned version is formatted as an ordered tuple with each position
 968     representing a digit in the version tag. For example, the following tuple
 969     would represent version 1.6.6.9:
 970
 971         ('1', '6', '6', '9')
 972     """
 973     global _ver
 974     if not _ver:
 975         p = subprocess.Popen(['git', '--version'],
 976                              stdout=subprocess.PIPE)
 977         gvs = p.stdout.read()
 978         _git_wait('git --version', p)
 979         m = re.match(r'git version (\S+.\S+)', gvs)
 980         if not m:
 981             raise GitError('git --version weird output: %r' % gvs)
 982         _ver = tuple(m.group(1).split('.'))
 983     needed = ('1','5', '3', '1')
 984     if _ver < needed:
 985         raise GitError('git version %s or higher is required; you have %s'
 986                        % ('.'.join(needed), '.'.join(_ver)))
 987     return _ver
 988
 989
 990 def _git_wait(cmd, p):
 991     rv = p.wait()
 992     if rv != 0:
 993         raise GitError('%s returned %d' % (cmd, rv))
 994
 995
 996 def _git_capture(argv):
 997     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
 998     r = p.stdout.read()
 999     _git_wait(repr(argv), p)
1000     return r
1001
1002
1003 class _AbortableIter:
1004     def __init__(self, it, onabort = None):
1005         self.it = it
1006         self.onabort = onabort
1007         self.done = None
1008
1009     def __iter__(self):
1010         return self
1011
1012     def next(self):
1013         try:
1014             return self.it.next()
1015         except StopIteration, e:
1016             self.done = True
1017             raise
1018         except:
1019             self.abort()
1020             raise
1021
1022     def abort(self):
1023         """Abort iteration and call the abortion callback, if needed."""
1024         if not self.done:
1025             self.done = True
1026             if self.onabort:
1027                 self.onabort()
1028
1029     def __del__(self):
1030         self.abort()
1031
1032
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    Depending on the installed git version, self.get is bound either to
    _fast_get (one long-running 'git cat-file --batch' process) or to
    _slow_get (separate 'git cat-file' processes for every object).
    """
    def __init__(self, repo_dir = None):
        global _ver_warned
        self.repo_dir = repo_dir
        wanted = ('1','5','6')
        # NOTE: ver() returns a tuple of strings, so this comparison is
        # made component by component on strings.
        if ver() < wanted:
            if not _ver_warned:
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            # self.p: the running cat-file --batch process, if any.
            # self.inprogress: id of the object currently being read.
            self.p = self.inprogress = None
            self.get = self._fast_get

    def _abort(self):
        """Close the pipes of the batch process and forget about it."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        """Drop any existing batch process and start a fresh one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        """Yield the type of object 'id', then its content in chunks.

        Uses the 'git cat-file --batch' process, (re)starting it when
        needed.  Raises KeyError if git reports the object as missing and
        GitError if the header line is not of the expected form.  Only
        one object may be read at a time (asserted).
        """
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        # The id is sent as one line on stdin, so it must not contain
        # line breaks or look like an option.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        # Expected header: three space-separated fields, the first of
        # which is a 40-character id.
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        objtype = spl[1]
        size = int(spl[2])

        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                           onabort = self._abort)
        try:
            yield objtype
            for blob in it:
                yield blob
            # The content is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type of object 'id', then its content in chunks.

        Runs 'git cat-file -t' and then 'git cat-file <type>' as separate
        processes, both against self.repo_dir.  Raises GitError via
        _git_wait() if either command fails.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        # Query the type in self.repo_dir, the same repository the
        # content is read from below.
        argv = ['git', 'cat-file', '-t', id]
        p = subprocess.Popen(argv,
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        objtype = p.stdout.read().strip()
        _git_wait(repr(argv), p)
        yield objtype

        p = subprocess.Popen(['git', 'cat-file', objtype, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield blob data reachable from the object stream 'it'.

        'it' yields the object type first and the object content after
        that, as produced by self.get().  Trees and commits are followed
        recursively through self.join().
        """
        objtype = next(it)
        if objtype == 'blob':
            for blob in it:
                yield blob
        elif objtype == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif objtype == 'commit':
            # The first line of a commit is expected to be 'tree <id>'.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % objtype)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1148
1149
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating it on first use.

    Pipes are cached in the module-level _cp dictionary, keyed by the
    absolute repository path.  With no repo_dir, the path returned by
    repo() is used.
    """
    key = os.path.abspath(repo_dir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1163
1164
def tags(repo_dir = None):
    """Return a dictionary mapping each tagged hash to its tag names.

    The result has the form {hash: [tag_name, ...]}, built from the refs
    that list_refs() yields with limit_to_tags=True; the leading
    'refs/tags/' is stripped from every name.
    """
    by_hash = {}
    for refname, sha in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(refname.startswith('refs/tags/'))
        # Several tags may point at the same hash.
        by_hash.setdefault(sha, []).append(refname[10:])
    return by_hash