lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   7 from collections import namedtuple
   8 from itertools import islice
   9
  10 from bup import _helpers, path, midx, bloom, xstat
  11 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  12                          fdatasync,
  13                          hostname, log, merge_iter, mmap_read, mmap_readwrite,
  14                          progress, qprogress, unlink, username, userfullname,
  15                          utc_offset_str)
  16
  17
  18 max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
  19 max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
  20
  21 verbose = 0
  22 ignore_midx = 0
  23 repodir = None
  24
  25 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  26 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  27
  28 _total_searches = 0
  29 _total_steps = 0
  30
  31
  32 class GitError(Exception):
  33     pass
  34
  35
  36 def parse_tz_offset(s):
  37     """UTC offset in seconds."""
  38     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  39     if s[0] == '-':
  40         return - tz_off
  41     return tz_off
  42
  43
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# A name or mail string may not begin or end with space, '.', ',', ':', ';',
# '<', '>', '"', "'", NUL or newline; the characters in between may be
# anything except NUL, newline, '<' and '>'.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
_content_char = r'[^\0\n<>]'
# Either one or two start/end characters, or a longer run that is bracketed
# by a start/end character on each side.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Sign, two hour digits, then two minute digits whose tens digit is 0-5.
_tz_rx = r'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line, newline included.
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# A whole commit object: the tree line, zero or more parent lines, the author
# and committer lines (name, <mail>, epoch seconds, zone), a blank line, and
# then the message, which may span multiple lines.
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Pulls the 40-digit hash out of each parent line; parse_commit() applies it
# to the text captured by the 'parents' group of _commit_rx.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  61
  62
  63 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
  64 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
  65                                        'author_name', 'author_mail',
  66                                        'author_sec', 'author_offset',
  67                                        'committer_name', 'committer_mail',
  68                                        'committer_sec', 'committer_offset',
  69                                        'message'])
  70
  71 def parse_commit(content):
  72     commit_match = re.match(_commit_rx, content)
  73     if not commit_match:
  74         raise Exception('cannot parse commit %r' % content)
  75     matches = commit_match.groupdict()
  76     return CommitInfo(tree=matches['tree'],
  77                       parents=re.findall(_parent_hash_rx, matches['parents']),
  78                       author_name=matches['author_name'],
  79                       author_mail=matches['author_mail'],
  80                       author_sec=int(matches['asec']),
  81                       author_offset=parse_tz_offset(matches['atz']),
  82                       committer_name=matches['committer_name'],
  83                       committer_mail=matches['committer_mail'],
  84                       committer_sec=int(matches['csec']),
  85                       committer_offset=parse_tz_offset(matches['ctz']),
  86                       message=matches['message'])
  87
  88
  89 def get_commit_items(id, cp):
  90     commit_it = cp.get(id)
  91     assert(commit_it.next() == 'commit')
  92     commit_content = ''.join(commit_it)
  93     return parse_commit(commit_content)
  94
  95
  96 def repo(sub = '', repo_dir=None):
  97     """Get the path to the git repository or one of its subdirectories."""
  98     global repodir
  99     repo_dir = repo_dir or repodir
 100     if not repo_dir:
 101         raise GitError('You should call check_repo_or_die()')
 102
 103     # If there's a .git subdirectory, then the actual repo is in there.
 104     gd = os.path.join(repo_dir, '.git')
 105     if os.path.exists(gd):
 106         repodir = gd
 107
 108     return os.path.join(repo_dir, sub)
 109
 110
 111 def shorten_hash(s):
 112     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 113                   r'\1\2*\3', s)
 114
 115
 116 def repo_rel(path):
 117     full = os.path.abspath(path)
 118     fullrepo = os.path.abspath(repo(''))
 119     if not fullrepo.endswith('/'):
 120         fullrepo += '/'
 121     if full.startswith(fullrepo):
 122         path = full[len(fullrepo):]
 123     if path.startswith('index-cache/'):
 124         path = path[len('index-cache/'):]
 125     return shorten_hash(path)
 126
 127
 128 def all_packdirs():
 129     paths = [repo('objects/pack')]
 130     paths += glob.glob(repo('index-cache/*/.'))
 131     return paths
 132
 133
 134 def auto_midx(objdir):
 135     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 136     try:
 137         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 138     except OSError as e:
 139         # make sure 'args' gets printed to help with debugging
 140         add_error('%r: exception: %s' % (args, e))
 141         raise
 142     if rv:
 143         add_error('%r: returned %d' % (args, rv))
 144
 145     args = [path.exe(), 'bloom', '--dir', objdir]
 146     try:
 147         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 148     except OSError as e:
 149         # make sure 'args' gets printed to help with debugging
 150         add_error('%r: exception: %s' % (args, e))
 151         raise
 152     if rv:
 153         add_error('%r: returned %d' % (args, rv))
 154
 155
 156 def mangle_name(name, mode, gitmode):
 157     """Mangle a file name to present an abstract name for segmented files.
 158     Mangled file names will have the ".bup" extension added to them. If a
 159     file's name already ends with ".bup", a ".bupl" extension is added to
 160     disambiguate normal files from segmented ones.
 161     """
 162     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 163         assert(stat.S_ISDIR(gitmode))
 164         return name + '.bup'
 165     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 166         return name + '.bupl'
 167     else:
 168         return name
 169
 170
 171 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 172 def demangle_name(name, mode):
 173     """Remove name mangling from a file name, if necessary.
 174
 175     The return value is a tuple (demangled_filename,mode), where mode is one of
 176     the following:
 177
 178     * BUP_NORMAL  : files that should be read as-is from the repository
 179     * BUP_CHUNKED : files that were chunked and need to be reassembled
 180
 181     For more information on the name mangling algorithm, see mangle_name()
 182     """
 183     if name.endswith('.bupl'):
 184         return (name[:-5], BUP_NORMAL)
 185     elif name.endswith('.bup'):
 186         return (name[:-4], BUP_CHUNKED)
 187     elif name.endswith('.bupm'):
 188         return (name[:-5],
 189                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 190     else:
 191         return (name, BUP_NORMAL)
 192
 193
 194 def calc_hash(type, content):
 195     """Calculate some content's hash in the Git fashion."""
 196     header = '%s %d\0' % (type, len(content))
 197     sum = Sha1(header)
 198     sum.update(content)
 199     return sum.digest()
 200
 201
 202 def shalist_item_sort_key(ent):
 203     (mode, name, id) = ent
 204     assert(mode+0 == mode)
 205     if stat.S_ISDIR(mode):
 206         return name + '/'
 207     else:
 208         return name
 209
 210
 211 def tree_encode(shalist):
 212     """Generate a git tree object from (mode,name,hash) tuples."""
 213     shalist = sorted(shalist, key = shalist_item_sort_key)
 214     l = []
 215     for (mode,name,bin) in shalist:
 216         assert(mode)
 217         assert(mode+0 == mode)
 218         assert(name)
 219         assert(len(bin) == 20)
 220         s = '%o %s\0%s' % (mode,name,bin)
 221         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 222         l.append(s)
 223     return ''.join(l)
 224
 225
 226 def tree_decode(buf):
 227     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 228     ofs = 0
 229     while ofs < len(buf):
 230         z = buf.find('\0', ofs)
 231         assert(z > ofs)
 232         spl = buf[ofs:z].split(' ', 1)
 233         assert(len(spl) == 2)
 234         mode,name = spl
 235         sha = buf[z+1:z+1+20]
 236         ofs = z+1+20
 237         yield (int(mode, 8), name, sha)
 238
 239
 240 def _encode_packobj(type, content, compression_level=1):
 241     szout = ''
 242     sz = len(content)
 243     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 244     sz >>= 4
 245     while 1:
 246         if sz: szbits |= 0x80
 247         szout += chr(szbits)
 248         if not sz:
 249             break
 250         szbits = sz & 0x7f
 251         sz >>= 7
 252     if compression_level > 9:
 253         compression_level = 9
 254     elif compression_level < 0:
 255         compression_level = 0
 256     z = zlib.compressobj(compression_level)
 257     yield szout
 258     yield z.compress(content)
 259     yield z.flush()
 260
 261
 262 def _encode_looseobj(type, content, compression_level=1):
 263     z = zlib.compressobj(compression_level)
 264     yield z.compress('%s %d\0' % (type, len(content)))
 265     yield z.compress(content)
 266     yield z.flush()
 267
 268
 269 def _decode_looseobj(buf):
 270     assert(buf);
 271     s = zlib.decompress(buf)
 272     i = s.find('\0')
 273     assert(i > 0)
 274     l = s[:i].split(' ')
 275     type = l[0]
 276     sz = int(l[1])
 277     content = s[i+1:]
 278     assert(type in _typemap)
 279     assert(sz == len(content))
 280     return (type, content)
 281
 282
 283 def _decode_packobj(buf):
 284     assert(buf)
 285     c = ord(buf[0])
 286     type = _typermap[(c & 0x70) >> 4]
 287     sz = c & 0x0f
 288     shift = 4
 289     i = 0
 290     while c & 0x80:
 291         i += 1
 292         c = ord(buf[i])
 293         sz |= (c & 0x7f) << shift
 294         shift += 7
 295         if not (c & 0x80):
 296             break
 297     return (type, zlib.decompress(buf[i+1:]))
 298
 299
 300 class PackIdx:
 301     def __init__(self):
 302         assert(0)
 303
 304     def find_offset(self, hash):
 305         """Get the offset of an object inside the index file."""
 306         idx = self._idx_from_hash(hash)
 307         if idx != None:
 308             return self._ofs_from_idx(idx)
 309         return None
 310
 311     def exists(self, hash, want_source=False):
 312         """Return nonempty if the object exists in this index."""
 313         if hash and (self._idx_from_hash(hash) != None):
 314             return want_source and os.path.basename(self.name) or True
 315         return None
 316
 317     def __len__(self):
 318         return int(self.fanout[255])
 319
 320     def _idx_from_hash(self, hash):
 321         global _total_searches, _total_steps
 322         _total_searches += 1
 323         assert(len(hash) == 20)
 324         b1 = ord(hash[0])
 325         start = self.fanout[b1-1] # range -1..254
 326         end = self.fanout[b1] # range 0..255
 327         want = str(hash)
 328         _total_steps += 1  # lookup table is a step
 329         while start < end:
 330             _total_steps += 1
 331             mid = start + (end-start)/2
 332             v = self._idx_to_hash(mid)
 333             if v < want:
 334                 start = mid+1
 335             elif v > want:
 336                 end = mid
 337             else: # got it!
 338                 return mid
 339         return None
 340
 341
 342 class PackIdxV1(PackIdx):
 343     """Object representation of a Git pack index (version 1) file."""
 344     def __init__(self, filename, f):
 345         self.name = filename
 346         self.idxnames = [self.name]
 347         self.map = mmap_read(f)
 348         self.fanout = list(struct.unpack('!256I',
 349                                          str(buffer(self.map, 0, 256*4))))
 350         self.fanout.append(0)  # entry "-1"
 351         nsha = self.fanout[255]
 352         self.sha_ofs = 256*4
 353         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 354
 355     def _ofs_from_idx(self, idx):
 356         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 357
 358     def _idx_to_hash(self, idx):
 359         return str(self.shatable[idx*24+4 : idx*24+24])
 360
 361     def __iter__(self):
 362         for i in xrange(self.fanout[255]):
 363             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 364
 365
 366 class PackIdxV2(PackIdx):
 367     """Object representation of a Git pack index (version 2) file."""
 368     def __init__(self, filename, f):
 369         self.name = filename
 370         self.idxnames = [self.name]
 371         self.map = mmap_read(f)
 372         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 373         self.fanout = list(struct.unpack('!256I',
 374                                          str(buffer(self.map, 8, 256*4))))
 375         self.fanout.append(0)  # entry "-1"
 376         nsha = self.fanout[255]
 377         self.sha_ofs = 8 + 256*4
 378         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 379         self.ofstable = buffer(self.map,
 380                                self.sha_ofs + nsha*20 + nsha*4,
 381                                nsha*4)
 382         self.ofs64table = buffer(self.map,
 383                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 384
 385     def _ofs_from_idx(self, idx):
 386         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 387         if ofs & 0x80000000:
 388             idx64 = ofs & 0x7fffffff
 389             ofs = struct.unpack('!Q',
 390                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 391         return ofs
 392
 393     def _idx_to_hash(self, idx):
 394         return str(self.shatable[idx*20:(idx+1)*20])
 395
 396     def __iter__(self):
 397         for i in xrange(self.fanout[255]):
 398             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 399
 400
 401 _mpi_count = 0
 402 class PackIdxList:
 403     def __init__(self, dir):
 404         global _mpi_count
 405         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 406         _mpi_count += 1
 407         self.dir = dir
 408         self.also = set()
 409         self.packs = []
 410         self.do_bloom = False
 411         self.bloom = None
 412         self.refresh()
 413
 414     def __del__(self):
 415         global _mpi_count
 416         _mpi_count -= 1
 417         assert(_mpi_count == 0)
 418
 419     def __iter__(self):
 420         return iter(idxmerge(self.packs))
 421
 422     def __len__(self):
 423         return sum(len(pack) for pack in self.packs)
 424
 425     def exists(self, hash, want_source=False):
 426         """Return nonempty if the object exists in the index files."""
 427         global _total_searches
 428         _total_searches += 1
 429         if hash in self.also:
 430             return True
 431         if self.do_bloom and self.bloom:
 432             if self.bloom.exists(hash):
 433                 self.do_bloom = False
 434             else:
 435                 _total_searches -= 1  # was counted by bloom
 436                 return None
 437         for i in xrange(len(self.packs)):
 438             p = self.packs[i]
 439             _total_searches -= 1  # will be incremented by sub-pack
 440             ix = p.exists(hash, want_source=want_source)
 441             if ix:
 442                 # reorder so most recently used packs are searched first
 443                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 444                 return ix
 445         self.do_bloom = True
 446         return None
 447
 448     def refresh(self, skip_midx = False):
 449         """Refresh the index list.
 450         This method verifies if .midx files were superseded (e.g. all of its
 451         contents are in another, bigger .midx file) and removes the superseded
 452         files.
 453
 454         If skip_midx is True, all work on .midx files will be skipped and .midx
 455         files will be removed from the list.
 456
 457         The module-global variable 'ignore_midx' can force this function to
 458         always act as if skip_midx was True.
 459         """
 460         self.bloom = None # Always reopen the bloom as it may have been relaced
 461         self.do_bloom = False
 462         skip_midx = skip_midx or ignore_midx
 463         d = dict((p.name, p) for p in self.packs
 464                  if not skip_midx or not isinstance(p, midx.PackMidx))
 465         if os.path.exists(self.dir):
 466             if not skip_midx:
 467                 midxl = []
 468                 for ix in self.packs:
 469                     if isinstance(ix, midx.PackMidx):
 470                         for name in ix.idxnames:
 471                             d[os.path.join(self.dir, name)] = ix
 472                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 473                     if not d.get(full):
 474                         mx = midx.PackMidx(full)
 475                         (mxd, mxf) = os.path.split(mx.name)
 476                         broken = False
 477                         for n in mx.idxnames:
 478                             if not os.path.exists(os.path.join(mxd, n)):
 479                                 log(('warning: index %s missing\n' +
 480                                     '  used by %s\n') % (n, mxf))
 481                                 broken = True
 482                         if broken:
 483                             mx.close()
 484                             del mx
 485                             unlink(full)
 486                         else:
 487                             midxl.append(mx)
 488                 midxl.sort(key=lambda ix:
 489                            (-len(ix), -xstat.stat(ix.name).st_mtime))
 490                 for ix in midxl:
 491                     any_needed = False
 492                     for sub in ix.idxnames:
 493                         found = d.get(os.path.join(self.dir, sub))
 494                         if not found or isinstance(found, PackIdx):
 495                             # doesn't exist, or exists but not in a midx
 496                             any_needed = True
 497                             break
 498                     if any_needed:
 499                         d[ix.name] = ix
 500                         for name in ix.idxnames:
 501                             d[os.path.join(self.dir, name)] = ix
 502                     elif not ix.force_keep:
 503                         debug1('midx: removing redundant: %s\n'
 504                                % os.path.basename(ix.name))
 505                         ix.close()
 506                         unlink(ix.name)
 507             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 508                 if not d.get(full):
 509                     try:
 510                         ix = open_idx(full)
 511                     except GitError as e:
 512                         add_error(e)
 513                         continue
 514                     d[full] = ix
 515             bfull = os.path.join(self.dir, 'bup.bloom')
 516             if self.bloom is None and os.path.exists(bfull):
 517                 self.bloom = bloom.ShaBloom(bfull)
 518             self.packs = list(set(d.values()))
 519             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 520             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 521                 self.do_bloom = True
 522             else:
 523                 self.bloom = None
 524         debug1('PackIdxList: using %d index%s.\n'
 525             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 526
 527     def add(self, hash):
 528         """Insert an additional object in the list."""
 529         self.also.add(hash)
 530
 531
 532 def open_idx(filename):
 533     if filename.endswith('.idx'):
 534         f = open(filename, 'rb')
 535         header = f.read(8)
 536         if header[0:4] == '\377tOc':
 537             version = struct.unpack('!I', header[4:8])[0]
 538             if version == 2:
 539                 return PackIdxV2(filename, f)
 540             else:
 541                 raise GitError('%s: expected idx file version 2, got %d'
 542                                % (filename, version))
 543         elif len(header) == 8 and header[0:4] < '\377tOc':
 544             return PackIdxV1(filename, f)
 545         else:
 546             raise GitError('%s: unrecognized idx file header' % filename)
 547     elif filename.endswith('.midx'):
 548         return midx.PackMidx(filename)
 549     else:
 550         raise GitError('idx filenames must end with .idx or .midx')
 551
 552
 553 def idxmerge(idxlist, final_progress=True):
 554     """Generate a list of all the objects reachable in a PackIdxList."""
 555     def pfunc(count, total):
 556         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 557                   % (count*100.0/total, count, total))
 558     def pfinal(count, total):
 559         if final_progress:
 560             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 561                      % (100, total, total))
 562     return merge_iter(idxlist, 10024, pfunc, pfinal)
 563
 564
 565 def _make_objcache():
 566     return PackIdxList(repo('objects/pack'))
 567
 568 class PackWriter:
 569     """Writes Git objects inside a pack file."""
 570     def __init__(self, objcache_maker=_make_objcache, compression_level=1):
 571         self.file = None
 572         self.parentfd = None
 573         self.count = 0
 574         self.outbytes = 0
 575         self.filename = None
 576         self.idx = None
 577         self.objcache_maker = objcache_maker
 578         self.objcache = None
 579         self.compression_level = compression_level
 580
 581     def __del__(self):
 582         self.close()
 583
 584     def _open(self):
 585         if not self.file:
 586             objdir = dir=repo('objects')
 587             fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
 588             try:
 589                 self.file = os.fdopen(fd, 'w+b')
 590             except:
 591                 os.close(fd)
 592                 raise
 593             try:
 594                 self.parentfd = os.open(objdir, os.O_RDONLY)
 595             except:
 596                 f = self.file
 597                 self.file = None
 598                 f.close()
 599                 raise
 600             assert(name.endswith('.pack'))
 601             self.filename = name[:-5]
 602             self.file.write('PACK\0\0\0\2\0\0\0\0')
 603             self.idx = list(list() for i in xrange(256))
 604
 605     def _raw_write(self, datalist, sha):
 606         self._open()
 607         f = self.file
 608         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 609         # the file never has a *partial* blob.  So let's make sure it's
 610         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 611         # to our hashsplit algorithm.)  f.write() does its own buffering,
 612         # but that's okay because we'll flush it in _end().
 613         oneblob = ''.join(datalist)
 614         try:
 615             f.write(oneblob)
 616         except IOError as e:
 617             raise GitError, e, sys.exc_info()[2]
 618         nw = len(oneblob)
 619         crc = zlib.crc32(oneblob) & 0xffffffff
 620         self._update_idx(sha, crc, nw)
 621         self.outbytes += nw
 622         self.count += 1
 623         return nw, crc
 624
 625     def _update_idx(self, sha, crc, size):
 626         assert(sha)
 627         if self.idx:
 628             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 629
 630     def _write(self, sha, type, content):
 631         if verbose:
 632             log('>')
 633         if not sha:
 634             sha = calc_hash(type, content)
 635         size, crc = self._raw_write(_encode_packobj(type, content,
 636                                                     self.compression_level),
 637                                     sha=sha)
 638         if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
 639             self.breakpoint()
 640         return sha
 641
 642     def breakpoint(self):
 643         """Clear byte and object counts and return the last processed id."""
 644         id = self._end()
 645         self.outbytes = self.count = 0
 646         return id
 647
 648     def _require_objcache(self):
 649         if self.objcache is None and self.objcache_maker:
 650             self.objcache = self.objcache_maker()
 651         if self.objcache is None:
 652             raise GitError(
 653                     "PackWriter not opened or can't check exists w/o objcache")
 654
 655     def exists(self, id, want_source=False):
 656         """Return non-empty if an object is found in the object cache."""
 657         self._require_objcache()
 658         return self.objcache.exists(id, want_source=want_source)
 659
 660     def maybe_write(self, type, content):
 661         """Write an object to the pack file if not present and return its id."""
 662         sha = calc_hash(type, content)
 663         if not self.exists(sha):
 664             self._write(sha, type, content)
 665             self._require_objcache()
 666             self.objcache.add(sha)
 667         return sha
 668
 669     def new_blob(self, blob):
 670         """Create a blob object in the pack with the supplied content."""
 671         return self.maybe_write('blob', blob)
 672
 673     def new_tree(self, shalist):
 674         """Create a tree object in the pack."""
 675         content = tree_encode(shalist)
 676         return self.maybe_write('tree', content)
 677
 678     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 679         l = []
 680         if tree: l.append('tree %s' % tree.encode('hex'))
 681         if parent: l.append('parent %s' % parent.encode('hex'))
 682         if author: l.append('author %s %s' % (author, _git_date(adate)))
 683         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 684         l.append('')
 685         l.append(msg)
 686         return self.maybe_write('commit', '\n'.join(l))
 687
 688     def new_commit(self, parent, tree, date, msg):
 689         """Create a commit object in the pack."""
 690         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 691         commit = self._new_commit(tree, parent,
 692                                   userline, date, userline, date,
 693                                   msg)
 694         return commit
 695
 696     def abort(self):
 697         """Remove the pack file from disk."""
 698         f = self.file
 699         if f:
 700             pfd = self.parentfd
 701             self.file = None
 702             self.parentfd = None
 703             self.idx = None
 704             try:
 705                 try:
 706                     os.unlink(self.filename + '.pack')
 707                 finally:
 708                     f.close()
 709             finally:
 710                 if pfd is not None:
 711                     os.close(pfd)
 712
 713     def _end(self, run_midx=True):
 714         f = self.file
 715         if not f: return None
 716         self.file = None
 717         try:
 718             self.objcache = None
 719             idx = self.idx
 720             self.idx = None
 721
 722             # update object count
 723             f.seek(8)
 724             cp = struct.pack('!i', self.count)
 725             assert(len(cp) == 4)
 726             f.write(cp)
 727
 728             # calculate the pack sha1sum
 729             f.seek(0)
 730             sum = Sha1()
 731             for b in chunkyreader(f):
 732                 sum.update(b)
 733             packbin = sum.digest()
 734             f.write(packbin)
 735             fdatasync(f.fileno())
 736         finally:
 737             f.close()
 738
 739         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 740
 741         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 742         if os.path.exists(self.filename + '.map'):
 743             os.unlink(self.filename + '.map')
 744         os.rename(self.filename + '.pack', nameprefix + '.pack')
 745         os.rename(self.filename + '.idx', nameprefix + '.idx')
 746         try:
 747             os.fsync(self.parentfd)
 748         finally:
 749             os.close(self.parentfd)
 750
 751         if run_midx:
 752             auto_midx(repo('objects/pack'))
 753         return nameprefix
 754
 755     def close(self, run_midx=True):
 756         """Close the pack file and move it to its definitive path."""
 757         return self._end(run_midx=run_midx)
 758
 759     def _write_pack_idx_v2(self, filename, idx, packbin):
 760         ofs64_count = 0
 761         for section in idx:
 762             for entry in section:
 763                 if entry[2] >= 2**31:
 764                     ofs64_count += 1
 765
 766         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 767         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 768         idx_map = None
 769         idx_f = open(filename, 'w+b')
 770         try:
 771             idx_f.truncate(index_len)
 772             fdatasync(idx_f.fileno())
 773             idx_map = mmap_readwrite(idx_f, close=False)
 774             try:
 775                 count = _helpers.write_idx(filename, idx_map, idx, self.count)
 776                 assert(count == self.count)
 777                 idx_map.flush()
 778             finally:
 779                 idx_map.close()
 780         finally:
 781             idx_f.close()
 782
 783         idx_f = open(filename, 'a+b')
 784         try:
 785             idx_f.write(packbin)
 786             idx_f.seek(0)
 787             idx_sum = Sha1()
 788             b = idx_f.read(8 + 4*256)
 789             idx_sum.update(b)
 790
 791             obj_list_sum = Sha1()
 792             for b in chunkyreader(idx_f, 20*self.count):
 793                 idx_sum.update(b)
 794                 obj_list_sum.update(b)
 795             namebase = obj_list_sum.hexdigest()
 796
 797             for b in chunkyreader(idx_f):
 798                 idx_sum.update(b)
 799             idx_f.write(idx_sum.digest())
 800             fdatasync(idx_f.fileno())
 801             return namebase
 802         finally:
 803             idx_f.close()
 804
 805
 806 def _git_date(date):
 807     return '%d %s' % (date, utc_offset_str(date))
 808
 809
 810 def _gitenv(repo_dir = None):
 811     if not repo_dir:
 812         repo_dir = repo()
 813     def env():
 814         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 815     return env
 816
 817
 818 def list_refs(refname=None, repo_dir=None,
 819               limit_to_heads=False, limit_to_tags=False):
 820     """Yield (refname, hash) tuples for all repository refs unless a ref
 821     name is specified.  Given a ref name, only include tuples for that
 822     particular ref.  The limits restrict the result items to
 823     refs/heads or refs/tags.  If both limits are specified, items from
 824     both sources will be included.
 825
 826     """
 827     argv = ['git', 'show-ref']
 828     if limit_to_heads:
 829         argv.append('--heads')
 830     if limit_to_tags:
 831         argv.append('--tags')
 832     argv.append('--')
 833     if refname:
 834         argv += [refname]
 835     p = subprocess.Popen(argv,
 836                          preexec_fn = _gitenv(repo_dir),
 837                          stdout = subprocess.PIPE)
 838     out = p.stdout.read().strip()
 839     rv = p.wait()  # not fatal
 840     if rv:
 841         assert(not out)
 842     if out:
 843         for d in out.split('\n'):
 844             (sha, name) = d.split(' ', 1)
 845             yield (name, sha.decode('hex'))
 846
 847
 848 def read_ref(refname, repo_dir = None):
 849     """Get the commit id of the most recent commit made on a given ref."""
 850     refs = list_refs(refname, repo_dir=repo_dir, limit_to_heads=True)
 851     l = tuple(islice(refs, 2))
 852     if l:
 853         assert(len(l) == 1)
 854         return l[0][1]
 855     else:
 856         return None
 857
 858
 859 def rev_list(ref, count=None, repo_dir=None):
 860     """Generate a list of reachable commits in reverse chronological order.
 861
 862     This generator walks through commits, from child to parent, that are
 863     reachable via the specified ref and yields a series of tuples of the form
 864     (date,hash).
 865
 866     If count is a non-zero integer, limit the number of commits to "count"
 867     objects.
 868     """
 869     assert(not ref.startswith('-'))
 870     opts = []
 871     if count:
 872         opts += ['-n', str(atoi(count))]
 873     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 874     p = subprocess.Popen(argv,
 875                          preexec_fn = _gitenv(repo_dir),
 876                          stdout = subprocess.PIPE)
 877     commit = None
 878     for row in p.stdout:
 879         s = row.strip()
 880         if s.startswith('commit '):
 881             commit = s[7:].decode('hex')
 882         else:
 883             date = int(s)
 884             yield (date, commit)
 885     rv = p.wait()  # not fatal
 886     if rv:
 887         raise GitError, 'git rev-list returned error %d' % rv
 888
 889
 890 def get_commit_dates(refs, repo_dir=None):
 891     """Get the dates for the specified commit refs.  For now, every unique
 892        string in refs must resolve to a different commit or this
 893        function will fail."""
 894     result = []
 895     for ref in refs:
 896         commit = get_commit_items(ref, cp(repo_dir))
 897         result.append(commit.author_sec)
 898     return result
 899
 900
 901 def rev_parse(committish, repo_dir=None):
 902     """Resolve the full hash for 'committish', if it exists.
 903
 904     Should be roughly equivalent to 'git rev-parse'.
 905
 906     Returns the hex value of the hash if it is found, None if 'committish' does
 907     not correspond to anything.
 908     """
 909     head = read_ref(committish, repo_dir=repo_dir)
 910     if head:
 911         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 912         return head
 913
 914     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
 915
 916     if len(committish) == 40:
 917         try:
 918             hash = committish.decode('hex')
 919         except TypeError:
 920             return None
 921
 922         if pL.exists(hash):
 923             return hash
 924
 925     return None
 926
 927
 928 def update_ref(refname, newval, oldval, repo_dir=None):
 929     """Update a repository reference."""
 930     if not oldval:
 931         oldval = ''
 932     assert(refname.startswith('refs/heads/') \
 933            or refname.startswith('refs/tags/'))
 934     p = subprocess.Popen(['git', 'update-ref', refname,
 935                           newval.encode('hex'), oldval.encode('hex')],
 936                          preexec_fn = _gitenv(repo_dir))
 937     _git_wait('git update-ref', p)
 938
 939
 940 def delete_ref(refname):
 941     """Delete a repository reference."""
 942     assert(refname.startswith('refs/'))
 943     p = subprocess.Popen(['git', 'update-ref', '-d', refname],
 944                          preexec_fn = _gitenv())
 945     _git_wait('git update-ref', p)
 946
 947
 948 def guess_repo(path=None):
 949     """Set the path value in the global variable "repodir".
 950     This makes bup look for an existing bup repository, but not fail if a
 951     repository doesn't exist. Usually, if you are interacting with a bup
 952     repository, you would not be calling this function but using
 953     check_repo_or_die().
 954     """
 955     global repodir
 956     if path:
 957         repodir = path
 958     if not repodir:
 959         repodir = os.environ.get('BUP_DIR')
 960         if not repodir:
 961             repodir = os.path.expanduser('~/.bup')
 962
 963
 964 def init_repo(path=None):
 965     """Create the Git bare repository for bup in a given path."""
 966     guess_repo(path)
 967     d = repo()  # appends a / to the path
 968     parent = os.path.dirname(os.path.dirname(d))
 969     if parent and not os.path.exists(parent):
 970         raise GitError('parent directory "%s" does not exist\n' % parent)
 971     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 972         raise GitError('"%s" exists but is not a directory\n' % d)
 973     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 974                          preexec_fn = _gitenv())
 975     _git_wait('git init', p)
 976     # Force the index version configuration in order to ensure bup works
 977     # regardless of the version of the installed Git binary.
 978     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 979                          stdout=sys.stderr, preexec_fn = _gitenv())
 980     _git_wait('git config', p)
 981     # Enable the reflog
 982     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 983                          stdout=sys.stderr, preexec_fn = _gitenv())
 984     _git_wait('git config', p)
 985
 986
 987 def check_repo_or_die(path=None):
 988     """Make sure a bup repository exists, and abort if not.
 989     If the path to a particular repository was not specified, this function
 990     initializes the default repository automatically.
 991     """
 992     guess_repo(path)
 993     try:
 994         os.stat(repo('objects/pack/.'))
 995     except OSError as e:
 996         if e.errno == errno.ENOENT:
 997             log('error: %r is not a bup repository; run "bup init"\n'
 998                 % repo())
 999             sys.exit(15)
1000         else:
1001             log('error: %s\n' % e)
1002             sys.exit(14)
1003
1004
_ver = None


def _ver_numeric(parts):
    """Return the leading numeric components of a version tuple as ints.

    Each component contributes its leading run of digits; conversion
    stops at the first component that does not start with a digit, so
    ('2', '39', '2', 'windows', '1') becomes (2, 39, 2).
    """
    nums = []
    for part in parts:
        digits = re.match(r'\d+', part)
        if not digits:
            break
        nums.append(int(digits.group(0)))
    return tuple(nums)


def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple of strings, with
    each position holding one dot-separated component of the version tag.
    For example, the following tuple would represent version 1.6.6.9:

        ('1', '6', '6', '9')

    The result of 'git --version' is cached in the module-level _ver, so
    git is only run on the first call.

    Raises GitError if the output of 'git --version' cannot be parsed, or
    if the installed version is older than 1.5.3.1.
    """
    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        # The dot is escaped so that it cannot match a space and drag
        # trailing text such as "(Apple Git-81)" into the version.
        m = re.match(r'git version (\S+\.\S+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    # Compare numerically: comparing the string tuples directly would
    # order '10' before '5'.
    if _ver_numeric(_ver) < _ver_numeric(needed):
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver
1030
1031
1032 def _git_wait(cmd, p):
1033     rv = p.wait()
1034     if rv != 0:
1035         raise GitError('%s returned %d' % (cmd, rv))
1036
1037
def _git_capture(argv):
    """Run 'argv' and return everything it wrote to stdout.

    The command runs with the environment callback from _gitenv()
    (no explicit repository).  A non-zero exit raises GitError via
    _git_wait(), labelled with repr(argv).
    """
    proc = subprocess.Popen(argv, preexec_fn = _gitenv(),
                            stdout=subprocess.PIPE)
    captured = proc.stdout.read()
    _git_wait(repr(argv), proc)
    return captured
1043
1044
1045 class _AbortableIter:
1046     def __init__(self, it, onabort = None):
1047         self.it = it
1048         self.onabort = onabort
1049         self.done = None
1050
1051     def __iter__(self):
1052         return self
1053
1054     def next(self):
1055         try:
1056             return self.it.next()
1057         except StopIteration as e:
1058             self.done = True
1059             raise
1060         except:
1061             self.abort()
1062             raise
1063
1064     def abort(self):
1065         """Abort iteration and call the abortion callback, if needed."""
1066         if not self.done:
1067             self.done = True
1068             if self.onabort:
1069                 self.onabort()
1070
1071     def __del__(self):
1072         self.abort()
1073
1074
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    After construction, self.get(id) is a generator function that
    yields the object's type name first and then the object's data in
    chunks.  With git >= 1.5.6 it is backed by one long-running
    'git cat-file --batch' process (_fast_get); with older versions a
    new git process is spawned per request (_slow_get) and a warning
    is logged once per process.
    """
    def __init__(self, repo_dir = None):
        global _ver_warned
        self.repo_dir = repo_dir
        wanted = ('1','5','6')
        # NOTE(review): this compares tuples of strings, so components
        # are ordered lexicographically rather than numerically.
        if ver() < wanted:
            if not _ver_warned:
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            self.p = self.inprogress = None
            self.get = self._fast_get

    def _abort(self):
        """Drop the batch process, closing its pipes if it exists."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        """Discard any current batch process and start a fresh one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        """Yield the type and then the data chunks of object 'id'.

        The request is sent to the 'git cat-file --batch' process,
        which is (re)started if it is absent or has exited.  Only one
        request may be in progress at a time (asserted).

        Raises KeyError if git reports the object as missing, and
        GitError if the reply header is not of the expected
        "<40-char id> <type> <size>" form.
        """
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        # The id is written verbatim as one line of input, so it must
        # not contain line breaks or look like an option.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (_hexsha, objtype, size) = spl

        # If reading the payload fails part-way, the pipe is in an
        # unknown state; onabort tears the process down.
        it = _AbortableIter(chunkyreader(self.p.stdout, int(size)),
                           onabort = self._abort)
        try:
            yield objtype
            for blob in it:
                yield blob
            # The payload is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type and then the data chunks of object 'id'.

        Two git processes are run per call: 'git cat-file -t' for the
        type, then 'git cat-file <type>' for the content.  Both run
        against self.repo_dir.  Raises GitError (via _git_wait) if
        either command exits non-zero.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # Run the type query against self.repo_dir; _git_capture() would
        # use the default repository instead.
        type_argv = ['git', 'cat-file', '-t', id]
        tp = subprocess.Popen(type_argv,
                              stdout=subprocess.PIPE,
                              preexec_fn = _gitenv(self.repo_dir))
        objtype = tp.stdout.read().strip()
        _git_wait(repr(type_argv), tp)
        yield objtype

        p = subprocess.Popen(['git', 'cat-file', objtype, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield blob data reachable from the object streamed by 'it'.

        'it' is an iterator as returned by self.get(): the type name
        followed by data chunks.  Blobs are passed through; trees are
        decoded and each entry is joined recursively; for commits the
        tree named on the first line is joined.  Any other type raises
        GitError.
        """
        objtype = next(it)
        if objtype == 'blob':
            for blob in it:
                yield blob
        elif objtype == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif objtype == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % objtype)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1189             log('booger!\n')
1190
1191
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for 'repo_dir', creating it on first use.

    Pipes are cached in the module-level _cp dictionary, keyed by the
    absolute repository path; an empty 'repo_dir' means the path
    returned by repo().
    """
    global _cp
    key = os.path.abspath(repo_dir or repo())
    pipe = _cp.get(key)
    if pipe:
        return pipe
    pipe = _cp[key] = CatPipe(key)
    return pipe
1205
1206
def tags(repo_dir = None):
    """Map each tagged hash to the list of tag names pointing at it.

    The result has the form {hash: [tag_name, ...]}; tag names have
    their 'refs/tags/' prefix removed, and hashes are as yielded by
    list_refs() with limit_to_tags=True.
    """
    names_by_hash = {}
    for refname, sha in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(refname.startswith('refs/tags/'))
        # more than one tag can point at the same hash
        names_by_hash.setdefault(sha, []).append(refname[10:])
    return names_by_hash