lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   7 from collections import namedtuple
   8 from itertools import islice
   9
  10 from bup import _helpers, path, midx, bloom, xstat
  11 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  12                          hostname, log, merge_iter, mmap_read, mmap_readwrite,
  13                          progress, qprogress, unlink, username, userfullname,
  14                          utc_offset_str)
  15
  16
  17 max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
  18 max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
  19
  20 verbose = 0
  21 ignore_midx = 0
  22 repodir = None
  23
  24 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
  25 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
  26
  27 _total_searches = 0
  28 _total_steps = 0
  29
  30
  31 class GitError(Exception):
  32     pass
  33
  34
  35 def parse_tz_offset(s):
  36     """UTC offset in seconds."""
  37     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  38     if s[0] == '-':
  39         return - tz_off
  40     return tz_off
  41
  42
  43 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
  44 # Make sure that's authoritative.
  45 _start_end_char = r'[^ .,:;<>"\'\0\n]'
  46 _content_char = r'[^\0\n<>]'
  47 _safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
  48     % (_start_end_char,
  49        _start_end_char, _content_char, _start_end_char)
  50 _tz_rx = r'[-+]\d\d[0-5]\d'
  51 _parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
  52 _commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
  53 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
  54 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)
  55
  56 (?P<message>(?:.|\n)*)''' % (_parent_rx,
  57                              _safe_str_rx, _safe_str_rx, _tz_rx,
  58                              _safe_str_rx, _safe_str_rx, _tz_rx))
  59 _parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  60
  61
  62 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
  63 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
  64                                        'author_name', 'author_mail',
  65                                        'author_sec', 'author_offset',
  66                                        'committer_name', 'committer_mail',
  67                                        'committer_sec', 'committer_offset',
  68                                        'message'])
  69
  70 def parse_commit(content):
  71     commit_match = re.match(_commit_rx, content)
  72     if not commit_match:
  73         raise Exception('cannot parse commit %r' % content)
  74     matches = commit_match.groupdict()
  75     return CommitInfo(tree=matches['tree'],
  76                       parents=re.findall(_parent_hash_rx, matches['parents']),
  77                       author_name=matches['author_name'],
  78                       author_mail=matches['author_mail'],
  79                       author_sec=int(matches['asec']),
  80                       author_offset=parse_tz_offset(matches['atz']),
  81                       committer_name=matches['committer_name'],
  82                       committer_mail=matches['committer_mail'],
  83                       committer_sec=int(matches['csec']),
  84                       committer_offset=parse_tz_offset(matches['ctz']),
  85                       message=matches['message'])
  86
  87
  88 def get_commit_items(id, cp):
  89     commit_it = cp.get(id)
  90     assert(commit_it.next() == 'commit')
  91     commit_content = ''.join(commit_it)
  92     return parse_commit(commit_content)
  93
  94
  95 def repo(sub = '', repo_dir=None):
  96     """Get the path to the git repository or one of its subdirectories."""
  97     global repodir
  98     repo_dir = repo_dir or repodir
  99     if not repo_dir:
 100         raise GitError('You should call check_repo_or_die()')
 101
 102     # If there's a .git subdirectory, then the actual repo is in there.
 103     gd = os.path.join(repo_dir, '.git')
 104     if os.path.exists(gd):
 105         repodir = gd
 106
 107     return os.path.join(repo_dir, sub)
 108
 109
 110 def shorten_hash(s):
 111     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 112                   r'\1\2*\3', s)
 113
 114
 115 def repo_rel(path):
 116     full = os.path.abspath(path)
 117     fullrepo = os.path.abspath(repo(''))
 118     if not fullrepo.endswith('/'):
 119         fullrepo += '/'
 120     if full.startswith(fullrepo):
 121         path = full[len(fullrepo):]
 122     if path.startswith('index-cache/'):
 123         path = path[len('index-cache/'):]
 124     return shorten_hash(path)
 125
 126
 127 def all_packdirs():
 128     paths = [repo('objects/pack')]
 129     paths += glob.glob(repo('index-cache/*/.'))
 130     return paths
 131
 132
 133 def auto_midx(objdir):
 134     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 135     try:
 136         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 137     except OSError as e:
 138         # make sure 'args' gets printed to help with debugging
 139         add_error('%r: exception: %s' % (args, e))
 140         raise
 141     if rv:
 142         add_error('%r: returned %d' % (args, rv))
 143
 144     args = [path.exe(), 'bloom', '--dir', objdir]
 145     try:
 146         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 147     except OSError as e:
 148         # make sure 'args' gets printed to help with debugging
 149         add_error('%r: exception: %s' % (args, e))
 150         raise
 151     if rv:
 152         add_error('%r: returned %d' % (args, rv))
 153
 154
 155 def mangle_name(name, mode, gitmode):
 156     """Mangle a file name to present an abstract name for segmented files.
 157     Mangled file names will have the ".bup" extension added to them. If a
 158     file's name already ends with ".bup", a ".bupl" extension is added to
 159     disambiguate normal files from segmented ones.
 160     """
 161     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 162         assert(stat.S_ISDIR(gitmode))
 163         return name + '.bup'
 164     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 165         return name + '.bupl'
 166     else:
 167         return name
 168
 169
 170 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 171 def demangle_name(name, mode):
 172     """Remove name mangling from a file name, if necessary.
 173
 174     The return value is a tuple (demangled_filename,mode), where mode is one of
 175     the following:
 176
 177     * BUP_NORMAL  : files that should be read as-is from the repository
 178     * BUP_CHUNKED : files that were chunked and need to be reassembled
 179
 180     For more information on the name mangling algorithm, see mangle_name()
 181     """
 182     if name.endswith('.bupl'):
 183         return (name[:-5], BUP_NORMAL)
 184     elif name.endswith('.bup'):
 185         return (name[:-4], BUP_CHUNKED)
 186     elif name.endswith('.bupm'):
 187         return (name[:-5],
 188                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 189     else:
 190         return (name, BUP_NORMAL)
 191
 192
 193 def calc_hash(type, content):
 194     """Calculate some content's hash in the Git fashion."""
 195     header = '%s %d\0' % (type, len(content))
 196     sum = Sha1(header)
 197     sum.update(content)
 198     return sum.digest()
 199
 200
 201 def shalist_item_sort_key(ent):
 202     (mode, name, id) = ent
 203     assert(mode+0 == mode)
 204     if stat.S_ISDIR(mode):
 205         return name + '/'
 206     else:
 207         return name
 208
 209
 210 def tree_encode(shalist):
 211     """Generate a git tree object from (mode,name,hash) tuples."""
 212     shalist = sorted(shalist, key = shalist_item_sort_key)
 213     l = []
 214     for (mode,name,bin) in shalist:
 215         assert(mode)
 216         assert(mode+0 == mode)
 217         assert(name)
 218         assert(len(bin) == 20)
 219         s = '%o %s\0%s' % (mode,name,bin)
 220         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 221         l.append(s)
 222     return ''.join(l)
 223
 224
 225 def tree_decode(buf):
 226     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 227     ofs = 0
 228     while ofs < len(buf):
 229         z = buf.find('\0', ofs)
 230         assert(z > ofs)
 231         spl = buf[ofs:z].split(' ', 1)
 232         assert(len(spl) == 2)
 233         mode,name = spl
 234         sha = buf[z+1:z+1+20]
 235         ofs = z+1+20
 236         yield (int(mode, 8), name, sha)
 237
 238
 239 def _encode_packobj(type, content, compression_level=1):
 240     szout = ''
 241     sz = len(content)
 242     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 243     sz >>= 4
 244     while 1:
 245         if sz: szbits |= 0x80
 246         szout += chr(szbits)
 247         if not sz:
 248             break
 249         szbits = sz & 0x7f
 250         sz >>= 7
 251     if compression_level > 9:
 252         compression_level = 9
 253     elif compression_level < 0:
 254         compression_level = 0
 255     z = zlib.compressobj(compression_level)
 256     yield szout
 257     yield z.compress(content)
 258     yield z.flush()
 259
 260
 261 def _encode_looseobj(type, content, compression_level=1):
 262     z = zlib.compressobj(compression_level)
 263     yield z.compress('%s %d\0' % (type, len(content)))
 264     yield z.compress(content)
 265     yield z.flush()
 266
 267
 268 def _decode_looseobj(buf):
 269     assert(buf);
 270     s = zlib.decompress(buf)
 271     i = s.find('\0')
 272     assert(i > 0)
 273     l = s[:i].split(' ')
 274     type = l[0]
 275     sz = int(l[1])
 276     content = s[i+1:]
 277     assert(type in _typemap)
 278     assert(sz == len(content))
 279     return (type, content)
 280
 281
 282 def _decode_packobj(buf):
 283     assert(buf)
 284     c = ord(buf[0])
 285     type = _typermap[(c & 0x70) >> 4]
 286     sz = c & 0x0f
 287     shift = 4
 288     i = 0
 289     while c & 0x80:
 290         i += 1
 291         c = ord(buf[i])
 292         sz |= (c & 0x7f) << shift
 293         shift += 7
 294         if not (c & 0x80):
 295             break
 296     return (type, zlib.decompress(buf[i+1:]))
 297
 298
 299 class PackIdx:
 300     def __init__(self):
 301         assert(0)
 302
 303     def find_offset(self, hash):
 304         """Get the offset of an object inside the index file."""
 305         idx = self._idx_from_hash(hash)
 306         if idx != None:
 307             return self._ofs_from_idx(idx)
 308         return None
 309
 310     def exists(self, hash, want_source=False):
 311         """Return nonempty if the object exists in this index."""
 312         if hash and (self._idx_from_hash(hash) != None):
 313             return want_source and os.path.basename(self.name) or True
 314         return None
 315
 316     def __len__(self):
 317         return int(self.fanout[255])
 318
 319     def _idx_from_hash(self, hash):
 320         global _total_searches, _total_steps
 321         _total_searches += 1
 322         assert(len(hash) == 20)
 323         b1 = ord(hash[0])
 324         start = self.fanout[b1-1] # range -1..254
 325         end = self.fanout[b1] # range 0..255
 326         want = str(hash)
 327         _total_steps += 1  # lookup table is a step
 328         while start < end:
 329             _total_steps += 1
 330             mid = start + (end-start)/2
 331             v = self._idx_to_hash(mid)
 332             if v < want:
 333                 start = mid+1
 334             elif v > want:
 335                 end = mid
 336             else: # got it!
 337                 return mid
 338         return None
 339
 340
 341 class PackIdxV1(PackIdx):
 342     """Object representation of a Git pack index (version 1) file."""
 343     def __init__(self, filename, f):
 344         self.name = filename
 345         self.idxnames = [self.name]
 346         self.map = mmap_read(f)
 347         self.fanout = list(struct.unpack('!256I',
 348                                          str(buffer(self.map, 0, 256*4))))
 349         self.fanout.append(0)  # entry "-1"
 350         nsha = self.fanout[255]
 351         self.sha_ofs = 256*4
 352         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 353
 354     def _ofs_from_idx(self, idx):
 355         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 356
 357     def _idx_to_hash(self, idx):
 358         return str(self.shatable[idx*24+4 : idx*24+24])
 359
 360     def __iter__(self):
 361         for i in xrange(self.fanout[255]):
 362             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 363
 364
 365 class PackIdxV2(PackIdx):
 366     """Object representation of a Git pack index (version 2) file."""
 367     def __init__(self, filename, f):
 368         self.name = filename
 369         self.idxnames = [self.name]
 370         self.map = mmap_read(f)
 371         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 372         self.fanout = list(struct.unpack('!256I',
 373                                          str(buffer(self.map, 8, 256*4))))
 374         self.fanout.append(0)  # entry "-1"
 375         nsha = self.fanout[255]
 376         self.sha_ofs = 8 + 256*4
 377         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 378         self.ofstable = buffer(self.map,
 379                                self.sha_ofs + nsha*20 + nsha*4,
 380                                nsha*4)
 381         self.ofs64table = buffer(self.map,
 382                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 383
 384     def _ofs_from_idx(self, idx):
 385         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 386         if ofs & 0x80000000:
 387             idx64 = ofs & 0x7fffffff
 388             ofs = struct.unpack('!Q',
 389                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 390         return ofs
 391
 392     def _idx_to_hash(self, idx):
 393         return str(self.shatable[idx*20:(idx+1)*20])
 394
 395     def __iter__(self):
 396         for i in xrange(self.fanout[255]):
 397             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 398
 399
# Number of live PackIdxList instances; __init__ asserts there is never more
# than one at a time.
_mpi_count = 0
class PackIdxList:
    """The set of pack indexes (.idx and .midx) found in one directory.

    Also tracks extra hashes registered with add() and, when a usable
    'bup.bloom' file is present, consults it before searching the indexes.
    """
    def __init__(self, dir):
        """Create the list for directory dir and load its indexes."""
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()      # hashes registered through add()
        self.packs = []        # open index objects, rebuilt by refresh()
        self.do_bloom = False  # whether exists() should ask the bloom first
        self.bloom = None
        self.refresh()

    def __del__(self):
        """Release this instance's slot in the _mpi_count instance counter."""
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        """Iterate over the merged contents of all indexes (see idxmerge)."""
        return iter(idxmerge(self.packs))

    def __len__(self):
        """Sum of the lengths of all indexes in the list."""
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Hashes registered with add() yield True.  Otherwise the result is
        whatever the first matching index's exists() returns, or None if no
        index has the hash.
        """
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # bloom says "maybe": skip it until an index search misses
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # nothing found in any index: consult the bloom again next time
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file name to the index object that covers it; already
        # open indexes are carried over (minus midxes when skipping them).
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # the .idx files named by an already-open midx are covered
                # by that midx
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # open the midx files not seen yet; one that names a missing
                # .idx is reported and deleted
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # biggest midx first; for equal sizes, most recent mtime first
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        ix.close()
                        unlink(ix.name)
            # open every .idx that is not covered yet; unreadable ones are
            # recorded with add_error() and skipped
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # several names can map to the same index object; keep each
            # object once, largest first
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # only use the bloom if it is valid and at least as long as
            # the combined indexes
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 529
 530
 531 def open_idx(filename):
 532     if filename.endswith('.idx'):
 533         f = open(filename, 'rb')
 534         header = f.read(8)
 535         if header[0:4] == '\377tOc':
 536             version = struct.unpack('!I', header[4:8])[0]
 537             if version == 2:
 538                 return PackIdxV2(filename, f)
 539             else:
 540                 raise GitError('%s: expected idx file version 2, got %d'
 541                                % (filename, version))
 542         elif len(header) == 8 and header[0:4] < '\377tOc':
 543             return PackIdxV1(filename, f)
 544         else:
 545             raise GitError('%s: unrecognized idx file header' % filename)
 546     elif filename.endswith('.midx'):
 547         return midx.PackMidx(filename)
 548     else:
 549         raise GitError('idx filenames must end with .idx or .midx')
 550
 551
 552 def idxmerge(idxlist, final_progress=True):
 553     """Generate a list of all the objects reachable in a PackIdxList."""
 554     def pfunc(count, total):
 555         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 556                   % (count*100.0/total, count, total))
 557     def pfinal(count, total):
 558         if final_progress:
 559             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 560                      % (100, total, total))
 561     return merge_iter(idxlist, 10024, pfunc, pfinal)
 562
 563
 564 def _make_objcache():
 565     return PackIdxList(repo('objects/pack'))
 566
class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1):
        """Set up a writer; no file is created until the first write.

        objcache_maker is called on demand to build the object cache used
        by exists(); compression_level is passed to _encode_packobj().
        """
        self.file = None       # temporary pack file, opened by _open()
        self.count = 0         # objects written to the current pack
        self.outbytes = 0      # bytes written to the current pack
        self.filename = None   # temporary path without the '.pack' suffix
        self.idx = None        # 256 lists of (sha, crc, offset), by first byte
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level

    def __del__(self):
        """Finish any open pack when the writer is garbage collected."""
        self.close()

    def _open(self):
        """Create the temporary pack file and write its header, if needed."""
        if not self.file:
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                os.close(fd)
                raise
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            # the trailing four bytes are the object count, filled in by _end()
            self.file.write('PACK\0\0\0\2\0\0\0\0')
            self.idx = list(list() for i in xrange(256))

    def _raw_write(self, datalist, sha):
        """Write the pieces in datalist as one object and index it under sha.

        Returns (bytes written, crc32 of those bytes).  An IOError from the
        write is re-raised as GitError with the original traceback.
        """
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        """Record (sha, crc, offset) for the object just written.

        The offset is the current file position minus size.
        """
        assert(sha)
        if self.idx:
            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        """Encode and write one object; return its sha.

        The sha is computed with calc_hash() when not supplied.  The pack
        is finished (see breakpoint) once max_pack_size or
        max_pack_objects is reached.
        """
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        """Make sure self.objcache exists, creating it if possible.

        Raises GitError if there is no cache and no objcache_maker.
        """
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            # _write() may have finished the pack, which drops the cache
            # (see _end), so make sure there is one before adding to it
            self._require_objcache()
            self.objcache.add(sha)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write('tree', content)

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        """Assemble a commit from its parts and write it; return its id.

        tree and parent are binary hashes (hex-encoded here); header lines
        whose value is empty are left out.
        """
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack.

        The current user ('full name <user@host>') is used as both author
        and committer, with date for both timestamps.
        """
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
                                  msg)
        return commit

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            self.idx = None
            self.file = None
            try:
                os.unlink(self.filename + '.pack')
            finally:
                f.close()

    def _end(self, run_midx=True):
        """Finish the current pack, if any, and move it into objects/pack.

        Writes the object count into the header, appends the SHA-1 of the
        pack, writes the matching .idx, and renames both files to
        'pack-<hash of the index's sorted object list>'.  Runs auto_midx()
        afterwards unless run_midx is false.

        Returns the new path without extension, or None if no pack was open.
        """
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
        finally:
            f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)

        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        if run_midx:
            auto_midx(repo('objects/pack'))
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write the version 2 index for the pack to filename.

        idx is the per-first-byte list of (sha, crc, offset) entries and
        packbin the pack's binary SHA-1.  Returns the hex SHA-1 of the
        index's hash table, which _end() uses to name the pack.
        """
        # offsets of 2**31 and above need an entry in the 64-bit table
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, idx, self.count)
                assert(count == self.count)
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        # Reopen in append mode to add the trailer: the pack checksum, then
        # a checksum of everything in the index that precedes it.
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            # the hash table feeds both the index checksum and the pack name
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            return namebase
        finally:
            idx_f.close()
 780
 781
 782 def _git_date(date):
 783     return '%d %s' % (date, utc_offset_str(date))
 784
 785
 786 def _gitenv(repo_dir = None):
 787     if not repo_dir:
 788         repo_dir = repo()
 789     def env():
 790         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 791     return env
 792
 793
 794 def list_refs(refname=None, repo_dir=None,
 795               limit_to_heads=False, limit_to_tags=False):
 796     """Yield (refname, hash) tuples for all repository refs unless a ref
 797     name is specified.  Given a ref name, only include tuples for that
 798     particular ref.  The limits restrict the result items to
 799     refs/heads or refs/tags.  If both limits are specified, items from
 800     both sources will be included.
 801
 802     """
 803     argv = ['git', 'show-ref']
 804     if limit_to_heads:
 805         argv.append('--heads')
 806     if limit_to_tags:
 807         argv.append('--tags')
 808     argv.append('--')
 809     if refname:
 810         argv += [refname]
 811     p = subprocess.Popen(argv,
 812                          preexec_fn = _gitenv(repo_dir),
 813                          stdout = subprocess.PIPE)
 814     out = p.stdout.read().strip()
 815     rv = p.wait()  # not fatal
 816     if rv:
 817         assert(not out)
 818     if out:
 819         for d in out.split('\n'):
 820             (sha, name) = d.split(' ', 1)
 821             yield (name, sha.decode('hex'))
 822
 823
 824 def read_ref(refname, repo_dir = None):
 825     """Get the commit id of the most recent commit made on a given ref."""
 826     refs = list_refs(refname, repo_dir=repo_dir, limit_to_heads=True)
 827     l = tuple(islice(refs, 2))
 828     if l:
 829         assert(len(l) == 1)
 830         return l[0][1]
 831     else:
 832         return None
 833
 834
 835 def rev_list(ref, count=None, repo_dir=None):
 836     """Generate a list of reachable commits in reverse chronological order.
 837
 838     This generator walks through commits, from child to parent, that are
 839     reachable via the specified ref and yields a series of tuples of the form
 840     (date,hash).
 841
 842     If count is a non-zero integer, limit the number of commits to "count"
 843     objects.
 844     """
 845     assert(not ref.startswith('-'))
 846     opts = []
 847     if count:
 848         opts += ['-n', str(atoi(count))]
 849     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 850     p = subprocess.Popen(argv,
 851                          preexec_fn = _gitenv(repo_dir),
 852                          stdout = subprocess.PIPE)
 853     commit = None
 854     for row in p.stdout:
 855         s = row.strip()
 856         if s.startswith('commit '):
 857             commit = s[7:].decode('hex')
 858         else:
 859             date = int(s)
 860             yield (date, commit)
 861     rv = p.wait()  # not fatal
 862     if rv:
 863         raise GitError, 'git rev-list returned error %d' % rv
 864
 865
 866 def get_commit_dates(refs, repo_dir=None):
 867     """Get the dates for the specified commit refs.  For now, every unique
 868        string in refs must resolve to a different commit or this
 869        function will fail."""
 870     result = []
 871     for ref in refs:
 872         commit = get_commit_items(ref, cp(repo_dir))
 873         result.append(commit.author_sec)
 874     return result
 875
 876
 877 def rev_parse(committish, repo_dir=None):
 878     """Resolve the full hash for 'committish', if it exists.
 879
 880     Should be roughly equivalent to 'git rev-parse'.
 881
 882     Returns the hex value of the hash if it is found, None if 'committish' does
 883     not correspond to anything.
 884     """
 885     head = read_ref(committish, repo_dir=repo_dir)
 886     if head:
 887         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 888         return head
 889
 890     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
 891
 892     if len(committish) == 40:
 893         try:
 894             hash = committish.decode('hex')
 895         except TypeError:
 896             return None
 897
 898         if pL.exists(hash):
 899             return hash
 900
 901     return None
 902
 903
 904 def update_ref(refname, newval, oldval, repo_dir=None):
 905     """Update a repository reference."""
 906     if not oldval:
 907         oldval = ''
 908     assert(refname.startswith('refs/heads/') \
 909            or refname.startswith('refs/tags/'))
 910     p = subprocess.Popen(['git', 'update-ref', refname,
 911                           newval.encode('hex'), oldval.encode('hex')],
 912                          preexec_fn = _gitenv(repo_dir))
 913     _git_wait('git update-ref', p)
 914
 915
 916 def delete_ref(refname):
 917     """Delete a repository reference."""
 918     assert(refname.startswith('refs/'))
 919     p = subprocess.Popen(['git', 'update-ref', '-d', refname],
 920                          preexec_fn = _gitenv())
 921     _git_wait('git update-ref', p)
 922
 923
 924 def guess_repo(path=None):
 925     """Set the path value in the global variable "repodir".
 926     This makes bup look for an existing bup repository, but not fail if a
 927     repository doesn't exist. Usually, if you are interacting with a bup
 928     repository, you would not be calling this function but using
 929     check_repo_or_die().
 930     """
 931     global repodir
 932     if path:
 933         repodir = path
 934     if not repodir:
 935         repodir = os.environ.get('BUP_DIR')
 936         if not repodir:
 937             repodir = os.path.expanduser('~/.bup')
 938
 939
 940 def init_repo(path=None):
 941     """Create the Git bare repository for bup in a given path."""
 942     guess_repo(path)
 943     d = repo()  # appends a / to the path
 944     parent = os.path.dirname(os.path.dirname(d))
 945     if parent and not os.path.exists(parent):
 946         raise GitError('parent directory "%s" does not exist\n' % parent)
 947     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 948         raise GitError('"%s" exists but is not a directory\n' % d)
 949     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 950                          preexec_fn = _gitenv())
 951     _git_wait('git init', p)
 952     # Force the index version configuration in order to ensure bup works
 953     # regardless of the version of the installed Git binary.
 954     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 955                          stdout=sys.stderr, preexec_fn = _gitenv())
 956     _git_wait('git config', p)
 957     # Enable the reflog
 958     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 959                          stdout=sys.stderr, preexec_fn = _gitenv())
 960     _git_wait('git config', p)
 961
 962
 963 def check_repo_or_die(path=None):
 964     """Make sure a bup repository exists, and abort if not.
 965     If the path to a particular repository was not specified, this function
 966     initializes the default repository automatically.
 967     """
 968     guess_repo(path)
 969     try:
 970         os.stat(repo('objects/pack/.'))
 971     except OSError as e:
 972         if e.errno == errno.ENOENT:
 973             log('error: %r is not a bup repository; run "bup init"\n'
 974                 % repo())
 975             sys.exit(15)
 976         else:
 977             log('error: %s\n' % e)
 978             sys.exit(14)
 979
 980
 981 _ver = None
 982 def ver():
 983     """Get Git's version and ensure a usable version is installed.
 984
 985     The returned version is formatted as an ordered tuple with each position
 986     representing a digit in the version tag. For example, the following tuple
 987     would represent version 1.6.6.9:
 988
 989         ('1', '6', '6', '9')
 990     """
 991     global _ver
 992     if not _ver:
 993         p = subprocess.Popen(['git', '--version'],
 994                              stdout=subprocess.PIPE)
 995         gvs = p.stdout.read()
 996         _git_wait('git --version', p)
 997         m = re.match(r'git version (\S+.\S+)', gvs)
 998         if not m:
 999             raise GitError('git --version weird output: %r' % gvs)
1000         _ver = tuple(m.group(1).split('.'))
1001     needed = ('1','5', '3', '1')
1002     if _ver < needed:
1003         raise GitError('git version %s or higher is required; you have %s'
1004                        % ('.'.join(needed), '.'.join(_ver)))
1005     return _ver
1006
1007
def _git_wait(cmd, p):
    """Wait for process 'p' and raise GitError if it exited non-zero.

    'cmd' is only used to label the error message.
    """
    status = p.wait()
    if status == 0:
        return
    raise GitError('%s returned %d' % (cmd, status))
1012
1013
def _git_capture(argv):
    """Run 'argv' against the default repository and return its stdout.

    GIT_DIR is set by _gitenv() with no repo_dir argument.  Raises
    GitError (via _git_wait) if the command exits with a non-zero status;
    the error message is labelled with repr(argv).
    """
    proc = subprocess.Popen(argv,
                            preexec_fn = _gitenv(),
                            stdout=subprocess.PIPE)
    output = proc.stdout.read()
    _git_wait(repr(argv), proc)
    return output
1019
1020
class _AbortableIter:
    """Iterator wrapper that fires a callback when iteration is cut short.

    'it' is the wrapped iterator (its next() method is used); 'onabort',
    if given, is called at most once when the wrapper is aborted before
    the wrapped iterator has been exhausted.
    """
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        # Becomes True once the iterator is exhausted or aborted.
        self.done = None

    def __iter__(self):
        return self

    def next(self):
        """Return the next item, aborting if the wrapped iterator fails."""
        try:
            item = self.it.next()
        except StopIteration:
            # Normal exhaustion: mark done so abort() stays a no-op.
            self.done = True
            raise
        except:
            # Deliberately bare: any other failure aborts before re-raising.
            self.abort()
            raise
        return item

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if self.done:
            return
        self.done = True
        callback = self.onabort
        if callback:
            callback()

    def __del__(self):
        # Dropping an unfinished wrapper counts as an abort.
        self.abort()
1049
1050
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self, repo_dir = None):
        """Select the retrieval strategy based on the installed git.

        With git older than 1.5.6 a warning is logged (once per process)
        and self.get is bound to _slow_get; otherwise self.get is bound
        to _fast_get, which talks to one 'git cat-file --batch' process.
        """
        global _ver_warned
        self.repo_dir = repo_dir
        wanted = ('1','5','6')
        if ver() < wanted:
            if not _ver_warned:
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            self.p = self.inprogress = None
            self.get = self._fast_get

    def _abort(self):
        """Close the batch process's pipes and forget any request state."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        """Drop any existing batch process and start a fresh one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        """Yield the type of object 'id', then its content in chunks.

        Uses the 'git cat-file --batch' process, (re)starting it when
        needed.  Only one request may be open at a time; the previous
        generator must be exhausted before the next call.

        Raises KeyError if git reports the object as missing, and
        GitError if the reply header is malformed.
        """
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            # The stream can no longer be trusted.  Drop the process and
            # clear inprogress so the next request starts a fresh one
            # instead of failing the "not inprogress" assertion above.
            self._abort()
            raise GitError('expected blob, got %r' % spl)
        (_hex, type, size) = spl

        it = _AbortableIter(chunkyreader(self.p.stdout, int(size)),
                           onabort = self._abort)
        try:
            yield type
            for blob in it:
                yield blob
            # The object's content is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type of object 'id', then its content in chunks.

        Fallback that spawns two 'git cat-file' processes per object.
        Raises GitError (via _git_wait) if either command fails.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # Ask for the type in self.repo_dir, the same repository the
        # content is read from below.  The _git_capture() helper always
        # targets the default repository, so it is not used here.
        argv = ['git', 'cat-file', '-t', id]
        p = subprocess.Popen(argv,
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        type = p.stdout.read().strip()
        _git_wait(repr(argv), p)
        yield type

        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield blob content reachable from the object streamed by 'it'.

        'it' yields the object type first and then the object's content
        (the shape produced by self.get).  Blobs are passed through,
        trees are decoded and each entry is joined recursively, and
        commits are followed to their tree.  Raises GitError for any
        other object type.
        """
        type = it.next()
        if type == 'blob':
            for blob in it:
                yield blob
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif type == 'commit':
            # The first line of a commit is 'tree <hex id>'.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % type)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1166
1167
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for a repository, creating it on first use.

    CatPipe objects are cached in the module-level _cp dictionary, keyed
    by the absolute repository path.  A false 'repo_dir' is replaced by
    repo().
    """
    key = os.path.abspath(repo_dir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1181
1182
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Tag names have their 'refs/tags/' prefix removed; the hashes are the
    ones yielded by list_refs().
    """
    by_commit = {}
    for ref, sha in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(ref.startswith('refs/tags/'))
        # setdefault: more than one tag can point at the same hash.
        by_commit.setdefault(sha, []).append(ref[10:])
    return by_commit