lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5
   6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
   7 from collections import namedtuple
   8 from itertools import islice
   9
  10 from bup import _helpers, path, midx, bloom, xstat
  11 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
  12                          fdatasync,
  13                          hostname, log, merge_iter, mmap_read, mmap_readwrite,
  14                          progress, qprogress, unlink, username, userfullname,
  15                          utc_offset_str)
  16
  17
# Thresholds checked by PackWriter._write(): once either one is reached the
# current pack is finished and a new one is started.
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object

verbose = 0  # when nonzero, PackWriter._write() logs a '>' per object
ignore_midx = 0  # when nonzero, PackIdxList.refresh() always skips .midx files
repodir = None  # default repository path used by repo() when none is given

# Object type name <-> numeric type code stored in pack object headers
# (see _encode_packobj() / _decode_packobj()).
_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

# Statistics counters updated by the PackIdx / PackIdxList lookup code.
_total_searches = 0
_total_steps = 0
  30
  31
  32 class GitError(Exception):
  33     pass
  34
  35
  36 def parse_tz_offset(s):
  37     """UTC offset in seconds."""
  38     tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
  39     if s[0] == '-':
  40         return - tz_off
  41     return tz_off
  42
  43
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Character allowed as the first or last character of a name/mail string.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
# Character allowed anywhere else inside a name/mail string.
_content_char = r'[^\0\n<>]'
# A name/mail string: either one or two start/end characters, or a
# start/end character, any number of content characters, and another
# start/end character.
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
# Timezone offset: a sign, two digits, then two more digits whose first is
# 0-5 (the format consumed by parse_tz_offset()).
_tz_rx = r'[-+]\d\d[0-5]\d'
# One complete "parent <40 hex digits>" header line, newline included.
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Whole commit object: tree line, zero or more parent lines, author line,
# committer line, an empty line, then the message (which may span lines).
# The named groups are the ones parse_commit() reads.
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Extracts the hex hash from each parent line of the 'parents' group above.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
  61
  62
# Parsed form of a commit object, as built by parse_commit().
# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
# The *_offset fields are UTC offsets in seconds (see parse_tz_offset());
# tree is a 40-character hex string and parents is a list of such strings.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
  70
  71 def parse_commit(content):
  72     commit_match = re.match(_commit_rx, content)
  73     if not commit_match:
  74         raise Exception('cannot parse commit %r' % content)
  75     matches = commit_match.groupdict()
  76     return CommitInfo(tree=matches['tree'],
  77                       parents=re.findall(_parent_hash_rx, matches['parents']),
  78                       author_name=matches['author_name'],
  79                       author_mail=matches['author_mail'],
  80                       author_sec=int(matches['asec']),
  81                       author_offset=parse_tz_offset(matches['atz']),
  82                       committer_name=matches['committer_name'],
  83                       committer_mail=matches['committer_mail'],
  84                       committer_sec=int(matches['csec']),
  85                       committer_offset=parse_tz_offset(matches['ctz']),
  86                       message=matches['message'])
  87
  88
  89 def get_commit_items(id, cp):
  90     commit_it = cp.get(id)
  91     assert(commit_it.next() == 'commit')
  92     commit_content = ''.join(commit_it)
  93     return parse_commit(commit_content)
  94
  95
  96 def _local_git_date_str(epoch_sec):
  97     return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
  98
  99
 100 def _git_date_str(epoch_sec, tz_offset_sec):
 101     offs =  tz_offset_sec // 60
 102     return '%d %s%02d%02d' \
 103         % (epoch_sec,
 104            '+' if offs >= 0 else '-',
 105            abs(offs) // 60,
 106            abs(offs) % 60)
 107
 108
 109 def repo(sub = '', repo_dir=None):
 110     """Get the path to the git repository or one of its subdirectories."""
 111     global repodir
 112     repo_dir = repo_dir or repodir
 113     if not repo_dir:
 114         raise GitError('You should call check_repo_or_die()')
 115
 116     # If there's a .git subdirectory, then the actual repo is in there.
 117     gd = os.path.join(repo_dir, '.git')
 118     if os.path.exists(gd):
 119         repodir = gd
 120
 121     return os.path.join(repo_dir, sub)
 122
 123
 124 def shorten_hash(s):
 125     return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
 126                   r'\1\2*\3', s)
 127
 128
 129 def repo_rel(path):
 130     full = os.path.abspath(path)
 131     fullrepo = os.path.abspath(repo(''))
 132     if not fullrepo.endswith('/'):
 133         fullrepo += '/'
 134     if full.startswith(fullrepo):
 135         path = full[len(fullrepo):]
 136     if path.startswith('index-cache/'):
 137         path = path[len('index-cache/'):]
 138     return shorten_hash(path)
 139
 140
 141 def all_packdirs():
 142     paths = [repo('objects/pack')]
 143     paths += glob.glob(repo('index-cache/*/.'))
 144     return paths
 145
 146
 147 def auto_midx(objdir):
 148     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 149     try:
 150         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 151     except OSError as e:
 152         # make sure 'args' gets printed to help with debugging
 153         add_error('%r: exception: %s' % (args, e))
 154         raise
 155     if rv:
 156         add_error('%r: returned %d' % (args, rv))
 157
 158     args = [path.exe(), 'bloom', '--dir', objdir]
 159     try:
 160         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 161     except OSError as e:
 162         # make sure 'args' gets printed to help with debugging
 163         add_error('%r: exception: %s' % (args, e))
 164         raise
 165     if rv:
 166         add_error('%r: returned %d' % (args, rv))
 167
 168
 169 def mangle_name(name, mode, gitmode):
 170     """Mangle a file name to present an abstract name for segmented files.
 171     Mangled file names will have the ".bup" extension added to them. If a
 172     file's name already ends with ".bup", a ".bupl" extension is added to
 173     disambiguate normal files from segmented ones.
 174     """
 175     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 176         assert(stat.S_ISDIR(gitmode))
 177         return name + '.bup'
 178     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 179         return name + '.bupl'
 180     else:
 181         return name
 182
 183
 184 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 185 def demangle_name(name, mode):
 186     """Remove name mangling from a file name, if necessary.
 187
 188     The return value is a tuple (demangled_filename,mode), where mode is one of
 189     the following:
 190
 191     * BUP_NORMAL  : files that should be read as-is from the repository
 192     * BUP_CHUNKED : files that were chunked and need to be reassembled
 193
 194     For more information on the name mangling algorithm, see mangle_name()
 195     """
 196     if name.endswith('.bupl'):
 197         return (name[:-5], BUP_NORMAL)
 198     elif name.endswith('.bup'):
 199         return (name[:-4], BUP_CHUNKED)
 200     elif name.endswith('.bupm'):
 201         return (name[:-5],
 202                 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
 203     else:
 204         return (name, BUP_NORMAL)
 205
 206
 207 def calc_hash(type, content):
 208     """Calculate some content's hash in the Git fashion."""
 209     header = '%s %d\0' % (type, len(content))
 210     sum = Sha1(header)
 211     sum.update(content)
 212     return sum.digest()
 213
 214
 215 def shalist_item_sort_key(ent):
 216     (mode, name, id) = ent
 217     assert(mode+0 == mode)
 218     if stat.S_ISDIR(mode):
 219         return name + '/'
 220     else:
 221         return name
 222
 223
 224 def tree_encode(shalist):
 225     """Generate a git tree object from (mode,name,hash) tuples."""
 226     shalist = sorted(shalist, key = shalist_item_sort_key)
 227     l = []
 228     for (mode,name,bin) in shalist:
 229         assert(mode)
 230         assert(mode+0 == mode)
 231         assert(name)
 232         assert(len(bin) == 20)
 233         s = '%o %s\0%s' % (mode,name,bin)
 234         assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
 235         l.append(s)
 236     return ''.join(l)
 237
 238
 239 def tree_decode(buf):
 240     """Generate a list of (mode,name,hash) from the git tree object in buf."""
 241     ofs = 0
 242     while ofs < len(buf):
 243         z = buf.find('\0', ofs)
 244         assert(z > ofs)
 245         spl = buf[ofs:z].split(' ', 1)
 246         assert(len(spl) == 2)
 247         mode,name = spl
 248         sha = buf[z+1:z+1+20]
 249         ofs = z+1+20
 250         yield (int(mode, 8), name, sha)
 251
 252
 253 def _encode_packobj(type, content, compression_level=1):
 254     szout = ''
 255     sz = len(content)
 256     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 257     sz >>= 4
 258     while 1:
 259         if sz: szbits |= 0x80
 260         szout += chr(szbits)
 261         if not sz:
 262             break
 263         szbits = sz & 0x7f
 264         sz >>= 7
 265     if compression_level > 9:
 266         compression_level = 9
 267     elif compression_level < 0:
 268         compression_level = 0
 269     z = zlib.compressobj(compression_level)
 270     yield szout
 271     yield z.compress(content)
 272     yield z.flush()
 273
 274
 275 def _encode_looseobj(type, content, compression_level=1):
 276     z = zlib.compressobj(compression_level)
 277     yield z.compress('%s %d\0' % (type, len(content)))
 278     yield z.compress(content)
 279     yield z.flush()
 280
 281
 282 def _decode_looseobj(buf):
 283     assert(buf);
 284     s = zlib.decompress(buf)
 285     i = s.find('\0')
 286     assert(i > 0)
 287     l = s[:i].split(' ')
 288     type = l[0]
 289     sz = int(l[1])
 290     content = s[i+1:]
 291     assert(type in _typemap)
 292     assert(sz == len(content))
 293     return (type, content)
 294
 295
 296 def _decode_packobj(buf):
 297     assert(buf)
 298     c = ord(buf[0])
 299     type = _typermap[(c & 0x70) >> 4]
 300     sz = c & 0x0f
 301     shift = 4
 302     i = 0
 303     while c & 0x80:
 304         i += 1
 305         c = ord(buf[i])
 306         sz |= (c & 0x7f) << shift
 307         shift += 7
 308         if not (c & 0x80):
 309             break
 310     return (type, zlib.decompress(buf[i+1:]))
 311
 312
 313 class PackIdx:
 314     def __init__(self):
 315         assert(0)
 316
 317     def find_offset(self, hash):
 318         """Get the offset of an object inside the index file."""
 319         idx = self._idx_from_hash(hash)
 320         if idx != None:
 321             return self._ofs_from_idx(idx)
 322         return None
 323
 324     def exists(self, hash, want_source=False):
 325         """Return nonempty if the object exists in this index."""
 326         if hash and (self._idx_from_hash(hash) != None):
 327             return want_source and os.path.basename(self.name) or True
 328         return None
 329
 330     def __len__(self):
 331         return int(self.fanout[255])
 332
 333     def _idx_from_hash(self, hash):
 334         global _total_searches, _total_steps
 335         _total_searches += 1
 336         assert(len(hash) == 20)
 337         b1 = ord(hash[0])
 338         start = self.fanout[b1-1] # range -1..254
 339         end = self.fanout[b1] # range 0..255
 340         want = str(hash)
 341         _total_steps += 1  # lookup table is a step
 342         while start < end:
 343             _total_steps += 1
 344             mid = start + (end-start)/2
 345             v = self._idx_to_hash(mid)
 346             if v < want:
 347                 start = mid+1
 348             elif v > want:
 349                 end = mid
 350             else: # got it!
 351                 return mid
 352         return None
 353
 354
 355 class PackIdxV1(PackIdx):
 356     """Object representation of a Git pack index (version 1) file."""
 357     def __init__(self, filename, f):
 358         self.name = filename
 359         self.idxnames = [self.name]
 360         self.map = mmap_read(f)
 361         self.fanout = list(struct.unpack('!256I',
 362                                          str(buffer(self.map, 0, 256*4))))
 363         self.fanout.append(0)  # entry "-1"
 364         nsha = self.fanout[255]
 365         self.sha_ofs = 256*4
 366         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 367
 368     def _ofs_from_idx(self, idx):
 369         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 370
 371     def _idx_to_hash(self, idx):
 372         return str(self.shatable[idx*24+4 : idx*24+24])
 373
 374     def __iter__(self):
 375         for i in xrange(self.fanout[255]):
 376             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 377
 378
 379 class PackIdxV2(PackIdx):
 380     """Object representation of a Git pack index (version 2) file."""
 381     def __init__(self, filename, f):
 382         self.name = filename
 383         self.idxnames = [self.name]
 384         self.map = mmap_read(f)
 385         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 386         self.fanout = list(struct.unpack('!256I',
 387                                          str(buffer(self.map, 8, 256*4))))
 388         self.fanout.append(0)  # entry "-1"
 389         nsha = self.fanout[255]
 390         self.sha_ofs = 8 + 256*4
 391         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 392         self.ofstable = buffer(self.map,
 393                                self.sha_ofs + nsha*20 + nsha*4,
 394                                nsha*4)
 395         self.ofs64table = buffer(self.map,
 396                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 397
 398     def _ofs_from_idx(self, idx):
 399         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 400         if ofs & 0x80000000:
 401             idx64 = ofs & 0x7fffffff
 402             ofs = struct.unpack('!Q',
 403                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 404         return ofs
 405
 406     def _idx_to_hash(self, idx):
 407         return str(self.shatable[idx*20:(idx+1)*20])
 408
 409     def __iter__(self):
 410         for i in xrange(self.fanout[255]):
 411             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 412
 413
# Number of live PackIdxList instances; __init__ asserts there is never more
# than one.
_mpi_count = 0
class PackIdxList:
    """The set of index objects (.idx / .midx, plus an optional bloom
    filter) found in one directory, searchable as a single index."""
    def __init__(self, dir):
        """Load the indexes found in dir (see refresh())."""
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()  # extra hashes registered through add()
        self.packs = []  # open index objects, reordered by exists()
        self.do_bloom = False  # whether exists() consults the bloom first
        self.bloom = None  # bloom.ShaBloom for self.dir, if usable
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Hashes registered with add() always report True.  Otherwise the
        result is whatever the first matching index's exists() returns, or
        None when no index has the object.
        """
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # Bloom says "maybe": fall through to the indexes and stop
                # asking the bloom until a lookup misses again (below).
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        # Not found in any index: consult the bloom first on the next call.
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # d maps a file path to the index object that covers it; a .midx is
        # also entered under the path of every .idx it contains.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                # Re-register the .idx files covered by already loaded midxes.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Open .midx files not loaded yet; one that refers to a
                # missing .idx is considered broken and deleted.
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                    '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                # Consider the biggest midx first, newest first among equals.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        ix.close()
                        unlink(ix.name)
            # Open every .idx file that nothing in d covers yet.
            # NOTE(review): 'not d.get(full)' is also true for an already
            # loaded index whose len() is 0, which is then opened again --
            # confirm that is intended.
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # d may hold the same object under several keys; keep each once,
            # largest first.
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # Only use the bloom if it is valid and holds at least as many
            # entries as the indexes do.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)
 543
 544
 545 def open_idx(filename):
 546     if filename.endswith('.idx'):
 547         f = open(filename, 'rb')
 548         header = f.read(8)
 549         if header[0:4] == '\377tOc':
 550             version = struct.unpack('!I', header[4:8])[0]
 551             if version == 2:
 552                 return PackIdxV2(filename, f)
 553             else:
 554                 raise GitError('%s: expected idx file version 2, got %d'
 555                                % (filename, version))
 556         elif len(header) == 8 and header[0:4] < '\377tOc':
 557             return PackIdxV1(filename, f)
 558         else:
 559             raise GitError('%s: unrecognized idx file header' % filename)
 560     elif filename.endswith('.midx'):
 561         return midx.PackMidx(filename)
 562     else:
 563         raise GitError('idx filenames must end with .idx or .midx')
 564
 565
 566 def idxmerge(idxlist, final_progress=True):
 567     """Generate a list of all the objects reachable in a PackIdxList."""
 568     def pfunc(count, total):
 569         qprogress('Reading indexes: %.2f%% (%d/%d)\r'
 570                   % (count*100.0/total, count, total))
 571     def pfinal(count, total):
 572         if final_progress:
 573             progress('Reading indexes: %.2f%% (%d/%d), done.\n'
 574                      % (100, total, total))
 575     return merge_iter(idxlist, 10024, pfunc, pfinal)
 576
 577
 578 def _make_objcache():
 579     return PackIdxList(repo('objects/pack'))
 580
 581 class PackWriter:
 582     """Writes Git objects inside a pack file."""
 583     def __init__(self, objcache_maker=_make_objcache, compression_level=1):
 584         self.file = None
 585         self.parentfd = None
 586         self.count = 0
 587         self.outbytes = 0
 588         self.filename = None
 589         self.idx = None
 590         self.objcache_maker = objcache_maker
 591         self.objcache = None
 592         self.compression_level = compression_level
 593
 594     def __del__(self):
 595         self.close()
 596
 597     def _open(self):
 598         if not self.file:
 599             objdir = dir=repo('objects')
 600             fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
 601             try:
 602                 self.file = os.fdopen(fd, 'w+b')
 603             except:
 604                 os.close(fd)
 605                 raise
 606             try:
 607                 self.parentfd = os.open(objdir, os.O_RDONLY)
 608             except:
 609                 f = self.file
 610                 self.file = None
 611                 f.close()
 612                 raise
 613             assert(name.endswith('.pack'))
 614             self.filename = name[:-5]
 615             self.file.write('PACK\0\0\0\2\0\0\0\0')
 616             self.idx = list(list() for i in xrange(256))
 617
 618     def _raw_write(self, datalist, sha):
 619         self._open()
 620         f = self.file
 621         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 622         # the file never has a *partial* blob.  So let's make sure it's
 623         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 624         # to our hashsplit algorithm.)  f.write() does its own buffering,
 625         # but that's okay because we'll flush it in _end().
 626         oneblob = ''.join(datalist)
 627         try:
 628             f.write(oneblob)
 629         except IOError as e:
 630             raise GitError, e, sys.exc_info()[2]
 631         nw = len(oneblob)
 632         crc = zlib.crc32(oneblob) & 0xffffffff
 633         self._update_idx(sha, crc, nw)
 634         self.outbytes += nw
 635         self.count += 1
 636         return nw, crc
 637
 638     def _update_idx(self, sha, crc, size):
 639         assert(sha)
 640         if self.idx:
 641             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 642
 643     def _write(self, sha, type, content):
 644         if verbose:
 645             log('>')
 646         if not sha:
 647             sha = calc_hash(type, content)
 648         size, crc = self._raw_write(_encode_packobj(type, content,
 649                                                     self.compression_level),
 650                                     sha=sha)
 651         if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
 652             self.breakpoint()
 653         return sha
 654
 655     def breakpoint(self):
 656         """Clear byte and object counts and return the last processed id."""
 657         id = self._end()
 658         self.outbytes = self.count = 0
 659         return id
 660
 661     def _require_objcache(self):
 662         if self.objcache is None and self.objcache_maker:
 663             self.objcache = self.objcache_maker()
 664         if self.objcache is None:
 665             raise GitError(
 666                     "PackWriter not opened or can't check exists w/o objcache")
 667
 668     def exists(self, id, want_source=False):
 669         """Return non-empty if an object is found in the object cache."""
 670         self._require_objcache()
 671         return self.objcache.exists(id, want_source=want_source)
 672
 673     def maybe_write(self, type, content):
 674         """Write an object to the pack file if not present and return its id."""
 675         sha = calc_hash(type, content)
 676         if not self.exists(sha):
 677             self._write(sha, type, content)
 678             self._require_objcache()
 679             self.objcache.add(sha)
 680         return sha
 681
 682     def new_blob(self, blob):
 683         """Create a blob object in the pack with the supplied content."""
 684         return self.maybe_write('blob', blob)
 685
 686     def new_tree(self, shalist):
 687         """Create a tree object in the pack."""
 688         content = tree_encode(shalist)
 689         return self.maybe_write('tree', content)
 690
 691     def new_commit(self, tree, parent,
 692                    author, adate_sec, adate_tz,
 693                    committer, cdate_sec, cdate_tz,
 694                    msg):
 695         """Create a commit object in the pack.  The date_sec values must be
 696         epoch-seconds, and if a tz is None, the local timezone is assumed."""
 697         if adate_tz:
 698             adate_str = _git_date_str(adate_sec, adate_tz)
 699         else:
 700             adate_str = _local_git_date_str(adate_sec)
 701         if cdate_tz:
 702             cdate_str = _git_date_str(cdate_sec, cdate_tz)
 703         else:
 704             cdate_str = _local_git_date_str(cdate_sec)
 705         l = []
 706         if tree: l.append('tree %s' % tree.encode('hex'))
 707         if parent: l.append('parent %s' % parent.encode('hex'))
 708         if author: l.append('author %s %s' % (author, adate_str))
 709         if committer: l.append('committer %s %s' % (committer, cdate_str))
 710         l.append('')
 711         l.append(msg)
 712         return self.maybe_write('commit', '\n'.join(l))
 713
 714     def abort(self):
 715         """Remove the pack file from disk."""
 716         f = self.file
 717         if f:
 718             pfd = self.parentfd
 719             self.file = None
 720             self.parentfd = None
 721             self.idx = None
 722             try:
 723                 try:
 724                     os.unlink(self.filename + '.pack')
 725                 finally:
 726                     f.close()
 727             finally:
 728                 if pfd is not None:
 729                     os.close(pfd)
 730
 731     def _end(self, run_midx=True):
 732         f = self.file
 733         if not f: return None
 734         self.file = None
 735         try:
 736             self.objcache = None
 737             idx = self.idx
 738             self.idx = None
 739
 740             # update object count
 741             f.seek(8)
 742             cp = struct.pack('!i', self.count)
 743             assert(len(cp) == 4)
 744             f.write(cp)
 745
 746             # calculate the pack sha1sum
 747             f.seek(0)
 748             sum = Sha1()
 749             for b in chunkyreader(f):
 750                 sum.update(b)
 751             packbin = sum.digest()
 752             f.write(packbin)
 753             fdatasync(f.fileno())
 754         finally:
 755             f.close()
 756
 757         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 758
 759         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 760         if os.path.exists(self.filename + '.map'):
 761             os.unlink(self.filename + '.map')
 762         os.rename(self.filename + '.pack', nameprefix + '.pack')
 763         os.rename(self.filename + '.idx', nameprefix + '.idx')
 764         try:
 765             os.fsync(self.parentfd)
 766         finally:
 767             os.close(self.parentfd)
 768
 769         if run_midx:
 770             auto_midx(repo('objects/pack'))
 771         return nameprefix
 772
 773     def close(self, run_midx=True):
 774         """Close the pack file and move it to its definitive path."""
 775         return self._end(run_midx=run_midx)
 776
 777     def _write_pack_idx_v2(self, filename, idx, packbin):
 778         ofs64_count = 0
 779         for section in idx:
 780             for entry in section:
 781                 if entry[2] >= 2**31:
 782                     ofs64_count += 1
 783
 784         # Length: header + fan-out + shas-and-crcs + overflow-offsets
 785         index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
 786         idx_map = None
 787         idx_f = open(filename, 'w+b')
 788         try:
 789             idx_f.truncate(index_len)
 790             fdatasync(idx_f.fileno())
 791             idx_map = mmap_readwrite(idx_f, close=False)
 792             try:
 793                 count = _helpers.write_idx(filename, idx_map, idx, self.count)
 794                 assert(count == self.count)
 795                 idx_map.flush()
 796             finally:
 797                 idx_map.close()
 798         finally:
 799             idx_f.close()
 800
 801         idx_f = open(filename, 'a+b')
 802         try:
 803             idx_f.write(packbin)
 804             idx_f.seek(0)
 805             idx_sum = Sha1()
 806             b = idx_f.read(8 + 4*256)
 807             idx_sum.update(b)
 808
 809             obj_list_sum = Sha1()
 810             for b in chunkyreader(idx_f, 20*self.count):
 811                 idx_sum.update(b)
 812                 obj_list_sum.update(b)
 813             namebase = obj_list_sum.hexdigest()
 814
 815             for b in chunkyreader(idx_f):
 816                 idx_sum.update(b)
 817             idx_f.write(idx_sum.digest())
 818             fdatasync(idx_f.fileno())
 819             return namebase
 820         finally:
 821             idx_f.close()
 822
 823
 824 def _gitenv(repo_dir = None):
 825     if not repo_dir:
 826         repo_dir = repo()
 827     def env():
 828         os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
 829     return env
 830
 831
 832 def list_refs(refname=None, repo_dir=None,
 833               limit_to_heads=False, limit_to_tags=False):
 834     """Yield (refname, hash) tuples for all repository refs unless a ref
 835     name is specified.  Given a ref name, only include tuples for that
 836     particular ref.  The limits restrict the result items to
 837     refs/heads or refs/tags.  If both limits are specified, items from
 838     both sources will be included.
 839
 840     """
 841     argv = ['git', 'show-ref']
 842     if limit_to_heads:
 843         argv.append('--heads')
 844     if limit_to_tags:
 845         argv.append('--tags')
 846     argv.append('--')
 847     if refname:
 848         argv += [refname]
 849     p = subprocess.Popen(argv,
 850                          preexec_fn = _gitenv(repo_dir),
 851                          stdout = subprocess.PIPE)
 852     out = p.stdout.read().strip()
 853     rv = p.wait()  # not fatal
 854     if rv:
 855         assert(not out)
 856     if out:
 857         for d in out.split('\n'):
 858             (sha, name) = d.split(' ', 1)
 859             yield (name, sha.decode('hex'))
 860
 861
 862 def read_ref(refname, repo_dir = None):
 863     """Get the commit id of the most recent commit made on a given ref."""
 864     refs = list_refs(refname, repo_dir=repo_dir, limit_to_heads=True)
 865     l = tuple(islice(refs, 2))
 866     if l:
 867         assert(len(l) == 1)
 868         return l[0][1]
 869     else:
 870         return None
 871
 872
 873 def rev_list(ref, count=None, repo_dir=None):
 874     """Generate a list of reachable commits in reverse chronological order.
 875
 876     This generator walks through commits, from child to parent, that are
 877     reachable via the specified ref and yields a series of tuples of the form
 878     (date,hash).
 879
 880     If count is a non-zero integer, limit the number of commits to "count"
 881     objects.
 882     """
 883     assert(not ref.startswith('-'))
 884     opts = []
 885     if count:
 886         opts += ['-n', str(atoi(count))]
 887     argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
 888     p = subprocess.Popen(argv,
 889                          preexec_fn = _gitenv(repo_dir),
 890                          stdout = subprocess.PIPE)
 891     commit = None
 892     for row in p.stdout:
 893         s = row.strip()
 894         if s.startswith('commit '):
 895             commit = s[7:].decode('hex')
 896         else:
 897             date = int(s)
 898             yield (date, commit)
 899     rv = p.wait()  # not fatal
 900     if rv:
 901         raise GitError, 'git rev-list returned error %d' % rv
 902
 903
 904 def get_commit_dates(refs, repo_dir=None):
 905     """Get the dates for the specified commit refs.  For now, every unique
 906        string in refs must resolve to a different commit or this
 907        function will fail."""
 908     result = []
 909     for ref in refs:
 910         commit = get_commit_items(ref, cp(repo_dir))
 911         result.append(commit.author_sec)
 912     return result
 913
 914
 915 def rev_parse(committish, repo_dir=None):
 916     """Resolve the full hash for 'committish', if it exists.
 917
 918     Should be roughly equivalent to 'git rev-parse'.
 919
 920     Returns the hex value of the hash if it is found, None if 'committish' does
 921     not correspond to anything.
 922     """
 923     head = read_ref(committish, repo_dir=repo_dir)
 924     if head:
 925         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
 926         return head
 927
 928     pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
 929
 930     if len(committish) == 40:
 931         try:
 932             hash = committish.decode('hex')
 933         except TypeError:
 934             return None
 935
 936         if pL.exists(hash):
 937             return hash
 938
 939     return None
 940
 941
 942 def update_ref(refname, newval, oldval, repo_dir=None):
 943     """Update a repository reference."""
 944     if not oldval:
 945         oldval = ''
 946     assert(refname.startswith('refs/heads/') \
 947            or refname.startswith('refs/tags/'))
 948     p = subprocess.Popen(['git', 'update-ref', refname,
 949                           newval.encode('hex'), oldval.encode('hex')],
 950                          preexec_fn = _gitenv(repo_dir))
 951     _git_wait('git update-ref', p)
 952
 953
 954 def delete_ref(refname):
 955     """Delete a repository reference."""
 956     assert(refname.startswith('refs/'))
 957     p = subprocess.Popen(['git', 'update-ref', '-d', refname],
 958                          preexec_fn = _gitenv())
 959     _git_wait('git update-ref', p)
 960
 961
 962 def guess_repo(path=None):
 963     """Set the path value in the global variable "repodir".
 964     This makes bup look for an existing bup repository, but not fail if a
 965     repository doesn't exist. Usually, if you are interacting with a bup
 966     repository, you would not be calling this function but using
 967     check_repo_or_die().
 968     """
 969     global repodir
 970     if path:
 971         repodir = path
 972     if not repodir:
 973         repodir = os.environ.get('BUP_DIR')
 974         if not repodir:
 975             repodir = os.path.expanduser('~/.bup')
 976
 977
 978 def init_repo(path=None):
 979     """Create the Git bare repository for bup in a given path."""
 980     guess_repo(path)
 981     d = repo()  # appends a / to the path
 982     parent = os.path.dirname(os.path.dirname(d))
 983     if parent and not os.path.exists(parent):
 984         raise GitError('parent directory "%s" does not exist\n' % parent)
 985     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
 986         raise GitError('"%s" exists but is not a directory\n' % d)
 987     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
 988                          preexec_fn = _gitenv())
 989     _git_wait('git init', p)
 990     # Force the index version configuration in order to ensure bup works
 991     # regardless of the version of the installed Git binary.
 992     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
 993                          stdout=sys.stderr, preexec_fn = _gitenv())
 994     _git_wait('git config', p)
 995     # Enable the reflog
 996     p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
 997                          stdout=sys.stderr, preexec_fn = _gitenv())
 998     _git_wait('git config', p)
 999
1000
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.

    The repository location is settled with guess_repo(path) and then
    'objects/pack/.' inside it is stat()ed.  If that does not exist, an
    error is logged and the process exits with status 15; any other
    OSError is logged and the process exits with status 14.
    """
    guess_repo(path)
    try:
        os.stat(repo('objects/pack/.'))
    except OSError as e:
        if e.errno != errno.ENOENT:
            log('error: %s\n' % e)
            sys.exit(14)
        log('error: %r is not a bup repository; run "bup init"\n'
            % repo())
        sys.exit(15)
1017
1018
_ver = None

def _ver_numbers(parts):
    """Return the leading numeric components of a version tuple as ints.

    Conversion stops at the first component that does not start with a
    digit.  A component with a numeric prefix followed by other text
    (e.g. '0-rc1') contributes its prefix and then ends the conversion.
    """
    numbers = []
    for part in parts:
        m = re.match(r'\d+', part)
        if not m:
            break
        numbers.append(int(m.group(0)))
        if m.end() != len(part):
            break
    return tuple(numbers)


def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')

    The version is obtained from 'git --version' on the first call and
    cached in the module-level _ver afterwards.

    Raises GitError if the output of 'git --version' is not recognised
    or if the installed version is older than 1.5.3.1.
    """
    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        m = re.match(r'git version (\S+.\S+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    # Compare numerically: as strings, '10' sorts before '5', which would
    # wrongly reject a version such as 1.10.0.
    if _ver_numbers(_ver) < _ver_numbers(needed):
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver
1044
1045
def _git_wait(cmd, p):
    """Wait for process p and raise GitError if it did not exit with 0.

    cmd is only used to label the error message.
    """
    status = p.wait()
    if status == 0:
        return
    raise GitError('%s returned %d' % (cmd, status))
1050
1051
def _git_capture(argv):
    """Run argv and return everything it wrote to stdout.

    The child gets its GIT_DIR from _gitenv() called with no argument.
    GitError is raised (via _git_wait) on a non-zero exit status, with
    repr(argv) as the command label.
    """
    proc = subprocess.Popen(argv,
                            stdout=subprocess.PIPE,
                            preexec_fn = _gitenv())
    captured = proc.stdout.read()
    _git_wait(repr(argv), proc)
    return captured
1057
1058
class _AbortableIter:
    """Iterator wrapper that runs a callback if iteration ends abnormally.

    'it' is the wrapped iterator and 'onabort' an optional callable.
    onabort is invoked at most once: when the wrapped iterator raises
    anything other than StopIteration, when abort() is called, or when
    this object is deleted -- but never after the wrapped iterator has
    been exhausted normally.
    """
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        # Becomes True after normal exhaustion or after an abort.
        self.done = None

    def __iter__(self):
        return self

    def next(self):
        """Return the next item of the wrapped iterator."""
        try:
            item = self.it.next()
        except StopIteration:
            # Normal end: mark as finished so abort() becomes a no-op.
            self.done = True
            raise
        except:
            self.abort()
            raise
        return item

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if self.done:
            return
        self.done = True
        if self.onabort:
            self.onabort()

    def __del__(self):
        self.abort()
1087
1088
_ver_warned = 0
class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data.

    Depending on the installed git version, self.get is bound either to
    _fast_get (one long-running 'git cat-file --batch' process) or to
    _slow_get (separate 'git cat-file' processes per object).  Both are
    generators that yield the object's type string first and then the
    object's data in chunks.
    """
    def __init__(self, repo_dir = None):
        global _ver_warned
        self.repo_dir = repo_dir
        wanted = ('1','5','6')
        if ver() < wanted:
            # Warn only once per process, however many CatPipes exist.
            if not _ver_warned:
                log('warning: git version < %s; bup will be slow.\n'
                    % '.'.join(wanted))
                _ver_warned = 1
            self.get = self._slow_get
        else:
            self.p = self.inprogress = None
            self.get = self._fast_get

    def _abort(self):
        """Close the pipes of the batch process, if any, and forget it."""
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        """Drop any current batch process and start a fresh one."""
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        """Yield the type and then the data of object 'id' via --batch.

        Only one object may be in progress at a time.  Raises KeyError
        if git reports the object as missing and GitError if the reply
        header is not of the expected "<40-char id> <type> <size>" form.
        """
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            # The reply is not understood, so the stream position can no
            # longer be trusted; drop the process so that the next
            # request starts a fresh one instead of tripping over a
            # stale 'inprogress'.
            self._abort()
            raise GitError('expected blob, got %r' % spl)
        (oidx, objtype, size) = spl

        it = _AbortableIter(chunkyreader(self.p.stdout, int(size)),
                           onabort = self._abort)
        try:
            yield objtype
            for blob in it:
                yield blob
            # The object data is followed by a single newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        """Yield the type and then the data of object 'id'.

        Runs 'git cat-file -t' and then 'git cat-file <type>' as separate
        processes, both against self.repo_dir.  Raises GitError (via
        _git_wait) if either exits with a non-zero status.
        """
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # Query the type in the same repository the content is read from
        # below; _git_capture() would use the default repository instead.
        argv = ['git', 'cat-file', '-t', id]
        p = subprocess.Popen(argv,
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        objtype = p.stdout.read().strip()
        _git_wait(repr(argv), p)
        yield objtype

        p = subprocess.Popen(['git', 'cat-file', objtype, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        """Yield the blob data reachable from the object produced by 'it'.

        'it' is an iterator as returned by self.get(): type first, then
        data.  Trees and commits are followed recursively through
        self.join().  Raises GitError for any other object type.
        """
        objtype = it.next()
        if objtype == 'blob':
            for blob in it:
                yield blob
        elif objtype == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif objtype == 'commit':
            # The first line of a commit names its tree.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % objtype)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')
1204
1205
_cp = {}

def cp(repo_dir=None):
    """Return the CatPipe for repo_dir, creating and caching it if needed.

    Pipes are cached in the module-level _cp dictionary, keyed by the
    absolute path of the repository; repo() supplies the directory when
    repo_dir is not given.
    """
    global _cp
    key = os.path.abspath(repo_dir or repo())
    pipe = _cp.get(key)
    if not pipe:
        pipe = _cp[key] = CatPipe(key)
    return pipe
1219
1220
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.

    Tag names have their 'refs/tags/' prefix removed; the hashes are
    whatever list_refs() yields for each tag.
    """
    prefix_len = len('refs/tags/')
    names_by_hash = {}
    for refname, sha in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(refname.startswith('refs/tags/'))
        # more than one tag can point at the same hash
        names_by_hash.setdefault(sha, []).append(refname[prefix_len:])
    return names_by_hash