1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
12 from numbers import Integral
14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
15 from bup.compat import (buffer,
16 byte_int, bytes_from_byte, bytes_from_uint,
21 from bup.io import path_msg
22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
25 hostname, localtime, log,
28 mmap_read, mmap_readwrite,
30 progress, qprogress, stat_if_exists,
33 from bup.pwdgrp import username, userfullname
37 repodir = None # The default repository, once initialized
39 _typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
40 _typermap = {v: k for k, v in items(_typemap)}
class GitError(Exception):
    pass


def _gitenv(repo_dir=None):
    if not repo_dir:
        repo_dir = repo()
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})


def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%r returned %d' % (cmd, rv))


def _git_exo(cmd, **kwargs):
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    _, _, proc = result
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
    return result

def git_config_get(option, repo_dir=None):
    cmd = (b'git', b'config', b'--get', option)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         env=_gitenv(repo_dir=repo_dir))
    r = p.stdout.read()
    rc = p.wait()
    if rc == 0:
        return r
    elif rc != 1:
        raise GitError('%r returned %d' % (cmd, rc))
    return None

def parse_tz_offset(s):
    """UTC offset in seconds."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        return - tz_off
    return tz_off

# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])

def parse_commit(content):
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])

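
# Illustrative sketch (not part of bup's API): parse_commit() applied to a
# minimal, fabricated commit object.  The tree id and identity are made up.
def _example_parse_commit():
    commit = (b'tree 0123456789abcdef0123456789abcdef01234567\n'
              b'author A U Thor <author@example.com> 1577836800 +0000\n'
              b'committer A U Thor <author@example.com> 1577836800 +0000\n'
              b'\n'
              b'a commit message\n')
    info = parse_commit(commit)
    assert info.author_sec == 1577836800
    assert info.committer_offset == 0
    assert info.message == b'a commit message\n'
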
def get_cat_data(cat_iterator, expected_type):
    _, kind, _ = next(cat_iterator)
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    return b''.join(cat_iterator)


def get_commit_items(id, cp):
    return parse_commit(get_cat_data(cp.get(id), b'commit'))


def _local_git_date_str(epoch_sec):
    return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))

def _git_date_str(epoch_sec, tz_offset_sec):
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        % (epoch_sec,
           b'+' if offs >= 0 else b'-',
           abs(offs) // 60,
           abs(offs) % 60)

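
# Illustrative check (fabricated timestamps): UTC+2 is 7200 seconds east of
# UTC, which git renders as "+0200"; an hour west renders as "-0100".
def _example_git_date_str():
    assert _git_date_str(1577836800, 7200) == b'1577836800 +0200'
    assert _git_date_str(1577836800, -3600) == b'1577836800 -0100'
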
def repo(sub=b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
        repo_dir = gd

    return os.path.join(repo_dir, sub)

_shorten_hash_rx = \
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')

def shorten_hash(s):
    return _shorten_hash_rx.sub(br'\1\2*\3', s)


def repo_rel(path):
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
        fullrepo += b'/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)

def all_packdirs():
    paths = [repo(b'objects/pack')]
    paths += glob.glob(repo(b'index-cache/*/.'))
    return paths

def auto_midx(objdir):
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 1
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 1
    if rv:
        add_error('%r: returned %d' % (args, rv))

def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    else:
        return name

(BUP_NORMAL, BUP_CHUNKED) = (0, 1)

def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is
    one of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name().
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    else:
        return (name, BUP_NORMAL)

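
# Illustrative sketch: mangle_name()/demangle_name() round trips.  The git
# modes come from bup.hashsplit (GIT_MODE_FILE is a regular file,
# GIT_MODE_TREE a directory).
def _example_name_mangling():
    # A regular file stored as a chunked subtree gets a '.bup' suffix...
    assert mangle_name(b'data', hashsplit.GIT_MODE_FILE,
                       hashsplit.GIT_MODE_TREE) == b'data.bup'
    # ...and demangling recovers the original name and the chunking mode.
    assert demangle_name(b'data.bup', hashsplit.GIT_MODE_TREE) \
        == (b'data', BUP_CHUNKED)
    assert demangle_name(b'plain', hashsplit.GIT_MODE_FILE) \
        == (b'plain', BUP_NORMAL)
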
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = b'%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()

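
# Illustrative check: calc_hash() agrees with `git hash-object`; the empty
# blob has a well-known id.
def _example_calc_hash():
    assert hexlify(calc_hash(b'blob', b'')) \
        == b'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
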
def shalist_item_sort_key(ent):
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
        return name + b'/'
    else:
        return name

def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key=shalist_item_sort_key)
    l = []
    for (mode, name, bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode, name, bin)
        # 0-padded octal is not acceptable in a git tree; use startswith()
        # since under Python 3 s[0] is an int and would never equal b'0'.
        assert not s.startswith(b'0')
        l.append(s)
    return b''.join(l)

def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        mode, name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)

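
# Illustrative sketch: a tree_encode()/tree_decode() round trip for a
# single-entry tree (the 20-byte object id is fabricated).
def _example_tree_roundtrip():
    entry = (0o100644, b'README', b'\xab' * 20)
    assert list(tree_decode(tree_encode([entry]))) == [entry]
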
def _encode_packobj(type, content, compression_level=1):
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = b''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += bytes_from_uint(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()

def _encode_looseobj(type, content, compression_level=1):
    z = zlib.compressobj(compression_level)
    yield z.compress(b'%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()

def _decode_looseobj(buf):
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find(b'\0')
    assert(i > 0)
    l = s[:i].split(b' ')
    assert(len(l) == 2)
    assert(l[1].isdigit())
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)

def _decode_packobj(buf):
    assert(buf)
    c = byte_int(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = byte_int(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))

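
# Illustrative sketch: pack-object encoding round trips through
# _encode_packobj()/_decode_packobj().
def _example_packobj_roundtrip():
    packed = b''.join(_encode_packobj(b'blob', b'hello'))
    assert _decode_packobj(packed) == (b'blob', b'hello')
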
_total_searches = 0
_total_steps = 0


class PackIdx:
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx != None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True
        return None

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1]  # range -1..254
        end = self.fanout[b1]      # range 0..255
        want = hash
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end - start) // 2
            v = self._idx_to_hash(mid)
            if v < want:
                start = mid+1
            elif v > want:
                end = mid
            else:  # got it!
                return mid
        return None

class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I';
        # unpack_from() reads just the fanout, not the whole mmap.
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=0))
        self.fanout.append(0)  # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
        return ofs

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

_mpi_count = 0

class PackIdxList:
    def __init__(self, dir, ignore_midx=False):
        global _mpi_count
        assert(_mpi_count == 0)  # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx=False):
        """Refresh the index list.
        This method checks whether any .midx files have been superseded
        (e.g. all of their contents are in another, bigger .midx file) and
        removes the superseded files.

        If skip_midx is True, all work on .midx files will be skipped and
        .midx files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None  # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs) != 1 and 'es' or ''))

623 """Insert an additional object in the list."""
def open_idx(filename):
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')

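
# Illustrative sketch (hypothetical path and id): querying a pack index for
# an object.
def _example_open_idx():
    ix = open_idx(b'/repo/objects/pack/pack-1234.idx')
    oid = unhexlify(b'0123456789abcdef0123456789abcdef01234567')
    if ix.exists(oid):
        print('found at pack offset %d' % ix.find_offset(oid))
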
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)

def _make_objcache():
    return PackIdxList(repo(b'objects/pack'))


# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.

class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.repo_dir = repo_dir or repo()
        self.file = None
        self.parentfd = None
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.idx = None
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level
        self.run_midx = run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir)
            if max_pack_size is not None:
                max_pack_size = parse_num(max_pack_size)
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def _open(self):
        if not self.file:
            objdir = dir = os.path.join(self.repo_dir, b'objects')
            fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                os.close(fd)
                raise
            try:
                self.parentfd = os.open(objdir, os.O_RDONLY)
            except:
                f = self.file
                self.file = None
                f.close()
                raise
            assert name.endswith(b'.pack')
            self.filename = name[:-5]
            self.file.write(b'PACK\0\0\0\2\0\0\0\0')
            self.idx = PackIdxV2Writer()

    def _raw_write(self, datalist, sha):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            reraise(GitError(e))
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        assert(sha)
        if self.idx:
            self.idx.add(sha, crc, self.file.tell() - size)

    def _write(self, sha, type, content):
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def just_write(self, sha, type, content):
        """Write an object to the pack file without checking for duplication."""
        self._write(sha, type, content)
        # If nothing else, gc doesn't have/want an objcache
        if self.objcache is not None:
            self.objcache.add(sha)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write(b'blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write(b'tree', content)

    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
                   msg):
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        if adate_tz is not None:
            adate_str = _git_date_str(adate_sec, adate_tz)
        else:
            adate_str = _local_git_date_str(adate_sec)
        if cdate_tz is not None:
            cdate_str = _git_date_str(cdate_sec, cdate_tz)
        else:
            cdate_str = _local_git_date_str(cdate_sec)
        l = []
        if tree: l.append(b'tree %s' % hexlify(tree))
        if parent: l.append(b'parent %s' % hexlify(parent))
        if author: l.append(b'author %s %s' % (author, adate_str))
        if committer: l.append(b'committer %s %s' % (committer, cdate_str))
        l.append(b'')
        l.append(msg)
        return self.maybe_write(b'commit', b'\n'.join(l))

831 """Remove the pack file from disk."""
840 os.unlink(self.filename + b'.pack')
    def _end(self, run_midx=True):
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        obj_list_sha = idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' + obj_list_sha)
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

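
# Illustrative sketch of the PackWriter lifecycle: write one blob, then let
# close() finish the pack and rename it into objects/pack.  Assumes
# check_repo_or_die() has already located a repository.
def _example_packwriter():
    with PackWriter() as w:
        oid = w.new_blob(b'some file contents')
        print('stored blob %s' % hexlify(oid).decode('ascii'))
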
class PackIdxV2Writer:
    def __init__(self):
        self.idx = list(list() for i in range(256))
        self.count = 0

    def add(self, sha, crc, offs):
        assert(sha)
        self.count += 1
        self.idx[byte_int(sha[0])].append((sha, crc, offs))

    def write(self, filename, packbin):
        ofs64_count = 0
        for section in self.idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, self.idx,
                                           self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20 * self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = hexlify(obj_list_sum.digest())

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()

def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.

    """
    argv = [b'git', b'show-ref']
    if limit_to_heads:
        argv.append(b'--heads')
    if limit_to_tags:
        argv.append(b'--tags')
    argv.append(b'--')
    if patterns:
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split(b'\n'):
            sha, name = d.split(b' ', 1)
            yield name, unhexlify(sha)

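
# Illustrative sketch: enumerate all heads, e.g. to list branches/saves.
def _example_list_refs():
    for name, sha in list_refs(limit_to_heads=True):
        print(path_msg(name), hexlify(sha).decode('ascii'))
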
def read_ref(refname, repo_dir=None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir,
                     limit_to_heads=True)
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    return None

def rev_list_invocation(ref_or_refs, format=None):
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    argv = [b'git', b'rev-list']

    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    argv.append(b'--')
    return argv

def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.

    """
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                                             format=format),
                         env=_gitenv(repo_dir),
                         stdout=subprocess.PIPE)
    if not format:
        for line in p.stdout:
            yield line.strip()
    else:
        line = p.stdout.readline()
        while line:
            s = line.strip()
            if not s.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(s))
            s = s[7:]
            assert len(s) == 40
            yield s, parse(p.stdout)
            line = p.stdout.readline()

    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)

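
# Illustrative sketch: walk a (hypothetical) branch, using the parse/format
# contract described above to read each commit's author timestamp from the
# rev-list output stream.
def _example_rev_list():
    def parse(stream):
        return int(stream.readline().strip())
    for oidx, author_sec in rev_list(b'refs/heads/main', parse=parse,
                                     format=b'%at'):
        print(oidx.decode('ascii'), author_sec)
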
def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs. For now, every unique
    string in refs must resolve to a different commit or this
    function will fail."""
    result = []
    for ref in refs:
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
    return result

def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the binary hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            hash = unhexlify(committish)
        except TypeError:
            return None

        if pL.exists(hash):
            return hash

    return None

def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    if not oldval:
        oldval = b''
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir))
    _git_wait(b'git update-ref', p)

def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
                         env=_gitenv())
    _git_wait('git update-ref', p)

def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die()."""
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = environ.get(b'BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser(b'~/.bup')

def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv())
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)

def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % top)
            sys.exit(13)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)

def is_suitable_git(ver_str):
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    return 'unrecognized'

_git_great = None

def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.

    """
    global _git_great
    if _git_great is not None:
        return
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    if status == 'suitable':
        _git_great = True
        return
    assert False

class _AbortableIter:
    def __init__(self, it, onabort=None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.it)
        except StopIteration as e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    next = __next__

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()

1243 """Link to 'git cat-file' that is used to retrieve blob data."""
1244 def __init__(self, repo_dir = None):
1245 require_suitable_git()
1246 self.repo_dir = repo_dir
1247 self.p = self.inprogress = None
1251 self.p.stdout.close()
1252 self.p.stdin.close()
1254 self.inprogress = None
1258 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1259 stdin=subprocess.PIPE,
1260 stdout=subprocess.PIPE,
1263 env=_gitenv(self.repo_dir))
1266 """Yield (oidx, type, size), followed by the data referred to by ref.
1267 If ref does not exist, only yield (None, None, None).
1270 if not self.p or self.p.poll() != None:
1273 poll_result = self.p.poll()
1274 assert(poll_result == None)
1276 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1277 assert(not self.inprogress)
1278 assert ref.find(b'\n') < 0
1279 assert ref.find(b'\r') < 0
1280 assert not ref.startswith(b'-')
1281 self.inprogress = ref
1282 self.p.stdin.write(ref + b'\n')
1283 self.p.stdin.flush()
1284 hdr = self.p.stdout.readline()
1285 if hdr.endswith(b' missing\n'):
1286 self.inprogress = None
1287 yield None, None, None
1289 info = hdr.split(b' ')
1290 if len(info) != 3 or len(info[0]) != 40:
1291 raise GitError('expected object (id, type, size), got %r' % info)
1292 oidx, typ, size = info
1294 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1295 onabort=self._abort)
1297 yield oidx, typ, size
1300 readline_result = self.p.stdout.readline()
1301 assert readline_result == b'\n'
1302 self.inprogress = None
1303 except Exception as e:
    def _join(self, it):
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d

_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp, repodir
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp

def tags(repo_dir=None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir=repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[len(b'refs/tags/'):]
        if not c in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags

class MissingObject(KeyError):
    def __init__(self, oid):
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))

WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
# chunked file will have a chunk_path of ['']. So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...

def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true. Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))

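
# Illustrative sketch: enumerate everything reachable from a (hypothetical)
# branch head without loading blob data.
def _example_walk_object():
    catpipe = cp()
    head = rev_parse(b'main')  # hypothetical branch name
    if head:
        for item in walk_object(catpipe.get, hexlify(head),
                                include_data=None):
            print(item.type.decode('ascii'), path_msg(b'/'.join(item.path)))
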