1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
from __future__ import absolute_import, print_function
import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
from array import array
from binascii import hexlify, unhexlify
from collections import namedtuple
from itertools import islice

from bup import _helpers, hashsplit, path, midx, bloom, xstat
from bup.compat import (buffer,
                        byte_int, bytes_from_byte, bytes_from_uint,
                        environ, items, pending_raise, range, reraise)
from bup.io import path_msg
from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
                         exo, fdatasync, log, merge_dict, merge_iter,
                         mmap_read, mmap_readwrite,
                         progress, qprogress, stat_if_exists,
                         unlink, utc_offset_str)

repodir = None  # The default repository, once initialized

_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}

_total_searches = 0
_total_steps = 0

verbose = 0


class GitError(Exception):
    pass

def _gitenv(repo_dir=None):
    if not repo_dir:
        repo_dir = repo()
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})

def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%r returned %d' % (cmd, rv))

def _git_exo(cmd, **kwargs):
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    _, _, proc = result
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
    return result

def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
    assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
    cmd = [b'git', b'config', b'--null']
    if cfg_file:
        cmd.extend([b'--file', cfg_file])
    if opttype == 'int':
        cmd.extend([b'--int'])
    elif opttype == 'bool':
        cmd.extend([b'--bool'])
    else:
        assert opttype is None
    cmd.extend([b'--get', option])
    env = None
    if repo_dir:
        env = _gitenv(repo_dir=repo_dir)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
                         close_fds=True)
    # with --null, git writes out a trailing \0 after the value
    r = p.stdout.read()[:-1]
    rc = p.wait()
    if rc == 0:
        if opttype == 'int':
            return int(r)
        elif opttype == 'bool':
            # git converts to 'true' or 'false'
            return r == b'true'
        return r
    elif rc != 1:
        raise GitError('%r returned %d' % (cmd, rc))
    return None

def parse_tz_offset(s):
    """UTC offset in seconds."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        return - tz_off
    return tz_off

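# Illustrative examples (not from the original docs): a git timezone
# field like b'-0500' parses to -5 * 3600 seconds:
#
#   parse_tz_offset(b'-0500')  # => -18000
#   parse_tz_offset(b'+0130')  # => 5400
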
def parse_commit_gpgsig(sig):
    """Return the original signature bytes.

    i.e. with the "gpgsig " header and the leading space character on
    each continuation line removed.

    """
    if not sig:
        return None
    assert sig.startswith(b'gpgsig ')
    sig = sig[7:]
    return sig.replace(b'\n ', b'\n')

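# Illustrative example: a signature stored in a commit as
#   b'gpgsig -----BEGIN PGP SIGNATURE-----\n abc\n -----END PGP SIGNATURE-----'
# comes back with the header and the continuation-line spaces removed:
#   b'-----BEGIN PGP SIGNATURE-----\nabc\n-----END PGP SIGNATURE-----'
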
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# See also
# https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
# The continuation lines have only one leading space.

_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'gpgsig', 'message'])

def parse_commit(content):
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      gpgsig=parse_commit_gpgsig(matches['gpgsig']),
                      message=matches['message'])

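# Illustrative example (the tree hash below is git's well-known empty
# tree; author details are made up):
#
#   info = parse_commit(
#       b'tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\n'
#       b'author A U Thor <a@example.com> 1234567890 +0000\n'
#       b'committer A U Thor <a@example.com> 1234567890 +0000\n'
#       b'\n'
#       b'initial commit\n')
#   info.tree        # => b'4b825dc642cb6eb9a060e54bf8d69288fbee4904'
#   info.parents     # => []
#   info.author_sec  # => 1234567890
#   info.message     # => b'initial commit\n'
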
def get_cat_data(cat_iterator, expected_type):
    _, kind, _ = next(cat_iterator)
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    return b''.join(cat_iterator)

def get_commit_items(id, cp):
    return parse_commit(get_cat_data(cp.get(id), b'commit'))

def _local_git_date_str(epoch_sec):
    return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))

def _git_date_str(epoch_sec, tz_offset_sec):
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        % (epoch_sec,
           b'+' if offs >= 0 else b'-',
           abs(offs) // 60,
           abs(offs) % 60)

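# Illustrative example: an offset of -18000 seconds (UTC-5) renders in
# git's "epoch offset" form:
#
#   _git_date_str(1234567890, -18000)  # => b'1234567890 -0500'
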
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
        repo_dir = gd

    return os.path.join(repo_dir, sub)

_shorten_hash_rx = \
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')

def shorten_hash(s):
    return _shorten_hash_rx.sub(br'\1\2*\3', s)

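# Illustrative example: any 40-character hex string embedded in s is
# abbreviated to its first seven characters plus '*':
#
#   shorten_hash(b'pack-0123456789abcdef0123456789abcdef01234567.idx')
#   # => b'pack-0123456*.idx'
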
def repo_rel(path):
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
        fullrepo += b'/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)

def auto_midx(objdir):
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 1
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 1
    if rv:
        add_error('%r: returned %d' % (args, rv))

def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    else:
        return name

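# Illustrative examples: a regular file stored as a chunked tree gets
# ".bup" appended, while a name that could be mistaken for a mangled
# one gets ".bupl":
#
#   mangle_name(b'photo.jpg', 0o100644, 0o040000)  # => b'photo.jpg.bup'
#   mangle_name(b'foo.bup', 0o100644, 0o100644)    # => b'foo.bupl'
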
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one
    of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)

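# Illustrative round trip: demangling undoes mangle_name and reports
# how the content should be read back:
#
#   demangle_name(b'photo.jpg.bup', 0o100644)  # => (b'photo.jpg', BUP_CHUNKED)
#   demangle_name(b'foo.bupl', 0o100644)       # => (b'foo', BUP_NORMAL)
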
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = b'%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()

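# Illustrative example: this matches "git hash-object"; for instance
# the empty blob hashes to git's well-known id:
#
#   hexlify(calc_hash(b'blob', b''))
#   # => b'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
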
def shalist_item_sort_key(ent):
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
        return name + b'/'
    else:
        return name

def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode,name,bin)
        assert s[0:1] != b'0'  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return b''.join(l)

def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        mode,name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)

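# Illustrative round trip: tree_decode inverts tree_encode, except that
# the decoded mode is an int parsed from the octal text:
#
#   buf = tree_encode([(0o100644, b'a', b'\x11' * 20)])
#   list(tree_decode(buf))  # => [(0o100644, b'a', b'\x11' * 20)]
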
def _encode_packobj(type, content, compression_level=1):
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = b''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += bytes_from_uint(szbits)
        szbits = sz & 0x7f
        if not sz:
            break
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()

def _decode_packobj(buf):
    assert(buf)
    c = byte_int(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = byte_int(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))

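# Illustrative example: the variable-length header stores the object
# type in bits 4-6 of the first byte and the size in a 4-bit group
# followed by 7-bit groups (LSB first).  A 100-byte blob (type 3,
# size 0b1100100) gets the two header bytes b'\xb4\x06':
#
#   header = b''.join(_encode_packobj(b'blob', b'x' * 100))[:2]
#   # header == b'\xb4\x06'
#   _decode_packobj(header + zlib.compress(b'x' * 100))
#   # => (b'blob', b'x' * 100)
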
class PackIdx:
    def __init__(self):
        assert(0)

    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx != None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True
        return None

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end - start) // 2
            v = self._idx_to_hash(mid)
            if v < hash:
                start = mid + 1
            elif v > hash:
                end = mid
            else: # got it!
                return mid
        return None

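# Illustrative note on the fanout table: fanout[b] holds the number of
# sha1s whose first byte is <= b, so [fanout[b-1], fanout[b]) is the
# slice of the sorted sha table to binary-search.  E.g. with hashes
# starting 0x00, 0x01, 0x01, 0x03, fanout begins [1, 3, 3, 4, ...] and
# a hash starting 0x01 is searched in indexes [1, 3).
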
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=0))
        self.fanout.append(0) # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
            self.close()

    def __len__(self):
        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None

class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
            self.close()

    def __len__(self):
        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
        return ofs

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None

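# Illustrative layout summary for a v2 .idx with n objects, matching
# the offsets computed above:
#
#   8 bytes   header b'\377tOc' + version 2
#   1024      fan-out table (256 x 4-byte big-endian counts)
#   20 * n    sorted sha1s
#   4 * n     crc32s
#   4 * n     32-bit pack offsets (MSB set means "index into 64-bit table")
#   8 * m     64-bit offsets for entries past 2 GiB
#   (then the pack's sha1 and the idx's own sha1 trailer)
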
_mpi_count = 0
class PackIdxList:
    def __init__(self, dir, ignore_midx=False):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                _total_searches -= 1 # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and
        .midx files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in glob.glob(os.path.join(self.dir, b'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

657 """Insert an additional object in the list."""
def open_idx(filename):
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')

def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)

def create_commit_blob(tree, parent,
                       author, adate_sec, adate_tz,
                       committer, cdate_sec, cdate_tz,
                       msg):
    if adate_tz is not None:
        adate_str = _git_date_str(adate_sec, adate_tz)
    else:
        adate_str = _local_git_date_str(adate_sec)
    if cdate_tz is not None:
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
    else:
        cdate_str = _local_git_date_str(cdate_sec)
    l = []
    if tree: l.append(b'tree %s' % hexlify(tree))
    if parent: l.append(b'parent %s' % hexlify(parent))
    if author: l.append(b'author %s %s' % (author, adate_str))
    if committer: l.append(b'committer %s %s' % (committer, cdate_str))
    l.append(b'')
    l.append(msg)
    return b'\n'.join(l)

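# Illustrative shape of the result (hedged; hex shortened for display):
# the text of a git commit object, i.e. header lines, a blank line,
# then the message, so it can round-trip through parse_commit():
#
#   tree <40 hex digits>
#   parent <40 hex digits>
#   author A U Thor <a@example.com> 1234567890 +0000
#   committer A U Thor <a@example.com> 1234567890 +0000
#
#   commit message here
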
def _make_objcache():
    return PackIdxList(repo(b'objects/pack'))

# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.
class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.repo_dir = repo_dir or repo()
        self.file = None
        self.parentfd = None
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.idx = None
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level
        self.run_midx = run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir,
                                           opttype='int')
        if not max_pack_size:
            # larger packs slow down pruning
            max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
            else max(1, self.max_pack_size // 5000)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
            self.close()

    def _open(self):
        if not self.file:
            objdir = dir = os.path.join(self.repo_dir, b'objects')
            fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                os.close(fd)
                raise
            try:
                self.parentfd = os.open(objdir, os.O_RDONLY)
            except:
                f = self.file
                self.file = None
                f.close()
                raise
            assert name.endswith(b'.pack')
            self.filename = name[:-5]
            self.file.write(b'PACK\0\0\0\2\0\0\0\0')
            self.idx = PackIdxV2Writer()

    def _raw_write(self, datalist, sha):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            reraise(GitError(e))
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        assert(sha)
        if self.idx:
            self.idx.add(sha, crc, self.file.tell() - size)

    def _write(self, sha, type, content):
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def just_write(self, sha, type, content):
        """Write an object to the pack file without checking for duplication."""
        self._write(sha, type, content)
        # If nothing else, gc doesn't have/want an objcache
        if self.objcache is not None:
            self.objcache.add(sha)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write(b'blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write(b'tree', content)

    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
                   msg):
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        content = create_commit_blob(tree, parent,
                                     author, adate_sec, adate_tz,
                                     committer, cdate_sec, cdate_tz,
                                     msg)
        return self.maybe_write(b'commit', content)

876 """Remove the pack file from disk."""
885 os.unlink(self.filename + b'.pack')
    def _end(self, run_midx=True):
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' + hexlify(packbin))
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

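# Illustrative usage sketch (hedged): a PackWriter is normally used as
# a context manager; close()/_end() patches the header's object count,
# appends the pack sha1, writes the .idx, and renames both into
# objects/pack/pack-<packsha>.{pack,idx}.
#
#   with PackWriter() as w:
#       oid = w.new_blob(b'some file content')
#       tree = w.new_tree([(0o100644, b'file', oid)])
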
class PackIdxV2Writer:
    def __init__(self):
        self.idx = list(list() for i in range(256))
        self.count = 0

    def add(self, sha, crc, offs):
        assert(sha)
        self.count += 1
        self.idx[byte_int(sha[0])].append((sha, crc, offs))

    def write(self, filename, packbin):
        ofs64_count = 0
        for section in self.idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, self.idx,
                                           self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            for b in chunkyreader(idx_f, 20 * self.count):
                idx_sum.update(b)

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
        finally:
            idx_f.close()

def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.

    """
    argv = [b'git', b'show-ref']
    if limit_to_heads:
        argv.append(b'--heads')
    if limit_to_tags:
        argv.append(b'--tags')
    argv.append(b'--')
    if patterns:
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
                         close_fds=True)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split(b'\n'):
            sha, name = d.split(b' ', 1)
            yield name, unhexlify(sha)

def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    return None

def rev_list_invocation(ref_or_refs, format=None):
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    argv = [b'git', b'rev-list']

    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    argv.append(b'--')
    return argv

def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.

    """
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                                             format=format),
                         env=_gitenv(repo_dir),
                         stdout=subprocess.PIPE,
                         close_fds=True)
    if not format:
        for line in p.stdout:
            yield line.strip()
    else:
        line = p.stdout.readline()
        while line:
            s = line.strip()
            if not s.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(s))
            s = s[7:]
            assert len(s) == 40
            yield s, parse(p.stdout)
            line = p.stdout.readline()

    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)

def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            hash = unhexlify(committish)
        except TypeError:
            return None

        if pL.exists(hash):
            return hash

    return None

def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    if not oldval:
        oldval = b''
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir),
                         close_fds=True)
    _git_wait(b'git update-ref', p)

def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait(b'git update-ref', p)

def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = environ.get(b'BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser(b'~/.bup')

def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)

def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % path_msg(top))
            sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)

def is_suitable_git(ver_str):
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    sys.exit(13)

_git_great = None

def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.

    """
    global _git_great
    if _git_great is not None:
        return
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    if status == 'suitable':
        _git_great = True
        return
    assert False

class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.it)
        except StopIteration as e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    next = __next__

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()

1278 """Link to 'git cat-file' that is used to retrieve blob data."""
1279 def __init__(self, repo_dir = None):
1280 require_suitable_git()
1281 self.repo_dir = repo_dir
1282 self.p = self.inprogress = None
1284 def close(self, wait=False):
1290 self.inprogress = None
    def restart(self):
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds=True,
                                  bufsize=4096,
                                  env=_gitenv(self.repo_dir))

1306 """Yield (oidx, type, size), followed by the data referred to by ref.
1307 If ref does not exist, only yield (None, None, None).
1310 if not self.p or self.p.poll() != None:
1313 poll_result = self.p.poll()
1314 assert(poll_result == None)
1316 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1317 assert(not self.inprogress)
1318 assert ref.find(b'\n') < 0
1319 assert ref.find(b'\r') < 0
1320 assert not ref.startswith(b'-')
1321 self.inprogress = ref
1322 self.p.stdin.write(ref + b'\n')
1323 self.p.stdin.flush()
1324 hdr = self.p.stdout.readline()
1326 raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
1327 % (ref, self.p.poll() or 'none'))
1328 if hdr.endswith(b' missing\n'):
1329 self.inprogress = None
1330 yield None, None, None
1332 info = hdr.split(b' ')
1333 if len(info) != 3 or len(info[0]) != 40:
1334 raise GitError('expected object (id, type, size), got %r' % info)
1335 oidx, typ, size = info
1337 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1340 yield oidx, typ, size
1343 readline_result = self.p.stdout.readline()
1344 assert readline_result == b'\n'
1345 self.inprogress = None
1346 except Exception as e:
    def _join(self, it):
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

1370 """Generate a list of the content of all blobs that can be reached
1371 from an object. The hash given in 'id' must point to a blob, a tree
1372 or a commit. The content of all blobs that can be seen from trees or
1373 commits will be added to the list.
1375 for d in self._join(self.get(id)):
_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp

def close_catpipes():
    # FIXME: chain exceptions
    while _cp:
        _, cp = _cp.popitem()
        cp.close(wait=True)

def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[10:]
        if not c in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags

class MissingObject(KeyError):
    def __init__(self, oid):
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))

WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...

def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true. Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))

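# Illustrative usage sketch (hedged; b'refs/heads/main' is just an
# example ref name): walk everything reachable from a ref, printing
# each item's type and mangled path:
#
#   catpipe = cp()
#   head = read_ref(b'refs/heads/main')
#   for item in walk_object(catpipe.get, hexlify(head), include_data=False):
#       print(item.type, b'/'.join(item.path))
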