1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
14 from bup.compat import (buffer,
15 byte_int, bytes_from_byte, bytes_from_uint,
22 from bup.io import path_msg
23 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
30 mmap_read, mmap_readwrite,
32 progress, qprogress, stat_if_exists,
repodir = None # The default repository, once initialized

# Mapping between git object type names and the numeric type codes used
# in the pack file format (and its inverse).
_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}
class GitError(Exception):
    """Base exception for git-related failures raised by this module."""
def _gitenv(repo_dir=None):
    # Environment for git subcommands: current environ plus GIT_DIR
    # pointing at the (absolute) repository path.
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})

def _git_wait(cmd, p):
    # Wait for subprocess p and raise GitError on a nonzero exit.
    # NOTE(excerpt): the wait/guard lines are elided here; rv presumably
    # comes from p.wait() — confirm against the full source.
    raise GitError('%r returned %d' % (cmd, rv))

def _git_exo(cmd, **kwargs):
    # Run cmd via exo() without its own check, then raise GitError
    # ourselves on a nonzero return code.
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
    """Return the value of a git config option (bytes), normalized by git
    when opttype is 'int' or 'bool'.  repo_dir and cfg_file are mutually
    exclusive sources for the configuration."""
    assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
    cmd = [b'git', b'config', b'--null']
    # (guard for cfg_file elided in this excerpt)
    cmd.extend([b'--file', cfg_file])
    # (opttype == 'int' branch header elided)
    cmd.extend([b'--int'])
    elif opttype == 'bool':
        cmd.extend([b'--bool'])
    # else: no --type flag; opttype must then be None
    assert opttype is None
    cmd.extend([b'--get', option])
    env = _gitenv(repo_dir=repo_dir)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
    # with --null, git writes out a trailing \0 after the value
    r = p.stdout.read()[:-1]
    elif opttype == 'bool':
        # git converts to 'true' or 'false'
    raise GitError('%r returned %d' % (cmd, rc))
def parse_tz_offset(s):
    """UTC offset in seconds."""
    # s looks like b'+0130'/b'-0500': hours * 3600 + minutes * 60,
    # negated when the sign byte is '-' (negation line elided here).
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':

def parse_commit_gpgsig(sig):
    """Return the original signature bytes.

    i.e. with the "gpgsig " header and the leading space character on
    each continuation line removed.
    """
    assert sig.startswith(b'gpgsig ')
    return sig.replace(b'\n ', b'\n')
122 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
123 # Make sure that's authoritative.
126 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
127 # The continuation lines have only one leading space.
129 _start_end_char = br'[^ .,:;<>"\'\0\n]'
130 _content_char = br'[^\0\n<>]'
131 _safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
133 _start_end_char, _content_char, _start_end_char)
134 _tz_rx = br'[-+]\d\d[0-5]\d'
135 _parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
136 # Assumes every following line starting with a space is part of the
137 # mergetag. Is there a formal commit blob spec?
138 _mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
139 _commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
140 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
141 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
142 (?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
143 (?P<message>(?:.|\n)*)''' % (_parent_rx,
144 _safe_str_rx, _safe_str_rx, _tz_rx,
145 _safe_str_rx, _safe_str_rx, _tz_rx,
147 _parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
149 # Note that the author_sec and committer_sec values are (UTC) epoch
150 # seconds, and for now the mergetag is not included.
151 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
152 'author_name', 'author_mail',
153 'author_sec', 'author_offset',
154 'committer_name', 'committer_mail',
155 'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse a git commit blob into a CommitInfo namedtuple."""
    commit_match = re.match(_commit_rx, content)
    # NOTE(excerpt): the 'if not commit_match:' guard line is elided;
    # this raise only fires when the regex fails to match.
    raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      gpgsig=parse_commit_gpgsig(matches['gpgsig']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume cat_iterator's (oidx, type, size) header, check that the
    announced type matches expected_type, and return the remaining
    chunks concatenated as a single bytes object."""
    header = next(cat_iterator)
    kind = header[1]
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    data = bytearray()
    for chunk in cat_iterator:
        data.extend(chunk)
    return bytes(data)
def get_commit_items(id, cp):
    # Fetch object 'id' through the CatPipe-like 'cp' and parse it as a
    # commit; returns a CommitInfo namedtuple.
    return parse_commit(get_cat_data(cp.get(id), b'commit'))

def _local_git_date_str(epoch_sec):
    # Git-style date string: "<epoch> <±HHMM>" using the local UTC offset.
    return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))

def _git_date_str(epoch_sec, tz_offset_sec):
    # Git-style date string with an explicit UTC offset given in seconds.
    offs = tz_offset_sec // 60
    # (the '%' argument-tuple line is elided in this excerpt)
    return b'%d %s%02d%02d' \
        b'+' if offs >= 0 else b'-',
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    # NOTE(excerpt): the 'if not repo_dir:' guard is elided; this raise
    # fires only when no repository has been initialized.
    raise GitError('You should call check_repo_or_die()')
    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
    return os.path.join(repo_dir, sub)
215 re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
218 return _shorten_hash_rx.sub(br'\1\2*\3', s)
222 full = os.path.abspath(path)
223 fullrepo = os.path.abspath(repo(b''))
224 if not fullrepo.endswith(b'/'):
226 if full.startswith(fullrepo):
227 path = full[len(fullrepo):]
228 if path.startswith(b'index-cache/'):
229 path = path[len(b'index-cache/'):]
230 return shorten_hash(path)
def auto_midx(objdir):
    # Best-effort maintenance of midx and bloom files for objdir via the
    # bup executable; failures are recorded with add_error(), not raised.
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # A regular file stored as a git tree: it was chunked.
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'

# Constants returned by demangle_name() to describe how a file was stored.
(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        # (the 'return (name[:-5],' line is elided in this excerpt)
        BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git object id: sha1 over b'<type> <len>\0' followed by the content.
    header = b'%s %d\0' % (type, len(content))

def shalist_item_sort_key(ent):
    # Sort key reproducing git's tree entry ordering, where directories
    # compare as if their name had a trailing slash (tail elided here).
    (mode, name, id) = ent
    assert(mode+0 == mode)  # mode must be an integer
    if stat.S_ISDIR(mode):

def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    for (mode,name,bin) in shalist:
        assert(mode+0 == mode)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode,name,bin)
        # 0-padded octal is not acceptable in a git tree.
        # NOTE(review): on Python 3, s[0] is an int while b'0' is bytes,
        # so this comparison is always True and the assert never fires;
        # it should compare against byte_int(b'0'[0]) — confirm and fix.
        assert s[0] != b'0'
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    # Entries are b'<octal mode> <name>\0<20-byte sha>' back to back.
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        sha = buf[z+1:z+1+20]
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    # Yield the pack encoding of one object: a variable-length size header
    # whose first byte carries the low 4 size bits plus the type code,
    # followed by the zlib-compressed content.
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    if sz: szbits |= 0x80  # continuation bit: more size bytes follow
    szout += bytes_from_uint(szbits)
    z = zlib.compressobj(compression_level)
    yield z.compress(content)

def _decode_packobj(buf):
    # Inverse of _encode_packobj: return (type, decompressed content).
    type = _typermap[(c & 0x70) >> 4]
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        return self._ofs_from_idx(idx)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        # With want_source, the truthy result is this index's basename.
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True

    def _idx_from_hash(self, hash):
        # Binary search for hash over the sorted sha table, first narrowed
        # by the 256-entry fanout table keyed on the hash's leading byte.
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
        mid = start + (end - start) // 2
        v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    # V1 layout: 256-entry fanout, then per-object 4-byte offset + 20-byte
    # sha records (24 bytes each).
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack('!256I', self.map))
        self.fanout.append(0) # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):

        # presumably part of __len__ — its def line is elided here
        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

        # iteration over all shas; skip the 4-byte offset of each record
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

        if self.map is not None:
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    # V2 layout: magic+version, fanout, sha table, crc table, 4-byte offset
    # table, then an 8-byte offset table for packs larger than 2 GiB.
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):

        # presumably part of __len__ — its def line is elided here
        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        # high bit set means the low 31 bits index the 64-bit offset table
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

        if self.map is not None:
    def __init__(self, dir, ignore_midx=False):
        # Q: was this also intended to prevent opening multiple repos?
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        self.do_bloom = False
        self.ignore_midx = ignore_midx

        # (the following fragments belong to elided close/context methods)
        assert _mpi_count == 0
        assert _mpi_count == 0
        self.bloom, bloom = None, self.bloom
        self.packs, packs = None, self.packs
        with ExitStack() as stack:
            stack.enter_context(pack)

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):

        # presumably __iter__ — its def line is elided here
        return iter(idxmerge(self.packs))

        # presumably __len__ — its def line is elided here
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        if hash in self.also:
        # Consult the bloom filter first; a negative answer is definitive.
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
            self.do_bloom = False
            _total_searches -= 1 # was counted by bloom
        for i in range(len(self.packs)):
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            # reorder so most recently used packs are searched first
            self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
            # remove any *.midx files from our list that no longer exist
            for ix in list(d.values()):
                if not isinstance(ix, midx.PackMidx):
                if ix.name in midxes:
                    self.packs.remove(ix)
            for ix in self.packs:
                if isinstance(ix, midx.PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
                mx = midx.PackMidx(full)
                (mxd, mxf) = os.path.split(mx.name)
                # a midx whose constituent .idx files are missing is broken
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n'
                            % (path_msg(n), path_msg(mxf)))
            # prefer larger, and then newer, midxes
            midxl.sort(key=lambda ix:
                       (-len(ix), -xstat.stat(ix.name).st_mtime))
                for sub in ix.idxnames:
                    found = d.get(os.path.join(self.dir, sub))
                    if not found or isinstance(found, PackIdx):
                        # doesn't exist, or exists but not in a midx
                for name in ix.idxnames:
                    d[os.path.join(self.dir, name)] = ix
                elif not ix.force_keep:
                    debug1('midx: removing redundant: %s\n'
                           % path_msg(os.path.basename(ix.name)))
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
            except GitError as e:
            bfull = os.path.join(self.dir, b'bup.bloom')
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            # only use the bloom filter if it covers everything we have
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.bloom, bloom_tmp = None, self.bloom
        except BaseException as ex:
            with pending_raise(ex):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
701 """Insert an additional object in the list."""
def open_idx(filename):
    # Open a pack index by extension: .idx (v1 or v2) or .midx.
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        # v2+ files start with the b'\377tOc' magic; v1 files have no
        # header and begin directly with the fanout table.
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            return PackIdxV2(filename, f)
            raise GitError('%s: expected idx file version 2, got %d'
                           % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        # per-step progress callback for merge_iter
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # final progress line (guarded by final_progress in elided code)
        progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                 % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def create_commit_blob(tree, parent,
                       author, adate_sec, adate_tz,
                       committer, cdate_sec, cdate_tz,
    # Build the raw bytes of a git commit object. A None timezone means
    # "use the local UTC offset" for that timestamp.
    if adate_tz is not None:
        adate_str = _git_date_str(adate_sec, adate_tz)
        adate_str = _local_git_date_str(adate_sec)
    if cdate_tz is not None:
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
        cdate_str = _local_git_date_str(cdate_sec)
    # header lines are optional; only emit the ones we were given
    if tree: l.append(b'tree %s' % hexlify(tree))
    if parent: l.append(b'parent %s' % hexlify(parent))
    if author: l.append(b'author %s %s' % (author, adate_str))
    if committer: l.append(b'committer %s %s' % (committer, cdate_str))

def _make_objcache():
    # Default objcache factory used by PackWriter.
    return PackIdxList(repo(b'objects/pack'))
763 # bup-gc assumes that it can disable all PackWriter activities
764 # (bloom/midx/cache) via the constructor and close() arguments.
767 """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        # objcache_maker: factory for the duplicate-detection index;
        # on_pack_finish: callback invoked with the finished pack's prefix.
        self.repo_dir = repo_dir or repo()
        self.objcache_maker = objcache_maker
        self.compression_level = compression_level
        self.run_midx=run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir,
        if not max_pack_size:
            # larger packs slow down pruning
            max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
            else max(1, self.max_pack_size // 5000)
799 def __exit__(self, type, value, traceback):
800 with pending_raise(value, rethrow=False):
805 objdir = dir = os.path.join(self.repo_dir, b'objects')
806 fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
808 self.file = os.fdopen(fd, 'w+b')
813 self.parentfd = os.open(objdir, os.O_RDONLY)
819 assert name.endswith(b'.pack')
820 self.filename = name[:-5]
821 self.file.write(b'PACK\0\0\0\2\0\0\0\0')
822 self.idx = PackIdxV2Writer()
    def _raw_write(self, datalist, sha):
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)

    def _update_idx(self, sha, crc, size):
        # record the entry at the offset where the object started
        self.idx.add(sha, crc, self.file.tell() - size)

    def _write(self, sha, type, content):
        sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
        # roll over to a new pack once either limit is hit
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:

    def _require_objcache(self):
        # Lazily build the objcache; without one we can't check existence.
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            "PackWriter not opened or can't check exists w/o objcache")
869 def exists(self, id, want_source=False):
870 """Return non-empty if an object is found in the object cache."""
871 self._require_objcache()
872 return self.objcache.exists(id, want_source=want_source)
874 def just_write(self, sha, type, content):
875 """Write an object to the pack file without checking for duplication."""
876 self._write(sha, type, content)
877 # If nothing else, gc doesn't have/want an objcache
878 if self.objcache is not None:
879 self.objcache.add(sha)
    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
889 def new_blob(self, blob):
890 """Create a blob object in the pack with the supplied content."""
891 return self.maybe_write(b'blob', blob)
893 def new_tree(self, shalist):
894 """Create a tree object in the pack."""
895 content = tree_encode(shalist)
896 return self.maybe_write(b'tree', content)
    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        content = create_commit_blob(tree, parent,
                                     author, adate_sec, adate_tz,
                                     committer, cdate_sec, cdate_tz,
        return self.maybe_write(b'commit', content)
    def _end(self, run_midx=True, abort=False):
        # Finish (or abort) the current pack: fix up the header's object
        # count, append the pack sha1, write the .idx, and rename both
        # into place under their content-derived pack-<sha> name.
        # Ignores run_midx during abort
        self.file, f = None, self.file
        self.idx, idx = None, self.idx
        self.parentfd, pfd, = None, self.parentfd
        with nullcontext_if_not(self.objcache), \
             finalized(pfd, lambda x: x is not None and os.close(x)), \
                os.unlink(self.filename + b'.pack')
            # update object count
            cp = struct.pack('!i', self.count)
            # calculate the pack sha1sum
            for b in chunkyreader(f):
            packbin = sum.digest()
            fdatasync(f.fileno())
            idx.write(self.filename + b'.idx', packbin)
            nameprefix = os.path.join(self.repo_dir,
                                      b'objects/pack/pack-' + hexlify(packbin))
            if os.path.exists(self.filename + b'.map'):
                os.unlink(self.filename + b'.map')
            os.rename(self.filename + b'.pack', nameprefix + b'.pack')
            os.rename(self.filename + b'.idx', nameprefix + b'.idx')
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
            if self.on_pack_finish:
                self.on_pack_finish(nameprefix)

        # Must be last -- some of the code above depends on it
        # (def abort(self): is elided in this excerpt)
        """Remove the pack file from disk."""
        self._end(abort=True)

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)
class PackIdxV2Writer:
    # Accumulates (sha, crc, offset) entries bucketed by the sha's first
    # byte, then writes a version-2 pack index file.
        self.idx = list(list() for i in range(256))

    def add(self, sha, crc, offs):
        self.idx[byte_int(sha[0])].append((sha, crc, offs))

    def write(self, filename, packbin):
        # count entries whose offset needs the 64-bit overflow table
        for section in self.idx:
            for entry in section:
                if entry[2] >= 2**31:
        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        idx_f.truncate(index_len)
        fdatasync(idx_f.fileno())
        idx_map = mmap_readwrite(idx_f, close=False)
        count = _helpers.write_idx(filename, idx_map, self.idx,
        assert(count == self.count)
        idx_f = open(filename, 'a+b')
        idx_f.write(packbin)
        # re-read the file to compute the index's own trailing sha1
        b = idx_f.read(8 + 4*256)
        for b in chunkyreader(idx_f, 20 * self.count):
        for b in chunkyreader(idx_f):
        idx_f.write(idx_sum.digest())
        fdatasync(idx_f.fileno())
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.
    """
    argv = [b'git', b'show-ref']
        argv.append(b'--heads')
        argv.append(b'--tags')
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
        for d in out.split(b'\n'):
            sha, name = d.split(b' ', 1)
            yield name, unhexlify(sha)

def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # take at most two matches so we can detect ambiguity (tail elided)
    l = tuple(islice(refs, 2))
def rev_list_invocation(ref_or_refs, format=None):
    # Build the git rev-list argv for one ref (bytes) or a sequence of refs.
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    argv = [b'git', b'rev-list']
        argv.append(b'--pretty=format:' + format)
        # refuse anything that could be parsed as an option
        assert not ref.startswith(b'-')

def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.
    """
    # parse and format must be provided together
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                         env=_gitenv(repo_dir),
                         stdout = subprocess.PIPE,
    for line in p.stdout:
        line = p.stdout.readline()
        if not s.startswith(b'commit '):
            raise Exception('unexpected line ' + repr(s))
        yield s, parse(p.stdout)
        line = p.stdout.readline()
    rv = p.wait() # not fatal
        raise GitError('git rev-list returned error %d' % rv)
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    debug2("resolved from ref: commit = %s\n" % hexlify(head))
    # maybe it's a raw 40-char hex object id; check the pack indexes
    if len(committish) == 40:
        hash = unhexlify(committish)
    with PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) as pL:

def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    # only branch heads and tags may be updated through this helper
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir),
    _git_wait(b'git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    # when provided, oldvalue makes the deletion conditional
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
    _git_wait('git update-ref', p)

def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # fall back to $BUP_DIR, then ~/.bup
    repodir = environ.get(b'BUP_DIR')
    repodir = os.path.expanduser(b'~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    d = repo() # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    # NOTE(review): '2' is a str among bytes args; subprocess encodes each
    # element individually so this works on POSIX, but b'2' would match
    # the rest of the file — confirm and normalize.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)

def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    # a directory at <top>/objects/pack is taken as evidence of a repo
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
    top_st = stat_if_exists(top)
        log('error: repository %r does not exist (see "bup help init")\n'
    log('error: %s is not a repository\n' % path_msg(top))
def is_suitable_git(ver_str):
    # Classify a `git --version` string; returns 'unrecognized' or
    # 'insufficient' here ('suitable' is returned in elided lines).
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
    if re.match(br'[0-9]+(\.|$)?', ver_str):

def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    """
    # cached verdict from a previous call
    if _git_great is not None:
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
    ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
    if status == 'suitable':
1283 """Link to 'git cat-file' that is used to retrieve blob data."""
1284 def __init__(self, repo_dir = None):
1285 require_suitable_git()
1286 self.repo_dir = repo_dir
1287 self.p = self.inprogress = None
1289 def close(self, wait=False):
1290 self.p, p = None, self.p
1291 self.inprogress = None
1296 # This will handle pending exceptions correctly once
1306 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1307 stdin=subprocess.PIPE,
1308 stdout=subprocess.PIPE,
1311 env=_gitenv(self.repo_dir))
def get(self, ref):
    """Yield (oidx, type, size), followed by the data referred to by ref.
    If ref does not exist, only yield (None, None, None).

    """
    try:
        # (Re)start the helper if it was never started or has died.
        if not self.p or self.p.poll() != None:
            self.restart()
        poll_result = self.p.poll()
        assert poll_result == None
        # Only one object may be streaming at a time.
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert not self.inprogress
        # Reject refs that would corrupt the line-oriented cat-file
        # protocol or be mistaken for an option.
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if not hdr:
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        it = chunkyreader(self.p.stdout, size)
        yield oidx, typ, size
        # FIX: iterate the reader created above; the previous code
        # constructed a second chunkyreader here and left `it` unused.
        for blob in it:
            yield blob
        # cat-file terminates each object's data with a newline.
        readline_result = self.p.stdout.readline()
        assert readline_result == b'\n'
        self.inprogress = None
    except Exception as ex:
        with pending_raise(ex):
            self.close()
1357 def _join(self, it):
1358 _, typ, _ = next(it)
1362 elif typ == b'tree':
1363 treefile = b''.join(it)
1364 for (mode, name, sha) in tree_decode(treefile):
1365 for blob in self.join(hexlify(sha)):
1367 elif typ == b'commit':
1368 treeline = b''.join(it).split(b'\n')[0]
1369 assert treeline.startswith(b'tree ')
1370 for blob in self.join(treeline[5:]):
1373 raise GitError('invalid object type %r: expected blob/tree/commit'
def join(self, id):
    """Generate a list of the content of all blobs that can be reached
    from an object. The hash given in 'id' must point to a blob, a tree
    or a commit. The content of all blobs that can be seen from trees or
    commits will be added to the list.

    """
    for chunk in self._join(self.get(id)):
        yield chunk
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp
    if not repo_dir:
        repo_dir = repodir or repo()
    # Normalize so equivalent paths share a single cached pipe.
    repo_dir = os.path.abspath(repo_dir)
    result = _cp.get(repo_dir)
    if not result:
        result = CatPipe(repo_dir)
        _cp[repo_dir] = result
    return result
def close_catpipes():
    """Close (and wait for) every cached CatPipe."""
    # FIXME: chain exceptions
    while _cp:
        _, pipe = _cp.popitem()
        pipe.close(wait=True)
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    result = {}
    for refname, oid in list_refs(repo_dir=repo_dir, limit_to_tags=True):
        assert refname.startswith(b'refs/tags/')
        name = refname[10:]  # strip the b'refs/tags/' prefix
        # More than one tag can point at the same commit.
        result.setdefault(oid, []).append(name)
    return result
class MissingObject(KeyError):
    """Raised when a referenced object id is absent from the repository."""
    def __init__(self, oid):
        # Keep the binary oid for callers; the message shows it hex-encoded.
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
WalkItem = namedtuple('WalkItem',
                      ['oid', 'type', 'mode', 'path', 'chunk_path', 'data'])
# `path` is the mangled path.  When an item is a fragment of a chunked
# file, `chunk_path` is the chunked-subtree path of that fragment,
# e.g. ['', '2d3115e', ...]; the top-level item of a chunked file has
# chunk_path [''].  A chunk subtree of '/foo/bar/baz' might look like:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
1440 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1441 """Yield everything reachable from oidx via get_ref (which must behave
1442 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1443 returns true. Throw MissingObject if a hash encountered is
1444 missing from the repository, and don't read or return blob content
1445 in the data field unless include_data is set.
1448 # Maintain the pending stack on the heap to avoid stack overflow
1449 pending = [(oidx, [], [], None)]
1451 oidx, parent_path, chunk_path, mode = pending.pop()
1452 oid = unhexlify(oidx)
1453 if stop_at and stop_at(oidx):
1456 if (not include_data) and mode and stat.S_ISREG(mode):
1457 # If the object is a "regular file", then it's a leaf in
1458 # the graph, so we can skip reading the data if the caller
1459 # hasn't requested it.
1460 yield WalkItem(oid=oid, type=b'blob',
1461 chunk_path=chunk_path, path=parent_path,
1466 item_it = get_ref(oidx)
1467 get_oidx, typ, _ = next(item_it)
1469 raise MissingObject(unhexlify(oidx))
1470 if typ not in (b'blob', b'commit', b'tree'):
1471 raise Exception('unexpected repository object type %r' % typ)
1473 # FIXME: set the mode based on the type when the mode is None
1474 if typ == b'blob' and not include_data:
1475 # Dump data until we can ask cat_pipe not to fetch it
1476 for ignored in item_it:
1480 data = b''.join(item_it)
1482 yield WalkItem(oid=oid, type=typ,
1483 chunk_path=chunk_path, path=parent_path,
1485 data=(data if include_data else None))
1487 if typ == b'commit':
1488 commit_items = parse_commit(data)
1489 for pid in commit_items.parents:
1490 pending.append((pid, parent_path, chunk_path, mode))
1491 pending.append((commit_items.tree, parent_path, chunk_path,
1492 hashsplit.GIT_MODE_TREE))
1493 elif typ == b'tree':
1494 for mode, name, ent_id in tree_decode(data):
1495 demangled, bup_type = demangle_name(name, mode)
1497 sub_path = parent_path
1498 sub_chunk_path = chunk_path + [name]
1500 sub_path = parent_path + [name]
1501 if bup_type == BUP_CHUNKED:
1502 sub_chunk_path = [b'']
1504 sub_chunk_path = chunk_path
1505 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,