1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
14 from bup.compat import (buffer,
15 byte_int, bytes_from_byte, bytes_from_uint,
22 from bup.io import path_msg
23 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
30 mmap_read, mmap_readwrite,
32 progress, qprogress, stat_if_exists,
38 repodir = None # The default repository, once initialized
40 _typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
41 _typermap = {v: k for k, v in items(_typemap)}
48 class GitError(Exception):
52 def _gitenv(repo_dir=None):
55 return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
57 def _git_wait(cmd, p):
60 raise GitError('%r returned %d' % (cmd, rv))
62 def _git_exo(cmd, **kwargs):
63 kwargs['check'] = False
64 result = exo(cmd, **kwargs)
66 if proc.returncode != 0:
67 raise GitError('%r returned %d' % (cmd, proc.returncode))
70 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
71 assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
72 cmd = [b'git', b'config', b'--null']
74 cmd.extend([b'--file', cfg_file])
76 cmd.extend([b'--int'])
77 elif opttype == 'bool':
78 cmd.extend([b'--bool'])
80 assert opttype is None
81 cmd.extend([b'--get', option])
84 env = _gitenv(repo_dir=repo_dir)
85 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
87 # with --null, git writes out a trailing \0 after the value
88 r = p.stdout.read()[:-1]
93 elif opttype == 'bool':
94 # git converts to 'true' or 'false'
98 raise GitError('%r returned %d' % (cmd, rc))
102 def parse_tz_offset(s):
103 """UTC offset in seconds."""
104 tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
105 if bytes_from_byte(s[0]) == b'-':
109 def parse_commit_gpgsig(sig):
110 """Return the original signature bytes.
112 i.e. with the "gpgsig " header and the leading space character on
113 each continuation line removed.
118 assert sig.startswith(b'gpgsig ')
120 return sig.replace(b'\n ', b'\n')
122 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
123 # Make sure that's authoritative.
126 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
127 # The continuation lines have only one leading space.
129 _start_end_char = br'[^ .,:;<>"\'\0\n]'
130 _content_char = br'[^\0\n<>]'
131 _safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
133 _start_end_char, _content_char, _start_end_char)
134 _tz_rx = br'[-+]\d\d[0-5]\d'
135 _parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
136 # Assumes every following line starting with a space is part of the
137 # mergetag. Is there a formal commit blob spec?
138 _mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
139 _commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
140 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
141 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
142 (?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
143 (?P<message>(?:.|\n)*)''' % (_parent_rx,
144 _safe_str_rx, _safe_str_rx, _tz_rx,
145 _safe_str_rx, _safe_str_rx, _tz_rx,
147 _parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
149 # Note that the author_sec and committer_sec values are (UTC) epoch
150 # seconds, and for now the mergetag is not included.
151 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
152 'author_name', 'author_mail',
153 'author_sec', 'author_offset',
154 'committer_name', 'committer_mail',
155 'committer_sec', 'committer_offset',
159 def parse_commit(content):
# Parse the raw bytes of a git commit object into a CommitInfo
# namedtuple (tree, parents, author/committer name+mail+sec+offset,
# gpgsig, message).  Times are UTC epoch seconds; offsets come from
# parse_tz_offset().
# NOTE(review): this excerpt is missing the original line 161
# (presumably "if not commit_match:"), so the raise below appears
# unguarded here -- confirm against the full file.
160 commit_match = re.match(_commit_rx, content)
162 raise Exception('cannot parse commit %r' % content)
163 matches = commit_match.groupdict()
164 return CommitInfo(tree=matches['tree'],
165 parents=re.findall(_parent_hash_rx, matches['parents']),
166 author_name=matches['author_name'],
167 author_mail=matches['author_mail'],
168 author_sec=int(matches['asec']),
169 author_offset=parse_tz_offset(matches['atz']),
170 committer_name=matches['committer_name'],
171 committer_mail=matches['committer_mail'],
172 committer_sec=int(matches['csec']),
173 committer_offset=parse_tz_offset(matches['ctz']),
174 gpgsig=parse_commit_gpgsig(matches['gpgsig']),
175 message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Return all of the data produced by cat_iterator as a single
    bytes value, after checking that the object's type matches
    expected_type.

    The iterator must first yield an (oidx, type, size) header triple
    and then the data chunks.  Raises Exception when the type in the
    header differs from expected_type.
    """
    _oidx, kind, _size = next(cat_iterator)
    if kind == expected_type:
        return b''.join(cat_iterator)
    raise Exception('expected %r, saw %r' % (expected_type, kind))
def get_commit_items(id, cp):
    """Fetch the commit named by id through the cat-pipe cp and return
    its parsed CommitInfo."""
    raw = get_cat_data(cp.get(id), b'commit')
    return parse_commit(raw)
def _local_git_date_str(epoch_sec):
    """Return b'<epoch_sec> <offset>' in git date format, using the
    local timezone's UTC offset for epoch_sec."""
    offset = utc_offset_str(epoch_sec)
    return b'%d %s' % (epoch_sec, offset)
191 def _git_date_str(epoch_sec, tz_offset_sec):
192 offs = tz_offset_sec // 60
193 return b'%d %s%02d%02d' \
195 b'+' if offs >= 0 else b'-',
200 def repo(sub = b'', repo_dir=None):
201 """Get the path to the git repository or one of its subdirectories."""
202 repo_dir = repo_dir or repodir
204 raise GitError('You should call check_repo_or_die()')
206 # If there's a .git subdirectory, then the actual repo is in there.
207 gd = os.path.join(repo_dir, b'.git')
208 if os.path.exists(gd):
211 return os.path.join(repo_dir, sub)
215 re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
218 return _shorten_hash_rx.sub(br'\1\2*\3', s)
222 full = os.path.abspath(path)
223 fullrepo = os.path.abspath(repo(b''))
224 if not fullrepo.endswith(b'/'):
226 if full.startswith(fullrepo):
227 path = full[len(fullrepo):]
228 if path.startswith(b'index-cache/'):
229 path = path[len(b'index-cache/'):]
230 return shorten_hash(path)
233 def auto_midx(objdir):
234 args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
236 rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
238 # make sure 'args' gets printed to help with debugging
239 add_error('%r: exception: %s' % (args, e))
242 add_error('%r: returned %d' % (args, rv))
244 args = [path.exe(), b'bloom', b'--dir', objdir]
246 rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
248 # make sure 'args' gets printed to help with debugging
249 add_error('%r: exception: %s' % (args, e))
252 add_error('%r: returned %d' % (args, rv))
255 def mangle_name(name, mode, gitmode):
256 """Mangle a file name to present an abstract name for segmented files.
257 Mangled file names will have the ".bup" extension added to them. If a
258 file's name already ends with ".bup", a ".bupl" extension is added to
259 disambiguate normal files from segmented ones.
261 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
262 assert(stat.S_ISDIR(gitmode))
263 return name + b'.bup'
264 elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
265 return name + b'.bupl'
270 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
271 def demangle_name(name, mode):
272 """Remove name mangling from a file name, if necessary.
274 The return value is a tuple (demangled_filename,mode), where mode is one of
277 * BUP_NORMAL : files that should be read as-is from the repository
278 * BUP_CHUNKED : files that were chunked and need to be reassembled
280 For more information on the name mangling algorithm, see mangle_name()
282 if name.endswith(b'.bupl'):
283 return (name[:-5], BUP_NORMAL)
284 elif name.endswith(b'.bup'):
285 return (name[:-4], BUP_CHUNKED)
286 elif name.endswith(b'.bupm'):
# NOTE(review): the excerpt is missing the original line 287
# (presumably the start of the return for the b'.bupm' case, e.g.
# "return (name[:-5]," ) -- the continuation below is orphaned here;
# confirm against the full file.
288 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
289 return (name, BUP_NORMAL)
292 def calc_hash(type, content):
293 """Calculate some content's hash in the Git fashion."""
294 header = b'%s %d\0' % (type, len(content))
300 def shalist_item_sort_key(ent):
301 (mode, name, id) = ent
302 assert(mode+0 == mode)
303 if stat.S_ISDIR(mode):
309 def tree_encode(shalist):
310 """Generate a git tree object from (mode,name,hash) tuples."""
311 shalist = sorted(shalist, key = shalist_item_sort_key)
313 for (mode,name,bin) in shalist:
315 assert(mode+0 == mode)
317 assert(len(bin) == 20)
318 s = b'%o %s\0%s' % (mode,name,bin)
319 assert s[0] != b'0' # 0-padded octal is not acceptable in a git tree
324 def tree_decode(buf):
325 """Generate a list of (mode,name,hash) from the git tree object in buf."""
327 while ofs < len(buf):
328 z = buf.find(b'\0', ofs)
330 spl = buf[ofs:z].split(b' ', 1)
331 assert(len(spl) == 2)
333 sha = buf[z+1:z+1+20]
335 yield (int(mode, 8), name, sha)
338 def _encode_packobj(type, content, compression_level=1):
339 if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
340 raise ValueError('invalid compression level %s' % compression_level)
343 szbits = (sz & 0x0f) | (_typemap[type]<<4)
346 if sz: szbits |= 0x80
347 szout += bytes_from_uint(szbits)
352 z = zlib.compressobj(compression_level)
354 yield z.compress(content)
358 def _decode_packobj(buf):
361 type = _typermap[(c & 0x70) >> 4]
368 sz |= (c & 0x7f) << shift
372 return (type, zlib.decompress(buf[i+1:]))
379 def find_offset(self, hash):
380 """Get the offset of an object inside the index file."""
381 idx = self._idx_from_hash(hash)
383 return self._ofs_from_idx(idx)
386 def exists(self, hash, want_source=False):
387 """Return nonempty if the object exists in this index."""
388 if hash and (self._idx_from_hash(hash) != None):
389 return want_source and os.path.basename(self.name) or True
392 def _idx_from_hash(self, hash):
393 global _total_searches, _total_steps
395 assert(len(hash) == 20)
396 b1 = byte_int(hash[0])
397 start = self.fanout[b1-1] # range -1..254
398 end = self.fanout[b1] # range 0..255
400 _total_steps += 1 # lookup table is a step
403 mid = start + (end - start) // 2
404 v = self._idx_to_hash(mid)
414 class PackIdxV1(PackIdx):
415 """Object representation of a Git pack index (version 1) file."""
416 def __init__(self, filename, f):
419 self.idxnames = [self.name]
420 self.map = mmap_read(f)
421 # Min size for 'L' is 4, which is sufficient for struct's '!I'
422 self.fanout = array('L', struct.unpack('!256I', self.map))
423 self.fanout.append(0) # entry "-1"
424 self.nsha = self.fanout[255]
425 self.sha_ofs = 256 * 4
426 # Avoid slicing shatable for individual hashes (very high overhead)
427 self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
432 def __exit__(self, type, value, traceback):
433 with pending_raise(value, rethrow=False):
437 return int(self.nsha) # int() from long for python 2
439 def _ofs_from_idx(self, idx):
440 if idx >= self.nsha or idx < 0:
441 raise IndexError('invalid pack index index %d' % idx)
442 ofs = self.sha_ofs + idx * 24
443 return struct.unpack_from('!I', self.map, offset=ofs)[0]
445 def _idx_to_hash(self, idx):
446 if idx >= self.nsha or idx < 0:
447 raise IndexError('invalid pack index index %d' % idx)
448 ofs = self.sha_ofs + idx * 24 + 4
449 return self.map[ofs : ofs + 20]
452 start = self.sha_ofs + 4
453 for ofs in range(start, start + 24 * self.nsha, 24):
454 yield self.map[ofs : ofs + 20]
458 if self.map is not None:
467 class PackIdxV2(PackIdx):
468 """Object representation of a Git pack index (version 2) file."""
469 def __init__(self, filename, f):
472 self.idxnames = [self.name]
473 self.map = mmap_read(f)
474 assert self.map[0:8] == b'\377tOc\0\0\0\2'
475 # Min size for 'L' is 4, which is sufficient for struct's '!I'
476 self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
477 self.fanout.append(0)
478 self.nsha = self.fanout[255]
479 self.sha_ofs = 8 + 256*4
480 self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
481 self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
482 # Avoid slicing this for individual hashes (very high overhead)
483 self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
488 def __exit__(self, type, value, traceback):
489 with pending_raise(value, rethrow=False):
493 return int(self.nsha) # int() from long for python 2
495 def _ofs_from_idx(self, idx):
496 if idx >= self.nsha or idx < 0:
497 raise IndexError('invalid pack index index %d' % idx)
498 ofs_ofs = self.ofstable_ofs + idx * 4
499 ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
501 idx64 = ofs & 0x7fffffff
502 ofs64_ofs = self.ofs64table_ofs + idx64 * 8
503 ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
506 def _idx_to_hash(self, idx):
507 if idx >= self.nsha or idx < 0:
508 raise IndexError('invalid pack index index %d' % idx)
509 ofs = self.sha_ofs + idx * 20
510 return self.map[ofs : ofs + 20]
514 for ofs in range(start, start + 20 * self.nsha, 20):
515 yield self.map[ofs : ofs + 20]
519 if self.map is not None:
530 def __init__(self, dir, ignore_midx=False):
532 # Q: was this also intended to prevent opening multiple repos?
533 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
539 self.do_bloom = False
541 self.ignore_midx = ignore_midx
544 except BaseException as ex:
545 with pending_raise(ex):
551 assert _mpi_count == 0
554 assert _mpi_count == 0
556 self.bloom, bloom = None, self.bloom
557 self.packs, packs = None, self.packs
559 with ExitStack() as stack:
561 stack.enter_context(pack)
568 def __exit__(self, type, value, traceback):
569 with pending_raise(value, rethrow=False):
576 return iter(idxmerge(self.packs))
579 return sum(len(pack) for pack in self.packs)
581 def exists(self, hash, want_source=False):
582 """Return nonempty if the object exists in the index files."""
583 global _total_searches
585 if hash in self.also:
587 if self.do_bloom and self.bloom:
588 if self.bloom.exists(hash):
589 self.do_bloom = False
591 _total_searches -= 1 # was counted by bloom
593 for i in range(len(self.packs)):
595 _total_searches -= 1 # will be incremented by sub-pack
596 ix = p.exists(hash, want_source=want_source)
598 # reorder so most recently used packs are searched first
599 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
604 def refresh(self, skip_midx = False):
605 """Refresh the index list.
606 This method verifies if .midx files were superseded (e.g. all of its
607 contents are in another, bigger .midx file) and removes the superseded
610 If skip_midx is True, all work on .midx files will be skipped and .midx
611 files will be removed from the list.
613 The instance variable 'ignore_midx' can force this function to
614 always act as if skip_midx was True.
616 if self.bloom is not None:
618 self.bloom = None # Always reopen the bloom as it may have been relaced
619 self.do_bloom = False
620 skip_midx = skip_midx or self.ignore_midx
621 d = dict((p.name, p) for p in self.packs
622 if not skip_midx or not isinstance(p, midx.PackMidx))
623 if os.path.exists(self.dir):
626 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
627 # remove any *.midx files from our list that no longer exist
628 for ix in list(d.values()):
629 if not isinstance(ix, midx.PackMidx):
631 if ix.name in midxes:
636 self.packs.remove(ix)
637 for ix in self.packs:
638 if isinstance(ix, midx.PackMidx):
639 for name in ix.idxnames:
640 d[os.path.join(self.dir, name)] = ix
643 mx = midx.PackMidx(full)
644 (mxd, mxf) = os.path.split(mx.name)
646 for n in mx.idxnames:
647 if not os.path.exists(os.path.join(mxd, n)):
648 log(('warning: index %s missing\n'
650 % (path_msg(n), path_msg(mxf)))
657 midxl.sort(key=lambda ix:
658 (-len(ix), -xstat.stat(ix.name).st_mtime))
661 for sub in ix.idxnames:
662 found = d.get(os.path.join(self.dir, sub))
663 if not found or isinstance(found, PackIdx):
664 # doesn't exist, or exists but not in a midx
669 for name in ix.idxnames:
670 d[os.path.join(self.dir, name)] = ix
671 elif not ix.force_keep:
672 debug1('midx: removing redundant: %s\n'
673 % path_msg(os.path.basename(ix.name)))
676 for full in glob.glob(os.path.join(self.dir, b'*.idx')):
680 except GitError as e:
684 bfull = os.path.join(self.dir, b'bup.bloom')
685 new_packs = set(d.values())
687 if not p in new_packs:
689 new_packs = list(new_packs)
690 new_packs.sort(reverse=True, key=lambda x: len(x))
691 self.packs = new_packs
692 if self.bloom is None and os.path.exists(bfull):
693 self.bloom = bloom.ShaBloom(bfull)
695 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
699 self.bloom, bloom_tmp = None, self.bloom
701 except BaseException as ex:
702 with pending_raise(ex):
706 debug1('PackIdxList: using %d index%s.\n'
707 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
710 """Insert an additional object in the list."""
714 def open_idx(filename):
715 if filename.endswith(b'.idx'):
716 f = open(filename, 'rb')
718 if header[0:4] == b'\377tOc':
719 version = struct.unpack('!I', header[4:8])[0]
721 return PackIdxV2(filename, f)
723 raise GitError('%s: expected idx file version 2, got %d'
724 % (path_msg(filename), version))
725 elif len(header) == 8 and header[0:4] < b'\377tOc':
726 return PackIdxV1(filename, f)
728 raise GitError('%s: unrecognized idx file header'
729 % path_msg(filename))
730 elif filename.endswith(b'.midx'):
731 return midx.PackMidx(filename)
733 raise GitError('idx filenames must end with .idx or .midx')
736 def idxmerge(idxlist, final_progress=True):
737 """Generate a list of all the objects reachable in a PackIdxList."""
738 def pfunc(count, total):
739 qprogress('Reading indexes: %.2f%% (%d/%d)\r'
740 % (count*100.0/total, count, total))
741 def pfinal(count, total):
743 progress('Reading indexes: %.2f%% (%d/%d), done.\n'
744 % (100, total, total))
745 return merge_iter(idxlist, 10024, pfunc, pfinal)
748 def create_commit_blob(tree, parent,
749 author, adate_sec, adate_tz,
750 committer, cdate_sec, cdate_tz,
752 if adate_tz is not None:
753 adate_str = _git_date_str(adate_sec, adate_tz)
755 adate_str = _local_git_date_str(adate_sec)
756 if cdate_tz is not None:
757 cdate_str = _git_date_str(cdate_sec, cdate_tz)
759 cdate_str = _local_git_date_str(cdate_sec)
761 if tree: l.append(b'tree %s' % hexlify(tree))
762 if parent: l.append(b'parent %s' % hexlify(parent))
763 if author: l.append(b'author %s %s' % (author, adate_str))
764 if committer: l.append(b'committer %s %s' % (committer, cdate_str))
769 def _make_objcache():
770 return PackIdxList(repo(b'objects/pack'))
772 # bup-gc assumes that it can disable all PackWriter activities
773 # (bloom/midx/cache) via the constructor and close() arguments.
775 class PackWriter(object):
776 """Writes Git objects inside a pack file."""
777 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
778 run_midx=True, on_pack_finish=None,
779 max_pack_size=None, max_pack_objects=None, repo_dir=None):
781 self.repo_dir = repo_dir or repo()
788 self.objcache_maker = objcache_maker
790 self.compression_level = compression_level
791 self.run_midx=run_midx
792 self.on_pack_finish = on_pack_finish
793 if not max_pack_size:
794 max_pack_size = git_config_get(b'pack.packSizeLimit',
795 repo_dir=self.repo_dir,
797 if not max_pack_size:
798 # larger packs slow down pruning
799 max_pack_size = 1000 * 1000 * 1000
800 self.max_pack_size = max_pack_size
801 # cache memory usage is about 83 bytes per object
802 self.max_pack_objects = max_pack_objects if max_pack_objects \
803 else max(1, self.max_pack_size // 5000)
808 def __exit__(self, type, value, traceback):
809 with pending_raise(value, rethrow=False):
814 objdir = dir = os.path.join(self.repo_dir, b'objects')
815 fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
817 self.file = os.fdopen(fd, 'w+b')
822 self.parentfd = os.open(objdir, os.O_RDONLY)
828 assert name.endswith(b'.pack')
829 self.filename = name[:-5]
830 self.file.write(b'PACK\0\0\0\2\0\0\0\0')
831 self.idx = PackIdxV2Writer()
833 def _raw_write(self, datalist, sha):
836 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
837 # the file never has a *partial* blob. So let's make sure it's
838 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
839 # to our hashsplit algorithm.) f.write() does its own buffering,
840 # but that's okay because we'll flush it in _end().
841 oneblob = b''.join(datalist)
847 crc = zlib.crc32(oneblob) & 0xffffffff
848 self._update_idx(sha, crc, nw)
853 def _update_idx(self, sha, crc, size):
856 self.idx.add(sha, crc, self.file.tell() - size)
858 def _write(self, sha, type, content):
862 sha = calc_hash(type, content)
863 size, crc = self._raw_write(_encode_packobj(type, content,
864 self.compression_level),
866 if self.outbytes >= self.max_pack_size \
867 or self.count >= self.max_pack_objects:
871 def _require_objcache(self):
872 if self.objcache is None and self.objcache_maker:
873 self.objcache = self.objcache_maker()
874 if self.objcache is None:
876 "PackWriter not opened or can't check exists w/o objcache")
878 def exists(self, id, want_source=False):
879 """Return non-empty if an object is found in the object cache."""
880 self._require_objcache()
881 return self.objcache.exists(id, want_source=want_source)
883 def just_write(self, sha, type, content):
884 """Write an object to the pack file without checking for duplication."""
885 self._write(sha, type, content)
886 # If nothing else, gc doesn't have/want an objcache
887 if self.objcache is not None:
888 self.objcache.add(sha)
890 def maybe_write(self, type, content):
891 """Write an object to the pack file if not present and return its id."""
# Hash first so the dedup check and the eventual write agree on the id.
892 sha = calc_hash(type, content)
893 if not self.exists(sha):
894 self._require_objcache()
895 self.just_write(sha, type, content)
# NOTE(review): the excerpt is missing the original lines 896-897; the
# method presumably ends with "return sha" per its docstring -- confirm
# against the full file.
898 def new_blob(self, blob):
899 """Create a blob object in the pack with the supplied content."""
900 return self.maybe_write(b'blob', blob)
902 def new_tree(self, shalist):
903 """Create a tree object in the pack."""
904 content = tree_encode(shalist)
905 return self.maybe_write(b'tree', content)
907 def new_commit(self, tree, parent,
908 author, adate_sec, adate_tz,
909 committer, cdate_sec, cdate_tz,
911 """Create a commit object in the pack. The date_sec values must be
912 epoch-seconds, and if a tz is None, the local timezone is assumed."""
913 content = create_commit_blob(tree, parent,
914 author, adate_sec, adate_tz,
915 committer, cdate_sec, cdate_tz,
917 return self.maybe_write(b'commit', content)
919 def _end(self, run_midx=True, abort=False):
920 # Ignores run_midx during abort
921 self.parentfd, pfd, = None, self.parentfd
922 self.file, f = None, self.file
923 self.idx, idx = None, self.idx
925 with nullcontext_if_not(self.objcache), \
926 finalized(pfd, lambda x: x is not None and os.close(x)), \
927 nullcontext_if_not(f):
932 os.unlink(self.filename + b'.pack')
935 # update object count
937 cp = struct.pack('!i', self.count)
941 # calculate the pack sha1sum
944 for b in chunkyreader(f):
946 packbin = sum.digest()
949 fdatasync(f.fileno())
952 idx.write(self.filename + b'.idx', packbin)
953 nameprefix = os.path.join(self.repo_dir,
954 b'objects/pack/pack-' + hexlify(packbin))
955 if os.path.exists(self.filename + b'.map'):
956 os.unlink(self.filename + b'.map')
957 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
958 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
961 auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
962 if self.on_pack_finish:
963 self.on_pack_finish(nameprefix)
966 # Must be last -- some of the code above depends on it
970 """Remove the pack file from disk."""
972 self._end(abort=True)
974 def breakpoint(self):
975 """Clear byte and object counts and return the last processed id."""
976 id = self._end(self.run_midx)
977 self.outbytes = self.count = 0
980 def close(self, run_midx=True):
981 """Close the pack file and move it to its definitive path."""
983 return self._end(run_midx=run_midx)
989 class PackIdxV2Writer:
991 self.idx = list(list() for i in range(256))
994 def add(self, sha, crc, offs):
997 self.idx[byte_int(sha[0])].append((sha, crc, offs))
999 def write(self, filename, packbin):
1001 for section in self.idx:
1002 for entry in section:
1003 if entry[2] >= 2**31:
1006 # Length: header + fan-out + shas-and-crcs + overflow-offsets
1007 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
1009 idx_f = open(filename, 'w+b')
1011 idx_f.truncate(index_len)
1012 fdatasync(idx_f.fileno())
1013 idx_map = mmap_readwrite(idx_f, close=False)
1015 count = _helpers.write_idx(filename, idx_map, self.idx,
1017 assert(count == self.count)
1024 idx_f = open(filename, 'a+b')
1026 idx_f.write(packbin)
1029 b = idx_f.read(8 + 4*256)
1032 for b in chunkyreader(idx_f, 20 * self.count):
1035 for b in chunkyreader(idx_f):
1037 idx_f.write(idx_sum.digest())
1038 fdatasync(idx_f.fileno())
1043 def list_refs(patterns=None, repo_dir=None,
1044 limit_to_heads=False, limit_to_tags=False):
1045 """Yield (refname, hash) tuples for all repository refs unless
1046 patterns are specified. In that case, only include tuples for
1047 refs matching those patterns (cf. git-show-ref(1)). The limits
1048 restrict the result items to refs/heads or refs/tags. If both
1049 limits are specified, items from both sources will be included.
1052 argv = [b'git', b'show-ref']
1054 argv.append(b'--heads')
1056 argv.append(b'--tags')
1059 argv.extend(patterns)
1060 p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
1062 out = p.stdout.read().strip()
1063 rv = p.wait() # not fatal
1067 for d in out.split(b'\n'):
1068 sha, name = d.split(b' ', 1)
1069 yield name, unhexlify(sha)
1072 def read_ref(refname, repo_dir = None):
1073 """Get the commit id of the most recent commit made on a given ref."""
1074 refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
1075 l = tuple(islice(refs, 2))
1083 def rev_list_invocation(ref_or_refs, format=None):
1084 if isinstance(ref_or_refs, bytes):
1085 refs = (ref_or_refs,)
1088 argv = [b'git', b'rev-list']
1091 argv.append(b'--pretty=format:' + format)
1093 assert not ref.startswith(b'-')
1099 def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
1100 """Yield information about commits as per "git rev-list". If a format
1101 is not provided, yield one hex hash at a time. If a format is
1102 provided, pass it to rev-list and call parse(git_stdout) for each
1103 commit with the stream positioned just after the rev-list "commit
1104 HASH" header line. When a format is provided yield (oidx,
1105 parse(git_stdout)) for each commit.
1108 assert bool(parse) == bool(format)
1109 p = subprocess.Popen(rev_list_invocation(ref_or_refs,
1111 env=_gitenv(repo_dir),
1112 stdout = subprocess.PIPE,
1115 for line in p.stdout:
1118 line = p.stdout.readline()
1121 if not s.startswith(b'commit '):
1122 raise Exception('unexpected line ' + repr(s))
1125 yield s, parse(p.stdout)
1126 line = p.stdout.readline()
1128 rv = p.wait() # not fatal
1130 raise GitError('git rev-list returned error %d' % rv)
1133 def rev_parse(committish, repo_dir=None):
1134 """Resolve the full hash for 'committish', if it exists.
1136 Should be roughly equivalent to 'git rev-parse'.
1138 Returns the hex value of the hash if it is found, None if 'committish' does
1139 not correspond to anything.
1141 head = read_ref(committish, repo_dir=repo_dir)
1143 debug2("resolved from ref: commit = %s\n" % hexlify(head))
1146 if len(committish) == 40:
1148 hash = unhexlify(committish)
1152 with PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) as pL:
1159 def update_ref(refname, newval, oldval, repo_dir=None, force=False):
1160 """Update a repository reference.
1162 With force=True, don't care about the previous ref (oldval);
1163 with force=False oldval must be either a sha1 or None (for an
1164 entirely new branch)
1167 assert oldval is None
1172 oldarg = [hexlify(oldval)]
1173 assert refname.startswith(b'refs/heads/') \
1174 or refname.startswith(b'refs/tags/')
1175 p = subprocess.Popen([b'git', b'update-ref', refname,
1176 hexlify(newval)] + oldarg,
1177 env=_gitenv(repo_dir),
1179 _git_wait(b'git update-ref', p)
1182 def delete_ref(refname, oldvalue=None):
1183 """Delete a repository reference (see git update-ref(1))."""
1184 assert refname.startswith(b'refs/')
1185 oldvalue = [] if not oldvalue else [oldvalue]
1186 p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
1189 _git_wait('git update-ref', p)
1192 def guess_repo(path=None):
1193 """Set the path value in the global variable "repodir".
1194 This makes bup look for an existing bup repository, but not fail if a
1195 repository doesn't exist. Usually, if you are interacting with a bup
1196 repository, you would not be calling this function but using
1197 check_repo_or_die().
1203 repodir = environ.get(b'BUP_DIR')
1205 repodir = os.path.expanduser(b'~/.bup')
1208 def init_repo(path=None):
1209 """Create the Git bare repository for bup in a given path."""
1211 d = repo() # appends a / to the path
1212 parent = os.path.dirname(os.path.dirname(d))
1213 if parent and not os.path.exists(parent):
1214 raise GitError('parent directory "%s" does not exist\n'
1216 if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
1217 raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
1218 p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
1221 _git_wait('git init', p)
1222 # Force the index version configuration in order to ensure bup works
1223 # regardless of the version of the installed Git binary.
1224 p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
1225 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1226 _git_wait('git config', p)
1228 p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
1229 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1230 _git_wait('git config', p)
1233 def check_repo_or_die(path=None):
1234 """Check to see if a bup repository probably exists, and abort if not."""
1237 pst = stat_if_exists(top + b'/objects/pack')
1238 if pst and stat.S_ISDIR(pst.st_mode):
1241 top_st = stat_if_exists(top)
1243 log('error: repository %r does not exist (see "bup help init")\n'
1246 log('error: %s is not a repository\n' % path_msg(top))
1250 def is_suitable_git(ver_str):
1251 if not ver_str.startswith(b'git version '):
1252 return 'unrecognized'
1253 ver_str = ver_str[len(b'git version '):]
1254 if ver_str.startswith(b'0.'):
1255 return 'insufficient'
1256 if ver_str.startswith(b'1.'):
1257 if re.match(br'1\.[012345]rc', ver_str):
1258 return 'insufficient'
1259 if re.match(br'1\.[01234]\.', ver_str):
1260 return 'insufficient'
1261 if re.match(br'1\.5\.[012345]($|\.)', ver_str):
1262 return 'insufficient'
1263 if re.match(br'1\.5\.6-rc', ver_str):
1264 return 'insufficient'
1266 if re.match(br'[0-9]+(\.|$)?', ver_str):
1272 def require_suitable_git(ver_str=None):
1273 """Raise GitError if the version of git isn't suitable.
1275 Rely on ver_str when provided, rather than invoking the git in the
1280 if _git_great is not None:
1282 if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
1283 in (b'yes', b'true', b'1'):
1287 ver_str, _, _ = _git_exo([b'git', b'--version'])
1288 status = is_suitable_git(ver_str)
1289 if status == 'unrecognized':
1290 raise GitError('Unexpected git --version output: %r' % ver_str)
1291 if status == 'insufficient':
1292 log('error: git version must be at least 1.5.6\n')
1294 if status == 'suitable':
1301 """Link to 'git cat-file' that is used to retrieve blob data."""
1302 def __init__(self, repo_dir = None):
1303 require_suitable_git()
1304 self.repo_dir = repo_dir
1305 self.p = self.inprogress = None
1307 def close(self, wait=False):
1308 self.p, p = None, self.p
1309 self.inprogress = None
1314 # This will handle pending exceptions correctly once
1324 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1325 stdin=subprocess.PIPE,
1326 stdout=subprocess.PIPE,
1329 env=_gitenv(self.repo_dir))
1332 """Yield (oidx, type, size), followed by the data referred to by ref.
1333 If ref does not exist, only yield (None, None, None).
1336 if not self.p or self.p.poll() != None:
1339 poll_result = self.p.poll()
1340 assert(poll_result == None)
1342 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1343 assert(not self.inprogress)
1344 assert ref.find(b'\n') < 0
1345 assert ref.find(b'\r') < 0
1346 assert not ref.startswith(b'-')
1347 self.inprogress = ref
1348 self.p.stdin.write(ref + b'\n')
1349 self.p.stdin.flush()
1350 hdr = self.p.stdout.readline()
1352 raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
1353 % (ref, self.p.poll() or 'none'))
1354 if hdr.endswith(b' missing\n'):
1355 self.inprogress = None
1356 yield None, None, None
1358 info = hdr.split(b' ')
1359 if len(info) != 3 or len(info[0]) != 40:
1360 raise GitError('expected object (id, type, size), got %r' % info)
1361 oidx, typ, size = info
1364 it = chunkyreader(self.p.stdout, size)
1365 yield oidx, typ, size
1366 for blob in chunkyreader(self.p.stdout, size):
1368 readline_result = self.p.stdout.readline()
1369 assert readline_result == b'\n'
1370 self.inprogress = None
1371 except Exception as ex:
1372 with pending_raise(ex):
1375 def _join(self, it):
1376 _, typ, _ = next(it)
1380 elif typ == b'tree':
1381 treefile = b''.join(it)
1382 for (mode, name, sha) in tree_decode(treefile):
1383 for blob in self.join(hexlify(sha)):
1385 elif typ == b'commit':
1386 treeline = b''.join(it).split(b'\n')[0]
1387 assert treeline.startswith(b'tree ')
1388 for blob in self.join(treeline[5:]):
1391 raise GitError('invalid object type %r: expected blob/tree/commit'
1395 """Generate a list of the content of all blobs that can be reached
1396 from an object. The hash given in 'id' must point to a blob, a tree
1397 or a commit. The content of all blobs that can be seen from trees or
1398 commits will be added to the list.
1400 for d in self._join(self.get(id)):
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one.

    Pipes are cached per absolute repo path in the module-level _cp
    dict, so repeated calls for the same repository share a single
    `git cat-file --batch` subprocess.
    """
    global _cp
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp
def close_catpipes():
    """Close and discard every cached CatPipe, waiting for each child."""
    # FIXME: chain exceptions
    while _cp:
        _, cp = _cp.popitem()
        cp.close(wait=True)
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[10:]  # strip the b'refs/tags/' prefix
        if c not in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags
class MissingObject(KeyError):
    """Raised when a repository object (by binary oid) can't be found."""
    def __init__(self, oid):
        # Keep the binary oid available to handlers (e.g. for reporting
        # or retry); the elided listing had dropped this assignment.
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
WalkItem = namedtuple('WalkItem', ['oid',
                                   'type',
                                   'mode',
                                   'path',
                                   'chunk_path',
                                   'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
1458 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1459 """Yield everything reachable from oidx via get_ref (which must behave
1460 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1461 returns true. Throw MissingObject if a hash encountered is
1462 missing from the repository, and don't read or return blob content
1463 in the data field unless include_data is set.
1466 # Maintain the pending stack on the heap to avoid stack overflow
1467 pending = [(oidx, [], [], None)]
1469 oidx, parent_path, chunk_path, mode = pending.pop()
1470 oid = unhexlify(oidx)
1471 if stop_at and stop_at(oidx):
1474 if (not include_data) and mode and stat.S_ISREG(mode):
1475 # If the object is a "regular file", then it's a leaf in
1476 # the graph, so we can skip reading the data if the caller
1477 # hasn't requested it.
1478 yield WalkItem(oid=oid, type=b'blob',
1479 chunk_path=chunk_path, path=parent_path,
1484 item_it = get_ref(oidx)
1485 get_oidx, typ, _ = next(item_it)
1487 raise MissingObject(unhexlify(oidx))
1488 if typ not in (b'blob', b'commit', b'tree'):
1489 raise Exception('unexpected repository object type %r' % typ)
1491 # FIXME: set the mode based on the type when the mode is None
1492 if typ == b'blob' and not include_data:
1493 # Dump data until we can ask cat_pipe not to fetch it
1494 for ignored in item_it:
1498 data = b''.join(item_it)
1500 yield WalkItem(oid=oid, type=typ,
1501 chunk_path=chunk_path, path=parent_path,
1503 data=(data if include_data else None))
1505 if typ == b'commit':
1506 commit_items = parse_commit(data)
1507 for pid in commit_items.parents:
1508 pending.append((pid, parent_path, chunk_path, mode))
1509 pending.append((commit_items.tree, parent_path, chunk_path,
1510 hashsplit.GIT_MODE_TREE))
1511 elif typ == b'tree':
1512 for mode, name, ent_id in tree_decode(data):
1513 demangled, bup_type = demangle_name(name, mode)
1515 sub_path = parent_path
1516 sub_chunk_path = chunk_path + [name]
1518 sub_path = parent_path + [name]
1519 if bup_type == BUP_CHUNKED:
1520 sub_chunk_path = [b'']
1522 sub_chunk_path = chunk_path
1523 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,