1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
14 from bup.compat import (buffer,
15 byte_int, bytes_from_byte, bytes_from_uint,
22 from bup.io import path_msg
23 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
30 mmap_read, mmap_readwrite,
32 progress, qprogress, stat_if_exists,
repodir = None # The default repository, once initialized
# Git object type name -> pack object type code (cf. the git pack format),
# and the reverse mapping for decoding pack entries.
_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}
class GitError(Exception):
    """Raised for git-related failures (bad subprocess exit status,
    malformed repository data, missing repository, etc.)."""
def _gitenv(repo_dir=None):
    """Return an environment for git subprocesses with GIT_DIR forced to
    the absolute path of repo_dir."""
    # assumes repo_dir has been defaulted to the active repository by the
    # (elided) lines above when None was passed -- TODO confirm
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
def _git_wait(cmd, p):
    """Wait for subprocess p (started as cmd) and raise GitError on a
    nonzero exit status."""
    # rv: presumably the exit status collected from p.wait() above -- confirm
    raise GitError('%r returned %d' % (cmd, rv))
def _git_exo(cmd, **kwargs):
    """Run cmd via helpers.exo() with check disabled, raising GitError
    ourselves when the process exits nonzero."""
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    # proc: presumably unpacked from result above -- confirm
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
    """Look up option via 'git config --null --get'.

    opttype may be 'int' or 'bool' to have git canonicalize the value;
    repo_dir and cfg_file select the configuration source and are
    mutually exclusive."""
    assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
    cmd = [b'git', b'config', b'--null']
    cmd.extend([b'--file', cfg_file])
    cmd.extend([b'--int'])
    elif opttype == 'bool':
        cmd.extend([b'--bool'])
    assert opttype is None
    cmd.extend([b'--get', option])
    env = _gitenv(repo_dir=repo_dir)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
    # with --null, git writes out a trailing \0 after the value
    r = p.stdout.read()[:-1]
    elif opttype == 'bool':
        # git converts to 'true' or 'false'
    raise GitError('%r returned %d' % (cmd, rc))
def parse_tz_offset(s):
    """UTC offset in seconds."""
    # s is a git timezone field like b'+0130' or b'-0700':
    # sign at [0], hours at [1:3], minutes at [3:5].
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        # presumably negates tz_off before returning -- confirm
109 def parse_commit_gpgsig(sig):
110 """Return the original signature bytes.
112 i.e. with the "gpgsig " header and the leading space character on
113 each continuation line removed.
118 assert sig.startswith(b'gpgsig ')
120 return sig.replace(b'\n ', b'\n')
122 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
123 # Make sure that's authoritative.
126 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
127 # The continuation lines have only one leading space.
129 _start_end_char = br'[^ .,:;<>"\'\0\n]'
130 _content_char = br'[^\0\n<>]'
131 _safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
133 _start_end_char, _content_char, _start_end_char)
134 _tz_rx = br'[-+]\d\d[0-5]\d'
135 _parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
136 # Assumes every following line starting with a space is part of the
137 # mergetag. Is there a formal commit blob spec?
138 _mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
139 _commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
140 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
141 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
142 (?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
143 (?P<message>(?:.|\n)*)''' % (_parent_rx,
144 _safe_str_rx, _safe_str_rx, _tz_rx,
145 _safe_str_rx, _safe_str_rx, _tz_rx,
147 _parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
149 # Note that the author_sec and committer_sec values are (UTC) epoch
150 # seconds, and for now the mergetag is not included.
151 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
152 'author_name', 'author_mail',
153 'author_sec', 'author_offset',
154 'committer_name', 'committer_mail',
155 'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse a raw commit blob into a CommitInfo namedtuple.

    Raises Exception if content does not match _commit_rx."""
    commit_match = re.match(_commit_rx, content)
    # NOTE: guarded -- only reached when the regex failed to match
    raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      gpgsig=parse_commit_gpgsig(matches['gpgsig']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Drain a cat-pipe style iterator into a single bytes object.

    The iterator's first item must be an (oidx, type, size) header whose
    type equals expected_type; the remaining items are data chunks that
    are concatenated and returned.  Raises Exception on a type mismatch.
    """
    oidx, kind, size = next(cat_iterator)
    if kind == expected_type:
        return b''.join(cat_iterator)
    raise Exception('expected %r, saw %r' % (expected_type, kind))
def get_commit_items(id, cp):
    """Fetch commit `id` through the cat pipe `cp` and return it parsed
    as a CommitInfo namedtuple."""
    data = get_cat_data(cp.get(id), b'commit')
    return parse_commit(data)
def _local_git_date_str(epoch_sec):
    """Format epoch_sec as a git date string using the local UTC offset."""
    offset = utc_offset_str(epoch_sec)
    return b'%d %s' % (epoch_sec, offset)
def _git_date_str(epoch_sec, tz_offset_sec):
    """Format epoch seconds and an explicit tz offset (in seconds) as a
    git date string: b'<sec> [+-]HHMM'."""
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        b'+' if offs >= 0 else b'-',
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    # No repository is known yet; callers must run check_repo_or_die() first.
    raise GitError('You should call check_repo_or_die()')
    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
    return os.path.join(repo_dir, sub)
215 re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
218 return _shorten_hash_rx.sub(br'\1\2*\3', s)
222 full = os.path.abspath(path)
223 fullrepo = os.path.abspath(repo(b''))
224 if not fullrepo.endswith(b'/'):
226 if full.startswith(fullrepo):
227 path = full[len(fullrepo):]
228 if path.startswith(b'index-cache/'):
229 path = path[len(b'index-cache/'):]
230 return shorten_hash(path)
def auto_midx(objdir):
    """Run 'bup midx --auto' and 'bup bloom' on objdir, best-effort.

    Failures are recorded via add_error() rather than raised."""
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    # NOTE(review): open(os.devnull, 'w') is never closed here (or below),
    # leaking a file handle per call -- consider subprocess.DEVNULL.
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
    args = [path.exe(), b'bloom', b'--dir', objdir]
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # Regular file stored as a git tree => it was chunked by bup.
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        # The name itself collides with the mangling scheme; tag it '.bupl'.
        return name + b'.bupl'
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        # Metadata entry: chunked iff the corresponding tree entry is a dir.
        BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git object ids are the sha1 of "<type> <size>\0" + content.
    header = b'%s %d\0' % (type, len(content))
def shalist_item_sort_key(ent):
    # Sort key for (mode, name, id) tree entries; git compares directory
    # names as if they had a trailing '/' -- presumably reflected by the
    # (elided) directory branch below -- TODO confirm.
    (mode, name, id) = ent
    assert(mode+0 == mode)  # mode must be numeric
    if stat.S_ISDIR(mode):
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    for (mode,name,bin) in shalist:
        assert(mode+0 == mode)
        assert(len(bin) == 20)  # bin is the raw (binary) sha1
        # Entry format: "<octal mode> <name>\0<20-byte sha>"
        s = b'%o %s\0%s' % (mode,name,bin)
        # NOTE(review): under Python 3, s[0] is an int while b'0' is bytes,
        # so this comparison is always True and the assert can never fire;
        # it was probably meant to be s[0:1] != b'0'.
        assert s[0] != b'0' # 0-padded octal is not acceptable in a git tree
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    # Each entry is "<octal mode> <name>\0" followed by a 20-byte raw sha1.
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        sha = buf[z+1:z+1+20]
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    """Yield the pack-file encoding of an object: a variable-length
    size/type header followed by zlib-compressed content."""
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    # First header byte: low 4 bits of the size plus the type code in bits 4-6.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    if sz: szbits |= 0x80  # continuation bit: more size bytes follow
    szout += bytes_from_uint(szbits)
    z = zlib.compressobj(compression_level)
    yield z.compress(content)
def _decode_packobj(buf):
    """Return (type, decompressed-content) for a pack-encoded object."""
    type = _typermap[(c & 0x70) >> 4]  # type code lives in bits 4-6
    sz |= (c & 0x7f) << shift  # accumulate 7-bit size groups
    return (type, zlib.decompress(buf[i+1:]))
class PackIdx(object):
    """Common lookup logic for pack index readers; subclasses supply
    fanout/name and implement _ofs_from_idx/_idx_to_hash."""
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        return self._ofs_from_idx(idx)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True
    def _idx_from_hash(self, hash):
        # Binary search within the fanout-narrowed range: fanout entry i
        # bounds the entries whose first hash byte is i.
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
        mid = start + (end - start) // 2
        v = self._idx_to_hash(mid)
411 class PackIdxV1(PackIdx):
412 """Object representation of a Git pack index (version 1) file."""
413 def __init__(self, filename, f):
414 super(PackIdxV1, self).__init__()
417 self.idxnames = [self.name]
418 self.map = mmap_read(f)
419 # Min size for 'L' is 4, which is sufficient for struct's '!I'
420 self.fanout = array('L', struct.unpack('!256I', self.map))
421 self.fanout.append(0) # entry "-1"
422 self.nsha = self.fanout[255]
423 self.sha_ofs = 256 * 4
424 # Avoid slicing shatable for individual hashes (very high overhead)
425 self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
430 def __exit__(self, type, value, traceback):
431 with pending_raise(value, rethrow=False):
435 return int(self.nsha) # int() from long for python 2
437 def _ofs_from_idx(self, idx):
438 if idx >= self.nsha or idx < 0:
439 raise IndexError('invalid pack index index %d' % idx)
440 ofs = self.sha_ofs + idx * 24
441 return struct.unpack_from('!I', self.map, offset=ofs)[0]
443 def _idx_to_hash(self, idx):
444 if idx >= self.nsha or idx < 0:
445 raise IndexError('invalid pack index index %d' % idx)
446 ofs = self.sha_ofs + idx * 24 + 4
447 return self.map[ofs : ofs + 20]
450 start = self.sha_ofs + 4
451 for ofs in range(start, start + 24 * self.nsha, 24):
452 yield self.map[ofs : ofs + 20]
456 if self.map is not None:
465 class PackIdxV2(PackIdx):
466 """Object representation of a Git pack index (version 2) file."""
467 def __init__(self, filename, f):
468 super(PackIdxV2, self).__init__()
471 self.idxnames = [self.name]
472 self.map = mmap_read(f)
473 assert self.map[0:8] == b'\377tOc\0\0\0\2'
474 # Min size for 'L' is 4, which is sufficient for struct's '!I'
475 self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
476 self.fanout.append(0)
477 self.nsha = self.fanout[255]
478 self.sha_ofs = 8 + 256*4
479 self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
480 self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
481 # Avoid slicing this for individual hashes (very high overhead)
482 self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
487 def __exit__(self, type, value, traceback):
488 with pending_raise(value, rethrow=False):
492 return int(self.nsha) # int() from long for python 2
494 def _ofs_from_idx(self, idx):
495 if idx >= self.nsha or idx < 0:
496 raise IndexError('invalid pack index index %d' % idx)
497 ofs_ofs = self.ofstable_ofs + idx * 4
498 ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
500 idx64 = ofs & 0x7fffffff
501 ofs64_ofs = self.ofs64table_ofs + idx64 * 8
502 ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
505 def _idx_to_hash(self, idx):
506 if idx >= self.nsha or idx < 0:
507 raise IndexError('invalid pack index index %d' % idx)
508 ofs = self.sha_ofs + idx * 20
509 return self.map[ofs : ofs + 20]
513 for ofs in range(start, start + 20 * self.nsha, 20):
514 yield self.map[ofs : ofs + 20]
518 if self.map is not None:
529 def __init__(self, dir, ignore_midx=False):
531 # Q: was this also intended to prevent opening multiple repos?
532 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
538 self.do_bloom = False
540 self.ignore_midx = ignore_midx
543 except BaseException as ex:
544 with pending_raise(ex):
550 assert _mpi_count == 0
553 assert _mpi_count == 0
555 self.bloom, bloom = None, self.bloom
556 self.packs, packs = None, self.packs
558 with ExitStack() as stack:
560 stack.enter_context(pack)
567 def __exit__(self, type, value, traceback):
568 with pending_raise(value, rethrow=False):
575 return iter(idxmerge(self.packs))
578 return sum(len(pack) for pack in self.packs)
580 def exists(self, hash, want_source=False):
581 """Return nonempty if the object exists in the index files."""
582 global _total_searches
584 if hash in self.also:
586 if self.do_bloom and self.bloom:
587 if self.bloom.exists(hash):
588 self.do_bloom = False
590 _total_searches -= 1 # was counted by bloom
592 for i in range(len(self.packs)):
594 _total_searches -= 1 # will be incremented by sub-pack
595 ix = p.exists(hash, want_source=want_source)
597 # reorder so most recently used packs are searched first
598 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
603 def refresh(self, skip_midx = False):
604 """Refresh the index list.
605 This method verifies if .midx files were superseded (e.g. all of its
606 contents are in another, bigger .midx file) and removes the superseded
609 If skip_midx is True, all work on .midx files will be skipped and .midx
610 files will be removed from the list.
612 The instance variable 'ignore_midx' can force this function to
613 always act as if skip_midx was True.
615 if self.bloom is not None:
617 self.bloom = None # Always reopen the bloom as it may have been relaced
618 self.do_bloom = False
619 skip_midx = skip_midx or self.ignore_midx
620 d = dict((p.name, p) for p in self.packs
621 if not skip_midx or not isinstance(p, midx.PackMidx))
622 if os.path.exists(self.dir):
625 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
626 # remove any *.midx files from our list that no longer exist
627 for ix in list(d.values()):
628 if not isinstance(ix, midx.PackMidx):
630 if ix.name in midxes:
635 self.packs.remove(ix)
636 for ix in self.packs:
637 if isinstance(ix, midx.PackMidx):
638 for name in ix.idxnames:
639 d[os.path.join(self.dir, name)] = ix
642 mx = midx.PackMidx(full)
643 (mxd, mxf) = os.path.split(mx.name)
645 for n in mx.idxnames:
646 if not os.path.exists(os.path.join(mxd, n)):
647 log(('warning: index %s missing\n'
649 % (path_msg(n), path_msg(mxf)))
656 midxl.sort(key=lambda ix:
657 (-len(ix), -xstat.stat(ix.name).st_mtime))
660 for sub in ix.idxnames:
661 found = d.get(os.path.join(self.dir, sub))
662 if not found or isinstance(found, PackIdx):
663 # doesn't exist, or exists but not in a midx
668 for name in ix.idxnames:
669 d[os.path.join(self.dir, name)] = ix
670 elif not ix.force_keep:
671 debug1('midx: removing redundant: %s\n'
672 % path_msg(os.path.basename(ix.name)))
675 for full in glob.glob(os.path.join(self.dir, b'*.idx')):
679 except GitError as e:
683 bfull = os.path.join(self.dir, b'bup.bloom')
684 new_packs = set(d.values())
686 if not p in new_packs:
688 new_packs = list(new_packs)
689 new_packs.sort(reverse=True, key=lambda x: len(x))
690 self.packs = new_packs
691 if self.bloom is None and os.path.exists(bfull):
692 self.bloom = bloom.ShaBloom(bfull)
694 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
698 self.bloom, bloom_tmp = None, self.bloom
700 except BaseException as ex:
701 with pending_raise(ex):
705 debug1('PackIdxList: using %d index%s.\n'
706 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
709 """Insert an additional object in the list."""
def open_idx(filename):
    """Open a pack index by name, returning a PackIdxV1, PackIdxV2, or
    PackMidx as appropriate; raises GitError for anything unrecognized."""
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        # v2+ indexes begin with the magic b'\377tOc' and a version word.
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            return PackIdxV2(filename, f)
        raise GitError('%s: expected idx file version 2, got %d'
                       % (path_msg(filename), version))
    # v1 indexes have no magic; they start directly with the fanout table.
    elif len(header) == 8 and header[0:4] < b'\377tOc':
        return PackIdxV1(filename, f)
        raise GitError('%s: unrecognized idx file header'
                       % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        # Transient progress line (overwritten in place via \r).
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # Final, persistent progress line.
        progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                 % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
# Build the raw bytes of a commit object.  The *date_sec values are epoch
# seconds; when a tz value is None the local timezone offset is used.
def create_commit_blob(tree, parent,
                       author, adate_sec, adate_tz,
                       committer, cdate_sec, cdate_tz,
    if adate_tz is not None:
        adate_str = _git_date_str(adate_sec, adate_tz)
        adate_str = _local_git_date_str(adate_sec)
    if cdate_tz is not None:
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
        cdate_str = _local_git_date_str(cdate_sec)
    # l: presumably accumulates the commit header lines -- confirm
    if tree: l.append(b'tree %s' % hexlify(tree))
    if parent: l.append(b'parent %s' % hexlify(parent))
    if author: l.append(b'author %s %s' % (author, adate_str))
    if committer: l.append(b'committer %s %s' % (committer, cdate_str))
def _make_objcache():
    """Default objcache factory: a PackIdxList over the repo's pack dir."""
    pack_dir = repo(b'objects/pack')
    return PackIdxList(pack_dir)
771 # bup-gc assumes that it can disable all PackWriter activities
772 # (bloom/midx/cache) via the constructor and close() arguments.
774 class PackWriter(object):
775 """Writes Git objects inside a pack file."""
776 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
777 run_midx=True, on_pack_finish=None,
778 max_pack_size=None, max_pack_objects=None, repo_dir=None):
780 self.repo_dir = repo_dir or repo()
787 self.objcache_maker = objcache_maker
789 self.compression_level = compression_level
790 self.run_midx=run_midx
791 self.on_pack_finish = on_pack_finish
792 if not max_pack_size:
793 max_pack_size = git_config_get(b'pack.packSizeLimit',
794 repo_dir=self.repo_dir,
796 if not max_pack_size:
797 # larger packs slow down pruning
798 max_pack_size = 1000 * 1000 * 1000
799 self.max_pack_size = max_pack_size
800 # cache memory usage is about 83 bytes per object
801 self.max_pack_objects = max_pack_objects if max_pack_objects \
802 else max(1, self.max_pack_size // 5000)
807 def __exit__(self, type, value, traceback):
808 with pending_raise(value, rethrow=False):
813 objdir = dir = os.path.join(self.repo_dir, b'objects')
814 fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
816 self.file = os.fdopen(fd, 'w+b')
821 self.parentfd = os.open(objdir, os.O_RDONLY)
827 assert name.endswith(b'.pack')
828 self.filename = name[:-5]
829 self.file.write(b'PACK\0\0\0\2\0\0\0\0')
830 self.idx = PackIdxV2Writer()
832 def _raw_write(self, datalist, sha):
835 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
836 # the file never has a *partial* blob. So let's make sure it's
837 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
838 # to our hashsplit algorithm.) f.write() does its own buffering,
839 # but that's okay because we'll flush it in _end().
840 oneblob = b''.join(datalist)
846 crc = zlib.crc32(oneblob) & 0xffffffff
847 self._update_idx(sha, crc, nw)
852 def _update_idx(self, sha, crc, size):
855 self.idx.add(sha, crc, self.file.tell() - size)
857 def _write(self, sha, type, content):
861 sha = calc_hash(type, content)
862 size, crc = self._raw_write(_encode_packobj(type, content,
863 self.compression_level),
865 if self.outbytes >= self.max_pack_size \
866 or self.count >= self.max_pack_objects:
870 def _require_objcache(self):
871 if self.objcache is None and self.objcache_maker:
872 self.objcache = self.objcache_maker()
873 if self.objcache is None:
875 "PackWriter not opened or can't check exists w/o objcache")
877 def exists(self, id, want_source=False):
878 """Return non-empty if an object is found in the object cache."""
879 self._require_objcache()
880 return self.objcache.exists(id, want_source=want_source)
882 def just_write(self, sha, type, content):
883 """Write an object to the pack file without checking for duplication."""
884 self._write(sha, type, content)
885 # If nothing else, gc doesn't have/want an objcache
886 if self.objcache is not None:
887 self.objcache.add(sha)
889 def maybe_write(self, type, content):
890 """Write an object to the pack file if not present and return its id."""
891 sha = calc_hash(type, content)
892 if not self.exists(sha):
893 self._require_objcache()
894 self.just_write(sha, type, content)
897 def new_blob(self, blob):
898 """Create a blob object in the pack with the supplied content."""
899 return self.maybe_write(b'blob', blob)
901 def new_tree(self, shalist):
902 """Create a tree object in the pack."""
903 content = tree_encode(shalist)
904 return self.maybe_write(b'tree', content)
906 def new_commit(self, tree, parent,
907 author, adate_sec, adate_tz,
908 committer, cdate_sec, cdate_tz,
910 """Create a commit object in the pack. The date_sec values must be
911 epoch-seconds, and if a tz is None, the local timezone is assumed."""
912 content = create_commit_blob(tree, parent,
913 author, adate_sec, adate_tz,
914 committer, cdate_sec, cdate_tz,
916 return self.maybe_write(b'commit', content)
918 def _end(self, run_midx=True, abort=False):
919 # Ignores run_midx during abort
920 self.parentfd, pfd, = None, self.parentfd
921 self.file, f = None, self.file
922 self.idx, idx = None, self.idx
924 with nullcontext_if_not(self.objcache), \
925 finalized(pfd, lambda x: x is not None and os.close(x)), \
926 nullcontext_if_not(f):
931 os.unlink(self.filename + b'.pack')
934 # update object count
936 cp = struct.pack('!i', self.count)
940 # calculate the pack sha1sum
943 for b in chunkyreader(f):
945 packbin = sum.digest()
948 fdatasync(f.fileno())
951 idx.write(self.filename + b'.idx', packbin)
952 nameprefix = os.path.join(self.repo_dir,
953 b'objects/pack/pack-' + hexlify(packbin))
954 if os.path.exists(self.filename + b'.map'):
955 os.unlink(self.filename + b'.map')
956 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
957 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
960 auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
961 if self.on_pack_finish:
962 self.on_pack_finish(nameprefix)
965 # Must be last -- some of the code above depends on it
969 """Remove the pack file from disk."""
971 self._end(abort=True)
973 def breakpoint(self):
974 """Clear byte and object counts and return the last processed id."""
975 id = self._end(self.run_midx)
976 self.outbytes = self.count = 0
979 def close(self, run_midx=True):
980 """Close the pack file and move it to its definitive path."""
982 return self._end(run_midx=run_midx)
988 class PackIdxV2Writer:
990 self.idx = list(list() for i in range(256))
993 def add(self, sha, crc, offs):
996 self.idx[byte_int(sha[0])].append((sha, crc, offs))
998 def write(self, filename, packbin):
1000 for section in self.idx:
1001 for entry in section:
1002 if entry[2] >= 2**31:
1005 # Length: header + fan-out + shas-and-crcs + overflow-offsets
1006 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
1008 idx_f = open(filename, 'w+b')
1010 idx_f.truncate(index_len)
1011 fdatasync(idx_f.fileno())
1012 idx_map = mmap_readwrite(idx_f, close=False)
1014 count = _helpers.write_idx(filename, idx_map, self.idx,
1016 assert(count == self.count)
1023 idx_f = open(filename, 'a+b')
1025 idx_f.write(packbin)
1028 b = idx_f.read(8 + 4*256)
1031 for b in chunkyreader(idx_f, 20 * self.count):
1034 for b in chunkyreader(idx_f):
1036 idx_f.write(idx_sum.digest())
1037 fdatasync(idx_f.fileno())
1042 def list_refs(patterns=None, repo_dir=None,
1043 limit_to_heads=False, limit_to_tags=False):
1044 """Yield (refname, hash) tuples for all repository refs unless
1045 patterns are specified. In that case, only include tuples for
1046 refs matching those patterns (cf. git-show-ref(1)). The limits
1047 restrict the result items to refs/heads or refs/tags. If both
1048 limits are specified, items from both sources will be included.
1051 argv = [b'git', b'show-ref']
1053 argv.append(b'--heads')
1055 argv.append(b'--tags')
1058 argv.extend(patterns)
1059 p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
1061 out = p.stdout.read().strip()
1062 rv = p.wait() # not fatal
1066 for d in out.split(b'\n'):
1067 sha, name = d.split(b' ', 1)
1068 yield name, unhexlify(sha)
1071 def read_ref(refname, repo_dir = None):
1072 """Get the commit id of the most recent commit made on a given ref."""
1073 refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
1074 l = tuple(islice(refs, 2))
1082 def rev_list_invocation(ref_or_refs, format=None):
1083 if isinstance(ref_or_refs, bytes):
1084 refs = (ref_or_refs,)
1087 argv = [b'git', b'rev-list']
1090 argv.append(b'--pretty=format:' + format)
1092 assert not ref.startswith(b'-')
1098 def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
1099 """Yield information about commits as per "git rev-list". If a format
1100 is not provided, yield one hex hash at a time. If a format is
1101 provided, pass it to rev-list and call parse(git_stdout) for each
1102 commit with the stream positioned just after the rev-list "commit
1103 HASH" header line. When a format is provided yield (oidx,
1104 parse(git_stdout)) for each commit.
1107 assert bool(parse) == bool(format)
1108 p = subprocess.Popen(rev_list_invocation(ref_or_refs,
1110 env=_gitenv(repo_dir),
1111 stdout = subprocess.PIPE,
1114 for line in p.stdout:
1117 line = p.stdout.readline()
1120 if not s.startswith(b'commit '):
1121 raise Exception('unexpected line ' + repr(s))
1124 yield s, parse(p.stdout)
1125 line = p.stdout.readline()
1127 rv = p.wait() # not fatal
1129 raise GitError('git rev-list returned error %d' % rv)
1132 def rev_parse(committish, repo_dir=None):
1133 """Resolve the full hash for 'committish', if it exists.
1135 Should be roughly equivalent to 'git rev-parse'.
1137 Returns the hex value of the hash if it is found, None if 'committish' does
1138 not correspond to anything.
1140 head = read_ref(committish, repo_dir=repo_dir)
1142 debug2("resolved from ref: commit = %s\n" % hexlify(head))
1145 if len(committish) == 40:
1147 hash = unhexlify(committish)
1151 with PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) as pL:
def update_ref(refname, newval, oldval, repo_dir=None, force=False):
    """Update a repository reference.

    With force=True, don't care about the previous ref (oldval);
    with force=False oldval must be either a sha1 or None (for an
    entirely new branch)
    """
        assert oldval is None
        oldarg = [hexlify(oldval)]
    # Only branch and tag refs may be updated through this interface.
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval)] + oldarg,
                         env=_gitenv(repo_dir),
    _git_wait(b'git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    # When given, oldvalue is passed so git verifies it before deleting.
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
    _git_wait('git update-ref', p)
1191 def guess_repo(path=None):
1192 """Set the path value in the global variable "repodir".
1193 This makes bup look for an existing bup repository, but not fail if a
1194 repository doesn't exist. Usually, if you are interacting with a bup
1195 repository, you would not be calling this function but using
1196 check_repo_or_die().
1202 repodir = environ.get(b'BUP_DIR')
1204 repodir = os.path.expanduser(b'~/.bup')
1207 def init_repo(path=None):
1208 """Create the Git bare repository for bup in a given path."""
1210 d = repo() # appends a / to the path
1211 parent = os.path.dirname(os.path.dirname(d))
1212 if parent and not os.path.exists(parent):
1213 raise GitError('parent directory "%s" does not exist\n'
1215 if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
1216 raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
1217 p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
1220 _git_wait('git init', p)
1221 # Force the index version configuration in order to ensure bup works
1222 # regardless of the version of the installed Git binary.
1223 p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
1224 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1225 _git_wait('git config', p)
1227 p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
1228 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1229 _git_wait('git config', p)
1232 def check_repo_or_die(path=None):
1233 """Check to see if a bup repository probably exists, and abort if not."""
1236 pst = stat_if_exists(top + b'/objects/pack')
1237 if pst and stat.S_ISDIR(pst.st_mode):
1240 top_st = stat_if_exists(top)
1242 log('error: repository %r does not exist (see "bup help init")\n'
1245 log('error: %s is not a repository\n' % path_msg(top))
def is_suitable_git(ver_str):
    """Classify a 'git --version' output string.

    Returns 'unrecognized' for unparsable output, 'insufficient' for
    versions older than 1.5.6, and presumably an accepting value from the
    (elided) branch after the final version match -- TODO confirm."""
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        # Everything before 1.5.6 (including its release candidates) is too old.
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
1271 def require_suitable_git(ver_str=None):
1272 """Raise GitError if the version of git isn't suitable.
1274 Rely on ver_str when provided, rather than invoking the git in the
1279 if _git_great is not None:
1281 if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
1282 in (b'yes', b'true', b'1'):
1286 ver_str, _, _ = _git_exo([b'git', b'--version'])
1287 status = is_suitable_git(ver_str)
1288 if status == 'unrecognized':
1289 raise GitError('Unexpected git --version output: %r' % ver_str)
1290 if status == 'insufficient':
1291 log('error: git version must be at least 1.5.6\n')
1293 if status == 'suitable':
1300 """Link to 'git cat-file' that is used to retrieve blob data."""
1301 def __init__(self, repo_dir = None):
1302 require_suitable_git()
1303 self.repo_dir = repo_dir
1304 self.p = self.inprogress = None
1306 def close(self, wait=False):
1307 self.p, p = None, self.p
1308 self.inprogress = None
1313 # This will handle pending exceptions correctly once
1323 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1324 stdin=subprocess.PIPE,
1325 stdout=subprocess.PIPE,
1328 env=_gitenv(self.repo_dir))
1331 """Yield (oidx, type, size), followed by the data referred to by ref.
1332 If ref does not exist, only yield (None, None, None).
1335 if not self.p or self.p.poll() != None:
1338 poll_result = self.p.poll()
1339 assert(poll_result == None)
1341 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1342 assert(not self.inprogress)
1343 assert ref.find(b'\n') < 0
1344 assert ref.find(b'\r') < 0
1345 assert not ref.startswith(b'-')
1346 self.inprogress = ref
1347 self.p.stdin.write(ref + b'\n')
1348 self.p.stdin.flush()
1349 hdr = self.p.stdout.readline()
1351 raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
1352 % (ref, self.p.poll() or 'none'))
1353 if hdr.endswith(b' missing\n'):
1354 self.inprogress = None
1355 yield None, None, None
1357 info = hdr.split(b' ')
1358 if len(info) != 3 or len(info[0]) != 40:
1359 raise GitError('expected object (id, type, size), got %r' % info)
1360 oidx, typ, size = info
1363 it = chunkyreader(self.p.stdout, size)
1364 yield oidx, typ, size
1365 for blob in chunkyreader(self.p.stdout, size):
1367 readline_result = self.p.stdout.readline()
1368 assert readline_result == b'\n'
1369 self.inprogress = None
1370 except Exception as ex:
1371 with pending_raise(ex):
1374 def _join(self, it):
1375 _, typ, _ = next(it)
1379 elif typ == b'tree':
1380 treefile = b''.join(it)
1381 for (mode, name, sha) in tree_decode(treefile):
1382 for blob in self.join(hexlify(sha)):
1384 elif typ == b'commit':
1385 treeline = b''.join(it).split(b'\n')[0]
1386 assert treeline.startswith(b'tree ')
1387 for blob in self.join(treeline[5:]):
1390 raise GitError('invalid object type %r: expected blob/tree/commit'
1394 """Generate a list of the content of all blobs that can be reached
1395 from an object. The hash given in 'id' must point to a blob, a tree
1396 or a commit. The content of all blobs that can be seen from trees or
1397 commits will be added to the list.
1399 for d in self._join(self.get(id)):
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one.

    Pipes are cached per absolute repo path in the module-level _cp
    dict, so repeated calls for the same repository share a single
    'git cat-file' subprocess.
    """
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    # Renamed local (was 'cp') to avoid shadowing this function.
    pipe = _cp.get(repo_dir)
    if not pipe:
        pipe = CatPipe(repo_dir)
        _cp[repo_dir] = pipe
    return pipe
def close_catpipes():
    """Close every cached CatPipe (see cp()), waiting for each to exit."""
    # FIXME: chain exceptions
    while _cp:
        _, cp = _cp.popitem()
        cp.close(wait=True)
def tags(repo_dir=None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir=repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[len(b'refs/tags/'):]
        tags.setdefault(c, []).append(name)  # more than one tag can point at 'c'
    return tags
class MissingObject(KeyError):
    # Raised when a requested object id cannot be found in the
    # repository; the binary object id is available as .oid.
    def __init__(self, oid):
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
1457 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1458 """Yield everything reachable from oidx via get_ref (which must behave
1459 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1460 returns true. Throw MissingObject if a hash encountered is
1461 missing from the repository, and don't read or return blob content
1462 in the data field unless include_data is set.
1465 # Maintain the pending stack on the heap to avoid stack overflow
1466 pending = [(oidx, [], [], None)]
1468 oidx, parent_path, chunk_path, mode = pending.pop()
1469 oid = unhexlify(oidx)
1470 if stop_at and stop_at(oidx):
1473 if (not include_data) and mode and stat.S_ISREG(mode):
1474 # If the object is a "regular file", then it's a leaf in
1475 # the graph, so we can skip reading the data if the caller
1476 # hasn't requested it.
1477 yield WalkItem(oid=oid, type=b'blob',
1478 chunk_path=chunk_path, path=parent_path,
1483 item_it = get_ref(oidx)
1484 get_oidx, typ, _ = next(item_it)
1486 raise MissingObject(unhexlify(oidx))
1487 if typ not in (b'blob', b'commit', b'tree'):
1488 raise Exception('unexpected repository object type %r' % typ)
1490 # FIXME: set the mode based on the type when the mode is None
1491 if typ == b'blob' and not include_data:
1492 # Dump data until we can ask cat_pipe not to fetch it
1493 for ignored in item_it:
1497 data = b''.join(item_it)
1499 yield WalkItem(oid=oid, type=typ,
1500 chunk_path=chunk_path, path=parent_path,
1502 data=(data if include_data else None))
1504 if typ == b'commit':
1505 commit_items = parse_commit(data)
1506 for pid in commit_items.parents:
1507 pending.append((pid, parent_path, chunk_path, mode))
1508 pending.append((commit_items.tree, parent_path, chunk_path,
1509 hashsplit.GIT_MODE_TREE))
1510 elif typ == b'tree':
1511 for mode, name, ent_id in tree_decode(data):
1512 demangled, bup_type = demangle_name(name, mode)
1514 sub_path = parent_path
1515 sub_chunk_path = chunk_path + [name]
1517 sub_path = parent_path + [name]
1518 if bup_type == BUP_CHUNKED:
1519 sub_chunk_path = [b'']
1521 sub_chunk_path = chunk_path
1522 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,