1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
14 from bup.compat import (buffer,
15 byte_int, bytes_from_byte, bytes_from_uint,
21 from bup.io import path_msg
22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
29 mmap_read, mmap_readwrite,
31 progress, qprogress, stat_if_exists,
37 repodir = None # The default repository, once initialized
39 _typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
40 _typermap = {v: k for k, v in items(_typemap)}
class GitError(Exception):
    """Raised for any failure while interacting with git or a bup repo."""
    pass
def _gitenv(repo_dir=None):
    """Return a copy of the environment with GIT_DIR set to repo_dir.

    When repo_dir is None, fall back to the default repository path.
    """
    if not repo_dir:
        repo_dir = repo()
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
56 def _git_wait(cmd, p):
59 raise GitError('%r returned %d' % (cmd, rv))
def _git_exo(cmd, **kwargs):
    """Run cmd via helpers.exo() with check disabled.

    Return exo's (out, err, proc) result, raising GitError when the
    process exits nonzero.
    """
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    _, _, proc = result
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
    return result
def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
    """Return the value of the given git config option.

    With opttype 'int' or 'bool', let git canonicalize the value and
    return an int or bool; otherwise return the raw bytes.  Return None
    when the option isn't set; raise GitError for any other git config
    failure.
    """
    assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
    cmd = [b'git', b'config', b'--null']
    if cfg_file:
        cmd.extend([b'--file', cfg_file])
    if opttype == 'int':
        cmd.extend([b'--int'])
    elif opttype == 'bool':
        cmd.extend([b'--bool'])
    else:
        assert opttype is None
    cmd.extend([b'--get', option])
    env = None
    if repo_dir:
        env = _gitenv(repo_dir=repo_dir)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
                         close_fds=True)
    # with --null, git writes out a trailing \0 after the value
    r = p.stdout.read()[:-1]
    rc = p.wait()
    if rc == 0:
        if opttype == 'int':
            return int(r)
        elif opttype == 'bool':
            # git converts to 'true' or 'false'
            return r == b'true'
        return r
    # rc == 1 from git config means the option isn't set
    if rc != 1:
        raise GitError('%r returned %d' % (cmd, rc))
    return None
def parse_tz_offset(s):
    """UTC offset in seconds."""
    # s looks like b'+HHMM' or b'-HHMM'
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        return - tz_off
    return tz_off
def parse_commit_gpgsig(sig):
    """Return the original signature bytes.

    i.e. with the "gpgsig " header and the leading space character on
    each continuation line removed.

    """
    if not sig:
        return None
    assert sig.startswith(b'gpgsig ')
    sig = sig[7:]
    if sig.endswith(b'\n'):
        sig = sig[:-1]
    return sig.replace(b'\n ', b'\n')
121 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
122 # Make sure that's authoritative.
125 # https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
126 # The continuation lines have only one leading space.
128 _start_end_char = br'[^ .,:;<>"\'\0\n]'
129 _content_char = br'[^\0\n<>]'
130 _safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
132 _start_end_char, _content_char, _start_end_char)
133 _tz_rx = br'[-+]\d\d[0-5]\d'
134 _parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
135 # Assumes every following line starting with a space is part of the
136 # mergetag. Is there a formal commit blob spec?
137 _mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
138 _commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
139 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
140 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
141 (?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
142 (?P<message>(?:.|\n)*)''' % (_parent_rx,
143 _safe_str_rx, _safe_str_rx, _tz_rx,
144 _safe_str_rx, _safe_str_rx, _tz_rx,
146 _parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
148 # Note that the author_sec and committer_sec values are (UTC) epoch
149 # seconds, and for now the mergetag is not included.
150 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
151 'author_name', 'author_mail',
152 'author_sec', 'author_offset',
153 'committer_name', 'committer_mail',
154 'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse a commit object's bytes into a CommitInfo namedtuple.

    Raise an Exception when content doesn't match _commit_rx.
    """
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      gpgsig=parse_commit_gpgsig(matches['gpgsig']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume cat_iterator's (oidx, type, size) header, verify that the
    object type matches expected_type, and return the remaining chunks
    joined into a single bytes value."""
    _, found_type, _ = next(cat_iterator)
    if found_type == expected_type:
        return b''.join(cat_iterator)
    raise Exception('expected %r, saw %r' % (expected_type, found_type))
def get_commit_items(id, cp):
    """Fetch commit id through CatPipe cp and return its parsed CommitInfo."""
    commit_data = get_cat_data(cp.get(id), b'commit')
    return parse_commit(commit_data)
def _local_git_date_str(epoch_sec):
    # Git date string "<epoch-seconds> <±HHMM>" using the local
    # timezone's offset at epoch_sec.
    offset = utc_offset_str(epoch_sec)
    return b'%d %s' % (epoch_sec, offset)
190 def _git_date_str(epoch_sec, tz_offset_sec):
191 offs = tz_offset_sec // 60
192 return b'%d %s%02d%02d' \
194 b'+' if offs >= 0 else b'-',
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')
    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
        repo_dir = gd
    return os.path.join(repo_dir, sub)
_shorten_hash_rx = \
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')

def shorten_hash(s):
    """Abbreviate any 40-char [0-9a-z] hash-like runs in s to 7 chars + '*'."""
    return _shorten_hash_rx.sub(br'\1\2*\3', s)
def repo_rel(path):
    """Return path for display: relative to the repository when inside it,
    with any index-cache/ prefix dropped and hashes shortened."""
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
        fullrepo += b'/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)
def _run_subcmd_quietly(args):
    # Run args with stdout discarded, recording any failure via
    # add_error() instead of raising so a midx/bloom problem can't
    # abort the caller's save.  The original leaked the devnull file
    # handle (open() with no close); use a with block instead.
    try:
        with open(os.devnull, 'w') as devnull:
            rv = subprocess.call(args, stdout=devnull)
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        return
    if rv:
        add_error('%r: returned %d' % (args, rv))

def auto_midx(objdir):
    """Regenerate the .midx and bloom indexes for objdir (best effort)."""
    _run_subcmd_quietly([path.exe(), b'midx', b'--auto', b'--dir', objdir])
    _run_subcmd_quietly([path.exe(), b'bloom', b'--dir', objdir])
254 def mangle_name(name, mode, gitmode):
255 """Mangle a file name to present an abstract name for segmented files.
256 Mangled file names will have the ".bup" extension added to them. If a
257 file's name already ends with ".bup", a ".bupl" extension is added to
258 disambiguate normal files from segmented ones.
260 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
261 assert(stat.S_ISDIR(gitmode))
262 return name + b'.bup'
263 elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
264 return name + b'.bupl'
(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of
    the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        # Metadata entries: chunked when stored as a directory.
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # git hashes b'<type> <len>\0' + content
    header = b'%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
def shalist_item_sort_key(ent):
    """Sort key matching git's tree ordering: directories sort as "name/"."""
    (mode, name, id) = ent
    assert(mode+0 == mode)  # mode must be an integer
    if stat.S_ISDIR(mode):
        return name + b'/'
    else:
        return name
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode,name,bin)
        # 0-padded octal is not acceptable in a git tree.  Note: under
        # py3, s[0] is an int, so comparing it to b'0' would never
        # match; compare a one-byte slice instead.
        assert s[0:1] != b'0'
        l.append(s)
    return b''.join(l)
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        # Each entry is b'<octal-mode> <name>\0<20-byte sha>'.
        z = buf.find(b'\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        mode,name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    """Yield the pack-encoded form of content: a variable-length header
    carrying the object type and size, then zlib-compressed data."""
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = b''
    sz = len(content)
    # First byte: high bit = "more size bytes follow", 3 type bits,
    # low 4 bits of the size; subsequent bytes carry 7 size bits each.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += bytes_from_uint(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()
def _decode_packobj(buf):
    """Inverse of _encode_packobj: return (type-name, decompressed-content)."""
    assert(buf)
    c = byte_int(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    # Accumulate 7 size bits per byte while the continuation bit is set.
    while c & 0x80:
        i += 1
        c = byte_int(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
class PackIdx(object):
    # Shared behavior for pack index readers; subclasses supply
    # _idx_from_hash/_idx_to_hash/_ofs_from_idx and self.fanout/self.name.
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        return self._ofs_from_idx(idx)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        # With want_source, return this index's basename instead of True
        # so the caller can tell which index produced the hit.
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        # fanout bounds the entries whose first byte is b1; bisect within.
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
        mid = start + (end - start) // 2
        v = self._idx_to_hash(mid)
410 class PackIdxV1(PackIdx):
411 """Object representation of a Git pack index (version 1) file."""
412 def __init__(self, filename, f):
413 super(PackIdxV1, self).__init__()
416 self.idxnames = [self.name]
417 self.map = mmap_read(f)
418 # Min size for 'L' is 4, which is sufficient for struct's '!I'
419 self.fanout = array('L', struct.unpack('!256I', self.map))
420 self.fanout.append(0) # entry "-1"
421 self.nsha = self.fanout[255]
422 self.sha_ofs = 256 * 4
423 # Avoid slicing shatable for individual hashes (very high overhead)
424 self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
429 def __exit__(self, type, value, traceback):
430 with pending_raise(value, rethrow=False):
434 return int(self.nsha) # int() from long for python 2
436 def _ofs_from_idx(self, idx):
437 if idx >= self.nsha or idx < 0:
438 raise IndexError('invalid pack index index %d' % idx)
439 ofs = self.sha_ofs + idx * 24
440 return struct.unpack_from('!I', self.map, offset=ofs)[0]
442 def _idx_to_hash(self, idx):
443 if idx >= self.nsha or idx < 0:
444 raise IndexError('invalid pack index index %d' % idx)
445 ofs = self.sha_ofs + idx * 24 + 4
446 return self.map[ofs : ofs + 20]
449 start = self.sha_ofs + 4
450 for ofs in range(start, start + 24 * self.nsha, 24):
451 yield self.map[ofs : ofs + 20]
455 if self.map is not None:
464 class PackIdxV2(PackIdx):
465 """Object representation of a Git pack index (version 2) file."""
466 def __init__(self, filename, f):
467 super(PackIdxV2, self).__init__()
470 self.idxnames = [self.name]
471 self.map = mmap_read(f)
472 assert self.map[0:8] == b'\377tOc\0\0\0\2'
473 # Min size for 'L' is 4, which is sufficient for struct's '!I'
474 self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
475 self.fanout.append(0)
476 self.nsha = self.fanout[255]
477 self.sha_ofs = 8 + 256*4
478 self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
479 self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
480 # Avoid slicing this for individual hashes (very high overhead)
481 self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
486 def __exit__(self, type, value, traceback):
487 with pending_raise(value, rethrow=False):
491 return int(self.nsha) # int() from long for python 2
493 def _ofs_from_idx(self, idx):
494 if idx >= self.nsha or idx < 0:
495 raise IndexError('invalid pack index index %d' % idx)
496 ofs_ofs = self.ofstable_ofs + idx * 4
497 ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
499 idx64 = ofs & 0x7fffffff
500 ofs64_ofs = self.ofs64table_ofs + idx64 * 8
501 ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
504 def _idx_to_hash(self, idx):
505 if idx >= self.nsha or idx < 0:
506 raise IndexError('invalid pack index index %d' % idx)
507 ofs = self.sha_ofs + idx * 20
508 return self.map[ofs : ofs + 20]
512 for ofs in range(start, start + 20 * self.nsha, 20):
513 yield self.map[ofs : ofs + 20]
517 if self.map is not None:
528 def __init__(self, dir, ignore_midx=False):
530 # Q: was this also intended to prevent opening multiple repos?
531 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
537 self.do_bloom = False
539 self.ignore_midx = ignore_midx
542 except BaseException as ex:
543 with pending_raise(ex):
549 assert _mpi_count == 0
552 assert _mpi_count == 0
554 self.bloom, bloom = None, self.bloom
555 self.packs, packs = None, self.packs
557 with ExitStack() as stack:
559 stack.enter_context(pack)
566 def __exit__(self, type, value, traceback):
567 with pending_raise(value, rethrow=False):
574 return iter(idxmerge(self.packs))
577 return sum(len(pack) for pack in self.packs)
579 def exists(self, hash, want_source=False):
580 """Return nonempty if the object exists in the index files."""
581 global _total_searches
583 if hash in self.also:
585 if self.do_bloom and self.bloom:
586 if self.bloom.exists(hash):
587 self.do_bloom = False
589 _total_searches -= 1 # was counted by bloom
591 for i in range(len(self.packs)):
593 _total_searches -= 1 # will be incremented by sub-pack
594 ix = p.exists(hash, want_source=want_source)
596 # reorder so most recently used packs are searched first
597 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
602 def refresh(self, skip_midx = False):
603 """Refresh the index list.
604 This method verifies if .midx files were superseded (e.g. all of its
605 contents are in another, bigger .midx file) and removes the superseded
608 If skip_midx is True, all work on .midx files will be skipped and .midx
609 files will be removed from the list.
611 The instance variable 'ignore_midx' can force this function to
612 always act as if skip_midx was True.
614 if self.bloom is not None:
616 self.bloom = None # Always reopen the bloom as it may have been relaced
617 self.do_bloom = False
618 skip_midx = skip_midx or self.ignore_midx
619 d = dict((p.name, p) for p in self.packs
620 if not skip_midx or not isinstance(p, midx.PackMidx))
621 if os.path.exists(self.dir):
624 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
625 # remove any *.midx files from our list that no longer exist
626 for ix in list(d.values()):
627 if not isinstance(ix, midx.PackMidx):
629 if ix.name in midxes:
634 self.packs.remove(ix)
635 for ix in self.packs:
636 if isinstance(ix, midx.PackMidx):
637 for name in ix.idxnames:
638 d[os.path.join(self.dir, name)] = ix
641 mx = midx.PackMidx(full)
642 (mxd, mxf) = os.path.split(mx.name)
644 for n in mx.idxnames:
645 if not os.path.exists(os.path.join(mxd, n)):
646 log(('warning: index %s missing\n'
648 % (path_msg(n), path_msg(mxf)))
655 midxl.sort(key=lambda ix:
656 (-len(ix), -xstat.stat(ix.name).st_mtime))
659 for sub in ix.idxnames:
660 found = d.get(os.path.join(self.dir, sub))
661 if not found or isinstance(found, PackIdx):
662 # doesn't exist, or exists but not in a midx
667 for name in ix.idxnames:
668 d[os.path.join(self.dir, name)] = ix
669 elif not ix.force_keep:
670 debug1('midx: removing redundant: %s\n'
671 % path_msg(os.path.basename(ix.name)))
674 for full in glob.glob(os.path.join(self.dir, b'*.idx')):
678 except GitError as e:
682 bfull = os.path.join(self.dir, b'bup.bloom')
683 new_packs = set(d.values())
685 if not p in new_packs:
687 new_packs = list(new_packs)
688 new_packs.sort(reverse=True, key=lambda x: len(x))
689 self.packs = new_packs
690 if self.bloom is None and os.path.exists(bfull):
691 self.bloom = bloom.ShaBloom(bfull)
693 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
697 self.bloom, bloom_tmp = None, self.bloom
699 except BaseException as ex:
700 with pending_raise(ex):
704 debug1('PackIdxList: using %d index%s.\n'
705 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
708 """Insert an additional object in the list."""
def open_idx(filename):
    """Open a pack index by name and return the matching reader.

    Returns a PackIdxV1/PackIdxV2 for .idx files and a midx.PackMidx for
    .midx files; raises GitError for anything else.
    """
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        try:
            header = f.read(8)
            if header[0:4] == b'\377tOc':
                version = struct.unpack('!I', header[4:8])[0]
                if version == 2:
                    return PackIdxV2(filename, f)
                raise GitError('%s: expected idx file version 2, got %d'
                               % (path_msg(filename), version))
            elif len(header) == 8 and header[0:4] < b'\377tOc':
                # v1 files have no magic; they start straight into the fanout.
                return PackIdxV1(filename, f)
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
        except BaseException:
            # Don't leak the file handle when we can't construct a reader.
            f.close()
            raise
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # Only print the final summary line when requested.
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def create_commit_blob(tree, parent,
                       author, adate_sec, adate_tz,
                       committer, cdate_sec, cdate_tz,
                       msg):
    """Return the bytes of a git commit object built from the arguments.

    The *date_sec values are epoch seconds; when a tz offset is None,
    the local timezone's offset is used.  Empty/None fields are omitted.
    """
    if adate_tz is not None:
        adate_str = _git_date_str(adate_sec, adate_tz)
    else:
        adate_str = _local_git_date_str(adate_sec)
    if cdate_tz is not None:
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
    else:
        cdate_str = _local_git_date_str(cdate_sec)
    l = []
    if tree: l.append(b'tree %s' % hexlify(tree))
    if parent: l.append(b'parent %s' % hexlify(parent))
    if author: l.append(b'author %s %s' % (author, adate_str))
    if committer: l.append(b'committer %s %s' % (committer, cdate_str))
    l.append(b'')
    l.append(msg)
    return b'\n'.join(l)
def _make_objcache():
    """Default objcache factory: a PackIdxList over the repo's pack dir."""
    pack_dir = repo(b'objects/pack')
    return PackIdxList(pack_dir)
770 # bup-gc assumes that it can disable all PackWriter activities
771 # (bloom/midx/cache) via the constructor and close() arguments.
773 class PackWriter(object):
774 """Writes Git objects inside a pack file."""
775 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
776 run_midx=True, on_pack_finish=None,
777 max_pack_size=None, max_pack_objects=None, repo_dir=None):
779 self.repo_dir = repo_dir or repo()
786 self.objcache_maker = objcache_maker
788 self.compression_level = compression_level
789 self.run_midx=run_midx
790 self.on_pack_finish = on_pack_finish
791 if not max_pack_size:
792 max_pack_size = git_config_get(b'pack.packSizeLimit',
793 repo_dir=self.repo_dir,
795 if not max_pack_size:
796 # larger packs slow down pruning
797 max_pack_size = 1000 * 1000 * 1000
798 self.max_pack_size = max_pack_size
799 # cache memory usage is about 83 bytes per object
800 self.max_pack_objects = max_pack_objects if max_pack_objects \
801 else max(1, self.max_pack_size // 5000)
806 def __exit__(self, type, value, traceback):
807 with pending_raise(value, rethrow=False):
812 objdir = dir = os.path.join(self.repo_dir, b'objects')
813 fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
815 self.file = os.fdopen(fd, 'w+b')
820 self.parentfd = os.open(objdir, os.O_RDONLY)
826 assert name.endswith(b'.pack')
827 self.filename = name[:-5]
828 self.file.write(b'PACK\0\0\0\2\0\0\0\0')
829 self.idx = PackIdxV2Writer()
831 def _raw_write(self, datalist, sha):
834 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
835 # the file never has a *partial* blob. So let's make sure it's
836 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
837 # to our hashsplit algorithm.) f.write() does its own buffering,
838 # but that's okay because we'll flush it in _end().
839 oneblob = b''.join(datalist)
845 crc = zlib.crc32(oneblob) & 0xffffffff
846 self._update_idx(sha, crc, nw)
851 def _update_idx(self, sha, crc, size):
854 self.idx.add(sha, crc, self.file.tell() - size)
856 def _write(self, sha, type, content):
860 sha = calc_hash(type, content)
861 size, crc = self._raw_write(_encode_packobj(type, content,
862 self.compression_level),
864 if self.outbytes >= self.max_pack_size \
865 or self.count >= self.max_pack_objects:
def _require_objcache(self):
    # Lazily create the object cache on first use; raise when no cache
    # can be made (e.g. the writer was constructed without a maker).
    if self.objcache is None and self.objcache_maker:
        self.objcache = self.objcache_maker()
    if self.objcache is None:
        raise GitError(
            "PackWriter not opened or can't check exists w/o objcache")
def exists(self, id, want_source=False):
    """Return non-empty if an object is found in the object cache."""
    self._require_objcache()
    cache = self.objcache
    return cache.exists(id, want_source=want_source)
def just_write(self, sha, type, content):
    """Write an object to the pack file, skipping the duplication check."""
    self._write(sha, type, content)
    # Record the new object unless there's no cache at all (gc runs
    # without one).
    cache = self.objcache
    if cache is not None:
        cache.add(sha)
def maybe_write(self, type, content):
    """Write an object to the pack file if not present and return its id."""
    sha = calc_hash(type, content)
    if not self.exists(sha):
        self._require_objcache()
        self.just_write(sha, type, content)
    # Always return the id, whether or not we had to write the object;
    # the visible code fell off the end and returned None.
    return sha
def new_blob(self, blob):
    """Store blob's bytes as a git blob in the pack; return its id."""
    return self.maybe_write(b'blob', blob)
def new_tree(self, shalist):
    """Encode shalist as a git tree, write it if missing, return its id."""
    return self.maybe_write(b'tree', tree_encode(shalist))
def new_commit(self, tree, parent,
               author, adate_sec, adate_tz,
               committer, cdate_sec, cdate_tz,
               msg):
    """Create a commit object in the pack. The date_sec values must be
    epoch-seconds, and if a tz is None, the local timezone is assumed."""
    content = create_commit_blob(tree, parent,
                                 author, adate_sec, adate_tz,
                                 committer, cdate_sec, cdate_tz,
                                 msg)
    return self.maybe_write(b'commit', content)
917 def _end(self, run_midx=True, abort=False):
918 # Ignores run_midx during abort
919 self.parentfd, pfd, = None, self.parentfd
920 self.file, f = None, self.file
921 self.idx, idx = None, self.idx
923 with nullcontext_if_not(self.objcache), \
924 finalized(pfd, lambda x: x is not None and os.close(x)), \
925 nullcontext_if_not(f):
930 os.unlink(self.filename + b'.pack')
933 # update object count
935 cp = struct.pack('!i', self.count)
939 # calculate the pack sha1sum
942 for b in chunkyreader(f):
944 packbin = sum.digest()
947 fdatasync(f.fileno())
950 idx.write(self.filename + b'.idx', packbin)
951 nameprefix = os.path.join(self.repo_dir,
952 b'objects/pack/pack-' + hexlify(packbin))
953 if os.path.exists(self.filename + b'.map'):
954 os.unlink(self.filename + b'.map')
955 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
956 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
959 auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
960 if self.on_pack_finish:
961 self.on_pack_finish(nameprefix)
964 # Must be last -- some of the code above depends on it
968 """Remove the pack file from disk."""
970 self._end(abort=True)
def breakpoint(self):
    """Clear byte and object counts and return the last processed id."""
    id = self._end(self.run_midx)
    self.outbytes = self.count = 0
    # Return the finished pack's id as the docstring promises; the
    # visible code fell off the end and returned None.
    return id
def close(self, run_midx=True):
    """Close the pack file and move it to its definitive path."""
    # Delegates to _end(), which finalizes the pack/idx pair and,
    # when run_midx is set, refreshes the midx; returns its result.
    return self._end(run_midx=run_midx)
987 class PackIdxV2Writer:
989 self.idx = list(list() for i in range(256))
992 def add(self, sha, crc, offs):
995 self.idx[byte_int(sha[0])].append((sha, crc, offs))
997 def write(self, filename, packbin):
999 for section in self.idx:
1000 for entry in section:
1001 if entry[2] >= 2**31:
1004 # Length: header + fan-out + shas-and-crcs + overflow-offsets
1005 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
1007 idx_f = open(filename, 'w+b')
1009 idx_f.truncate(index_len)
1010 fdatasync(idx_f.fileno())
1011 idx_map = mmap_readwrite(idx_f, close=False)
1013 count = _helpers.write_idx(filename, idx_map, self.idx,
1015 assert(count == self.count)
1022 idx_f = open(filename, 'a+b')
1024 idx_f.write(packbin)
1027 b = idx_f.read(8 + 4*256)
1030 for b in chunkyreader(idx_f, 20 * self.count):
1033 for b in chunkyreader(idx_f):
1035 idx_f.write(idx_sum.digest())
1036 fdatasync(idx_f.fileno())
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified.  In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)).  The limits
    restrict the result items to refs/heads or refs/tags.  If both
    limits are specified, items from both sources will be included.

    """
    argv = [b'git', b'show-ref']
    if limit_to_heads:
        argv.append(b'--heads')
    if limit_to_tags:
        argv.append(b'--tags')
    argv.append(b'--')
    if patterns:
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
                         close_fds=True)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        # show-ref exits nonzero when nothing matched; expect no output.
        assert(not out)
    if out:
        for d in out.split(b'\n'):
            sha, name = d.split(b' ', 1)
            yield name, unhexlify(sha)
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Take two items so we can assert the pattern matched at most one ref.
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    return None
def rev_list_invocation(ref_or_refs, format=None):
    """Return the git rev-list argv for the given ref(s) and format.

    Refuses refs that begin with '-' so they can't be parsed as options.
    """
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    argv = [b'git', b'rev-list']
    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    return argv
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".  If a format
    is not provided, yield one hex hash at a time.  If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line.  When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.

    """
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                                             format=format),
                         env=_gitenv(repo_dir),
                         stdout = subprocess.PIPE,
                         close_fds=True)
    if not format:
        for line in p.stdout:
            yield line.strip()
    else:
        line = p.stdout.readline()
        while line:
            s = line.strip()
            if not s.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(s))
            s = s[7:]
            assert len(s) == 40
            yield s, parse(p.stdout)
            line = p.stdout.readline()

    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head
    if len(committish) == 40:
        try:
            hash = unhexlify(committish)
        except TypeError:
            return None
        with PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) as pL:
            if pL.exists(hash):
                return hash
    return None
def update_ref(refname, newval, oldval, repo_dir=None, force=False):
    """Update a repository reference.

    With force=True, don't care about the previous ref (oldval);
    with force=False oldval must be either a sha1 or None (for an
    entirely new branch)
    """
    if force:
        assert oldval is None
        oldarg = []
    elif oldval:
        oldarg = [hexlify(oldval)]
    else:
        # An empty old value tells git the ref must not already exist.
        oldarg = [b'']
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval)] + oldarg,
                         env=_gitenv(repo_dir),
                         close_fds=True)
    _git_wait(b'git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
                         env=_gitenv(),
                         close_fds=True)
    # Use a bytes label for consistency with update_ref's error messages.
    _git_wait(b'git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die()."""
    global repodir
    if path:
        repodir = path
    if not repodir:
        # Fall back to $BUP_DIR, then the default ~/.bup location.
        repodir = environ.get(b'BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser(b'~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    # (b'2', not '2': keep the argv uniformly bytes like the rest.)
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % top)
            sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)
def is_suitable_git(ver_str):
    """Classify a "git --version" output as 'suitable', 'insufficient',
    or 'unrecognized'; exits when the numeric version can't be parsed."""
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        # Anything before 1.5.6 (including its release candidates) is
        # too old.
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    sys.exit(13)
def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.  The result is cached in _git_great, so the check only runs
    once per process.
    """
    global _git_great
    if _git_great is not None:
        return
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    if status == 'suitable':
        _git_great = True
        return
    assert False
1299 """Link to 'git cat-file' that is used to retrieve blob data."""
1300 def __init__(self, repo_dir = None):
1301 require_suitable_git()
1302 self.repo_dir = repo_dir
1303 self.p = self.inprogress = None
1305 def close(self, wait=False):
1306 self.p, p = None, self.p
1307 self.inprogress = None
1312 # This will handle pending exceptions correctly once
1322 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1323 stdin=subprocess.PIPE,
1324 stdout=subprocess.PIPE,
1327 env=_gitenv(self.repo_dir))
1330 """Yield (oidx, type, size), followed by the data referred to by ref.
1331 If ref does not exist, only yield (None, None, None).
1334 if not self.p or self.p.poll() != None:
1337 poll_result = self.p.poll()
1338 assert(poll_result == None)
1340 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1341 assert(not self.inprogress)
1342 assert ref.find(b'\n') < 0
1343 assert ref.find(b'\r') < 0
1344 assert not ref.startswith(b'-')
1345 self.inprogress = ref
1346 self.p.stdin.write(ref + b'\n')
1347 self.p.stdin.flush()
1348 hdr = self.p.stdout.readline()
1350 raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
1351 % (ref, self.p.poll() or 'none'))
1352 if hdr.endswith(b' missing\n'):
1353 self.inprogress = None
1354 yield None, None, None
1356 info = hdr.split(b' ')
1357 if len(info) != 3 or len(info[0]) != 40:
1358 raise GitError('expected object (id, type, size), got %r' % info)
1359 oidx, typ, size = info
1362 it = chunkyreader(self.p.stdout, size)
1363 yield oidx, typ, size
1364 for blob in chunkyreader(self.p.stdout, size):
1366 readline_result = self.p.stdout.readline()
1367 assert readline_result == b'\n'
1368 self.inprogress = None
1369 except Exception as ex:
1370 with pending_raise(ex):
1373 def _join(self, it):
1374 _, typ, _ = next(it)
1378 elif typ == b'tree':
1379 treefile = b''.join(it)
1380 for (mode, name, sha) in tree_decode(treefile):
1381 for blob in self.join(hexlify(sha)):
1383 elif typ == b'commit':
1384 treeline = b''.join(it).split(b'\n')[0]
1385 assert treeline.startswith(b'tree ')
1386 for blob in self.join(treeline[5:]):
1389 raise GitError('invalid object type %r: expected blob/tree/commit'
1393 """Generate a list of the content of all blobs that can be reached
1394 from an object. The hash given in 'id' must point to a blob, a tree
1395 or a commit. The content of all blobs that can be seen from trees or
1396 commits will be added to the list.
1398 for d in self._join(self.get(id)):
_cp = {}  # per-repository CatPipe cache, keyed by absolute repo path

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp, repodir
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp
def close_catpipes():
    """Close every cached CatPipe, waiting for each subprocess to exit."""
    # FIXME: chain exceptions
    while _cp:
        _, cp = _cp.popitem()
        cp.close(wait=True)
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        # Strip the b'refs/tags/' prefix to get the bare tag name.
        name = n[10:]
        if c not in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags
class MissingObject(KeyError):
    """Raised when an object (given by binary oid) is absent from the repo."""
    def __init__(self, oid):
        # Keep the oid so callers can report or act on the missing object.
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
# One object encountered while traversing a repository (see walk_object()).
WalkItem = namedtuple(
    'WalkItem', ['oid', 'type', 'mode', 'path', 'chunk_path', 'data'])
# path is the mangled path.  For an item that is a fragment of a chunked
# file, chunk_path is the chunked subtree path of that fragment, e.g.
# ['', '2d3115e', ...]; the top-level item of a chunked file has a
# chunk_path of [''].  So some chunk subtree of the file '/foo/bar/baz'
# might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
1456 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1457 """Yield everything reachable from oidx via get_ref (which must behave
1458 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1459 returns true. Throw MissingObject if a hash encountered is
1460 missing from the repository, and don't read or return blob content
1461 in the data field unless include_data is set.
1464 # Maintain the pending stack on the heap to avoid stack overflow
1465 pending = [(oidx, [], [], None)]
1467 oidx, parent_path, chunk_path, mode = pending.pop()
1468 oid = unhexlify(oidx)
1469 if stop_at and stop_at(oidx):
1472 if (not include_data) and mode and stat.S_ISREG(mode):
1473 # If the object is a "regular file", then it's a leaf in
1474 # the graph, so we can skip reading the data if the caller
1475 # hasn't requested it.
1476 yield WalkItem(oid=oid, type=b'blob',
1477 chunk_path=chunk_path, path=parent_path,
1482 item_it = get_ref(oidx)
1483 get_oidx, typ, _ = next(item_it)
1485 raise MissingObject(unhexlify(oidx))
1486 if typ not in (b'blob', b'commit', b'tree'):
1487 raise Exception('unexpected repository object type %r' % typ)
1489 # FIXME: set the mode based on the type when the mode is None
1490 if typ == b'blob' and not include_data:
1491 # Dump data until we can ask cat_pipe not to fetch it
1492 for ignored in item_it:
1496 data = b''.join(item_it)
1498 yield WalkItem(oid=oid, type=typ,
1499 chunk_path=chunk_path, path=parent_path,
1501 data=(data if include_data else None))
1503 if typ == b'commit':
1504 commit_items = parse_commit(data)
1505 for pid in commit_items.parents:
1506 pending.append((pid, parent_path, chunk_path, mode))
1507 pending.append((commit_items.tree, parent_path, chunk_path,
1508 hashsplit.GIT_MODE_TREE))
1509 elif typ == b'tree':
1510 for mode, name, ent_id in tree_decode(data):
1511 demangled, bup_type = demangle_name(name, mode)
1513 sub_path = parent_path
1514 sub_chunk_path = chunk_path + [name]
1516 sub_path = parent_path + [name]
1517 if bup_type == BUP_CHUNKED:
1518 sub_chunk_path = [b'']
1520 sub_chunk_path = chunk_path
1521 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,