1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
14 from bup.compat import (buffer,
15 byte_int, bytes_from_byte, bytes_from_uint,
22 from bup.io import path_msg
23 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
30 mmap_read, mmap_readwrite,
32 progress, qprogress, stat_if_exists,
repodir = None  # The default repository, once initialized

# git object type name -> pack entry type code.
_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
# pack entry type code -> git object type name (inverse of _typemap).
# NOTE(review): items() presumably comes from the (elided) bup.compat import.
_typermap = {v: k for k, v in items(_typemap)}


class GitError(Exception):
    """Error raised for git/bup-repository level failures in this module."""
def _gitenv(repo_dir=None):
    # Return a copy of the environment with GIT_DIR pointing at repo_dir.
    # NOTE(review): the fallback used when repo_dir is None appears to be
    # elided in this view -- verify (presumably it defaults to repo()).
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})

def _git_wait(cmd, p):
    # Wait for subprocess p and raise GitError on a nonzero exit status.
    # NOTE(review): the `rv = p.wait()` / `if rv != 0:` lines appear elided.
    raise GitError('%r returned %d' % (cmd, rv))

def _git_exo(cmd, **kwargs):
    # Run cmd via exo() with check disabled, then enforce the exit status
    # ourselves so we can raise GitError instead of exo's own error.
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    # NOTE(review): the unpacking of `proc` from result appears elided.
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
    # Read one git-config value (bytes), optionally letting git coerce it
    # via --int/--bool.  repo_dir and cfg_file are mutually exclusive.
    # NOTE(review): several guard lines (cfg_file/opttype branches, the
    # Popen kwargs, and the int/bool result conversion) appear elided.
    assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
    cmd = [b'git', b'config', b'--null']
    cmd.extend([b'--file', cfg_file])
    cmd.extend([b'--int'])
    elif opttype == 'bool':
        cmd.extend([b'--bool'])
    assert opttype is None
    cmd.extend([b'--get', option])
    env = _gitenv(repo_dir=repo_dir)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
    # with --null, git writes out a trailing \0 after the value
    r = p.stdout.read()[:-1]
    elif opttype == 'bool':
        # git converts to 'true' or 'false'
    raise GitError('%r returned %d' % (cmd, rc))
def parse_tz_offset(s):
    """Return the UTC offset in seconds for a git timezone field.

    s is bytes of the form b'<sign>HHMM' (e.g. b'-0500').
    """
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    # Compare a one-byte slice so this works identically on python 2 and 3
    # (indexing bytes yields an int on python 3).
    if s[0:1] == b'-':
        return -tz_off
    return tz_off
def parse_commit_gpgsig(sig):
    """Return the original signature bytes.

    i.e. with the "gpgsig " header and the leading space character on
    each continuation line removed.  Returns None when sig is empty or
    None (the gpgsig field is optional in a commit).
    """
    if not sig:
        return None
    assert sig.startswith(b'gpgsig ')
    # Strip the header; the docstring's contract requires it to be removed.
    sig = sig[len(b'gpgsig '):]
    # Continuation lines carry exactly one leading space each.
    return sig.replace(b'\n ', b'\n')
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Regexes used to pull apart a raw commit blob; see also
# https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
# The continuation lines have only one leading space.
# NOTE(review): the '%' interpolation argument lines for _safe_str_rx and
# _commit_rx appear partially elided in this view.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    _start_end_char, _content_char, _start_end_char)
# [-+]HHMM timezone field
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
_safe_str_rx, _safe_str_rx, _tz_rx,
_safe_str_rx, _safe_str_rx, _tz_rx,
# Extracts each 40-hex parent hash from the combined <parents> group.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
#
# The field list must include 'gpgsig' and 'message' -- parse_commit()
# populates both keywords; the visible list was truncated.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'gpgsig', 'message'])
def parse_commit(content):
    """Parse a raw git commit blob (bytes) into a CommitInfo namedtuple.

    Raises Exception when content does not match the commit format.
    """
    commit_match = re.match(_commit_rx, content)
    # The raise must be guarded -- the visible code raised unconditionally.
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      # gpgsig is an optional group and may be None here.
                      gpgsig=parse_commit_gpgsig(matches['gpgsig']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume the (oidx, type, size) header from cat_iterator, check that
    the type matches expected_type, and return the remaining data chunks
    joined into a single bytes object."""
    _, kind, _ = next(cat_iterator)
    if kind == expected_type:
        return b''.join(cat_iterator)
    raise Exception('expected %r, saw %r' % (expected_type, kind))
def get_commit_items(id, cp):
    """Fetch commit `id` through cat-pipe `cp` and return its parsed
    CommitInfo."""
    data = get_cat_data(cp.get(id), b'commit')
    return parse_commit(data)
def _local_git_date_str(epoch_sec):
    """Format epoch_sec as git's b'<secs> <±HHMM>' using the local
    timezone offset."""
    offset = utc_offset_str(epoch_sec)
    return b'%d %s' % (epoch_sec, offset)
191 def _git_date_str(epoch_sec, tz_offset_sec):
192 offs = tz_offset_sec // 60
193 return b'%d %s%02d%02d' \
195 b'+' if offs >= 0 else b'-',
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    # NOTE(review): several lines appear elided here (the guard before the
    # raise, and the if-body redirecting repo_dir into the .git subdir).
    repo_dir = repo_dir or repodir
    raise GitError('You should call check_repo_or_die()')
    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
    return os.path.join(repo_dir, sub)
# Matches a full 40-char lowercase-hex hash with its immediate boundaries;
# groups 1 and 3 capture the surrounding context, group 2 the first 7 chars.
_shorten_hash_rx = \
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')

def shorten_hash(s):
    """Abbreviate any full 40-char hashes in s to 7 chars plus '*'.

    The assignment and def lines were elided in this view; reconstructed
    from the orphaned compile/return statements and the call in repo_rel.
    """
    return _shorten_hash_rx.sub(br'\1\2*\3', s)
def repo_rel(path):
    """Return path relative to the current repository (with any
    'index-cache/' prefix dropped) and with full hashes shortened, for use
    in progress/log messages.

    The def line was elided in this view; reconstructed around the
    otherwise-complete visible body.
    """
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
        fullrepo += b'/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)
def auto_midx(objdir):
    # Best-effort maintenance: regenerate the .midx and bloom data for
    # objdir via the bup subcommands, recording (not raising) failures
    # through add_error.
    # NOTE(review): the try/except scaffolding around both calls appears
    # elided in this view (`e` below is otherwise unbound).
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
    args = [path.exe(), b'bloom', b'--dir', objdir]
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.

    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # A regular file stored chunked, i.e. as a git tree of segments.
        assert stat.S_ISDIR(gitmode)
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    else:
        # Plain names pass through unchanged (the visible code fell off the
        # end and returned None here).
        return name
(BUP_NORMAL, BUP_CHUNKED) = (0, 1)

def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is
    one of the following:

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name().
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        # Metadata entry: chunked only when stored as a tree/directory.
        # (The visible code had an orphaned expression here -- the
        # `return (name[:-5],` line was missing.)
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion, i.e. the SHA-1 of
    b'<type> <len>\\0' followed by the content itself."""
    # Local import: uses hashlib directly rather than the helpers' Sha1
    # alias (assumed equivalent -- both produce git-compatible SHA-1s).
    from hashlib import sha1
    header = b'%s %d\0' % (type, len(content))
    s = sha1(header)
    s.update(content)
    return s.digest()
def shalist_item_sort_key(ent):
    """Sort key matching git's tree ordering: a directory entry sorts as if
    its name carried a trailing '/'."""
    (mode, name, id) = ent
    assert mode + 0 == mode  # mode must be an integer
    if stat.S_ISDIR(mode):
        return name + b'/'
    return name


def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key=shalist_item_sort_key)
    chunks = []
    for (mode, name, bin) in shalist:
        assert mode
        assert mode + 0 == mode
        assert name
        assert len(bin) == 20
        s = b'%o %s\0%s' % (mode, name, bin)
        # 0-padded octal is not acceptable in a git tree.  Compare the first
        # *byte value* -- on python 3, s[0] is an int, so the original
        # `s[0] != b'0'` was always true and checked nothing.
        assert s[0] != b'0'[0]
        chunks.append(s)
    return b''.join(chunks)
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf.

    Each entry is b'<octal-mode> <name>\\0' followed by a 20-byte sha.  The
    visible code left ofs/mode/name unbound (init, unpack, and advance
    lines were missing); restored here.
    """
    ofs = 0
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        assert z > ofs  # every entry must have a "<mode> <name>" prefix
        spl = buf[ofs:z].split(b' ', 1)
        assert len(spl) == 2
        mode, name = spl
        sha = buf[z+1:z+1+20]
        ofs = z + 1 + 20
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    # Yield the packfile encoding of one object: a variable-length
    # size/type header followed by zlib-compressed content.
    # NOTE(review): the size-varint loop (sz init/shifts) and the final
    # z.flush() yield appear elided in this view.
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    if sz: szbits |= 0x80
    szout += bytes_from_uint(szbits)
    z = zlib.compressobj(compression_level)
    yield z.compress(content)


def _decode_packobj(buf):
    # Inverse of _encode_packobj: parse the header byte(s) from buf and
    # return (type-name, decompressed-content).
    # NOTE(review): the header-walking lines (c/i/sz/shift) appear elided.
    type = _typermap[(c & 0x70) >> 4]
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
def find_offset(self, hash):
    """Get the offset of an object inside the index file."""
    # NOTE(review): the None-check guarding the return appears elided.
    idx = self._idx_from_hash(hash)
    return self._ofs_from_idx(idx)

def exists(self, hash, want_source=False):
    """Return nonempty if the object exists in this index."""
    if hash and (self._idx_from_hash(hash) != None):
        return want_source and os.path.basename(self.name) or True
    # Implicitly returns None when the hash is absent.

def _idx_from_hash(self, hash):
    # Binary-search for hash inside this index's fanout bucket; returns the
    # entry index, or presumably None when not found -- the loop
    # structure/termination lines appear elided in this view.
    global _total_searches, _total_steps
    assert(len(hash) == 20)
    b1 = byte_int(hash[0])
    start = self.fanout[b1-1] # range -1..254
    end = self.fanout[b1] # range 0..255
    _total_steps += 1 # lookup table is a step
    mid = start + (end - start) // 2
    v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    # v1 layout: 256-entry fanout table of 4-byte counts, then nsha entries
    # of (4-byte offset + 20-byte sha).
    # NOTE(review): several lines appear elided in this view (self.name
    # assignment, __enter__/__len__/__iter__/close definitions).

    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack('!256I', self.map))
        self.fanout.append(0) # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
        # NOTE(review): the line below looks like the body of an elided
        # __len__ definition.
        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24  # each entry is 24 bytes, offset first
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4  # skip the 4-byte offset field
        return self.map[ofs : ofs + 20]

    # NOTE(review): the lines below look like the bodies of elided
    # __iter__ and close definitions.
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

        if self.map is not None:
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    # v2 layout: 8-byte magic/version, 256-entry fanout, sha table, crc
    # table, 4-byte offset table, then 8-byte offsets for entries >= 2GiB.
    # NOTE(review): several lines appear elided (self.name assignment,
    # __enter__/__len__/__iter__/close definitions).

    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert self.map[0:8] == b'\377tOc\0\0\0\2'  # magic + version 2
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # sha table (20 bytes each) + crc table (4 bytes each) precede the
        # 4-byte offset table, which precedes the 64-bit overflow table.
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
        # NOTE(review): the line below looks like an elided __len__ body.
        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        # High bit set means the low 31 bits index the 64-bit offset table.
        idx64 = ofs & 0x7fffffff
        ofs64_ofs = self.ofs64table_ofs + idx64 * 8
        ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

    # NOTE(review): the lines below look like the bodies of elided
    # __iter__ and close definitions.
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

        if self.map is not None:
def __init__(self, dir, ignore_midx=False):
    # Aggregate view over all pack indexes (.idx/.midx) in dir.
    # Q: was this also intended to prevent opening multiple repos?
    assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
    # NOTE(review): several attribute initializations and method
    # definitions appear elided in this view.
    self.do_bloom = False
    self.ignore_midx = ignore_midx

    # NOTE(review): the following fragments look like elided __del__ and
    # close definitions (both re-check the module-wide instance count).
    assert _mpi_count == 0
    assert _mpi_count == 0
    self.bloom, bloom = None, self.bloom
    self.packs, packs = None, self.packs
    with ExitStack() as stack:
        stack.enter_context(pack)

def __exit__(self, type, value, traceback):
    with pending_raise(value, rethrow=False):
    # NOTE(review): the lines below look like elided __iter__/__len__.
    return iter(idxmerge(self.packs))
    return sum(len(pack) for pack in self.packs)

def exists(self, hash, want_source=False):
    """Return nonempty if the object exists in the index files."""
    global _total_searches
    if hash in self.also:
    # Consult the bloom filter first; a bloom miss is definitive, a hit
    # must be confirmed against the real indexes.
    if self.do_bloom and self.bloom:
        if self.bloom.exists(hash):
            self.do_bloom = False
            _total_searches -= 1 # was counted by bloom
    for i in range(len(self.packs)):
        _total_searches -= 1 # will be incremented by sub-pack
        ix = p.exists(hash, want_source=want_source)
        # reorder so most recently used packs are searched first
        self.packs = [p] + self.packs[:i] + self.packs[i+1:]
def refresh(self, skip_midx = False):
    """Refresh the index list.
    This method verifies if .midx files were superseded (e.g. all of its
    contents are in another, bigger .midx file) and removes the superseded
    files.

    If skip_midx is True, all work on .midx files will be skipped and .midx
    files will be removed from the list.

    The instance variable 'ignore_midx' can force this function to
    always act as if skip_midx was True.
    """
    # NOTE(review): a number of control-flow lines appear elided in this
    # view; annotations below are best-effort.
    if self.bloom is not None:
    self.bloom = None # Always reopen the bloom as it may have been relaced
    self.do_bloom = False
    skip_midx = skip_midx or self.ignore_midx
    # Start from the packs we already have open, keyed by file name.
    d = dict((p.name, p) for p in self.packs
             if not skip_midx or not isinstance(p, midx.PackMidx))
    if os.path.exists(self.dir):
        midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
        # remove any *.midx files from our list that no longer exist
        for ix in list(d.values()):
            if not isinstance(ix, midx.PackMidx):
            if ix.name in midxes:
            self.packs.remove(ix)
        for ix in self.packs:
            if isinstance(ix, midx.PackMidx):
                for name in ix.idxnames:
                    d[os.path.join(self.dir, name)] = ix
        # Open any new .midx files, warning about missing member indexes.
        mx = midx.PackMidx(full)
        (mxd, mxf) = os.path.split(mx.name)
        for n in mx.idxnames:
            if not os.path.exists(os.path.join(mxd, n)):
                log(('warning: index %s missing\n'
                    % (path_msg(n), path_msg(mxf)))
        # Prefer bigger, more recent midxes.
        midxl.sort(key=lambda ix:
                   (-len(ix), -xstat.stat(ix.name).st_mtime))
        for sub in ix.idxnames:
            found = d.get(os.path.join(self.dir, sub))
            if not found or isinstance(found, PackIdx):
                # doesn't exist, or exists but not in a midx
        for name in ix.idxnames:
            d[os.path.join(self.dir, name)] = ix
        elif not ix.force_keep:
            debug1('midx: removing redundant: %s\n'
                   % path_msg(os.path.basename(ix.name)))
        # Pick up any plain .idx files not already covered.
        for full in glob.glob(os.path.join(self.dir, b'*.idx')):
        except GitError as e:
    bfull = os.path.join(self.dir, b'bup.bloom')
    self.packs = list(set(d.values()))
    self.packs.sort(reverse=True, key=lambda x: len(x))
    if self.bloom is None and os.path.exists(bfull):
        self.bloom = bloom.ShaBloom(bfull)
    # Only trust the bloom when it covers at least everything we index.
    if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
    self.bloom, bloom_tmp = None, self.bloom
    except BaseException as ex:
        with pending_raise(ex):
    debug1('PackIdxList: using %d index%s.\n'
           % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    # NOTE(review): the docstring below belongs to an elided add() def.
    """Insert an additional object in the list."""
def open_idx(filename):
    # Open a pack index by extension: .idx -> PackIdxV1/V2 (by header),
    # .midx -> midx.PackMidx.
    # NOTE(review): the header read and the version-2 branch guard appear
    # elided in this view.
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            return PackIdxV2(filename, f)
            raise GitError('%s: expected idx file version 2, got %d'
                           % (path_msg(filename), version))
        # v1 files have no magic; they begin directly with the fanout.
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # NOTE(review): the `if final_progress:` guard appears elided here.
        progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                 % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def create_commit_blob(tree, parent,
                       author, adate_sec, adate_tz,
                       committer, cdate_sec, cdate_tz,
    # Build the raw bytes of a git commit object.  A None tz means "use the
    # local timezone" via _local_git_date_str.
    # NOTE(review): the trailing msg parameter, the `l = []` initialization,
    # and the message/join/return lines appear elided in this view.
    if adate_tz is not None:
        adate_str = _git_date_str(adate_sec, adate_tz)
        adate_str = _local_git_date_str(adate_sec)
    if cdate_tz is not None:
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
        cdate_str = _local_git_date_str(cdate_sec)
    if tree: l.append(b'tree %s' % hexlify(tree))
    if parent: l.append(b'parent %s' % hexlify(parent))
    if author: l.append(b'author %s %s' % (author, adate_str))
    if committer: l.append(b'committer %s %s' % (committer, cdate_str))
def _make_objcache():
    """Default objcache factory for PackWriter: an index over every local
    pack in the repository."""
    pack_dir = repo(b'objects/pack')
    return PackIdxList(pack_dir)
# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.

# NOTE(review): the PackWriter class statement appears elided; the
# docstring below belongs to it.
"""Writes Git objects inside a pack file."""
def __init__(self, objcache_maker=_make_objcache, compression_level=1,
             run_midx=True, on_pack_finish=None,
             max_pack_size=None, max_pack_objects=None, repo_dir=None):
    self.repo_dir = repo_dir or repo()
    self.objcache_maker = objcache_maker
    self.compression_level = compression_level
    self.run_midx=run_midx
    self.on_pack_finish = on_pack_finish
    if not max_pack_size:
        # Fall back to the repository's configured limit, if any.
        max_pack_size = git_config_get(b'pack.packSizeLimit',
                                       repo_dir=self.repo_dir,
    if not max_pack_size:
        # larger packs slow down pruning
        max_pack_size = 1000 * 1000 * 1000
    self.max_pack_size = max_pack_size
    # cache memory usage is about 83 bytes per object
    self.max_pack_objects = max_pack_objects if max_pack_objects \
        else max(1, self.max_pack_size // 5000)
def __exit__(self, type, value, traceback):
    with pending_raise(value, rethrow=False):

# NOTE(review): the def line for _open appears elided; the body below
# lazily creates the temporary pack file on first write.
    objdir = dir = os.path.join(self.repo_dir, b'objects')
    fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
    self.file = os.fdopen(fd, 'w+b')
    # Keep the objects dir open so it can be fsynced on finish.
    self.parentfd = os.open(objdir, os.O_RDONLY)
    assert name.endswith(b'.pack')
    self.filename = name[:-5]
    # Pack header: magic, version 2, object count placeholder (patched
    # later in _end).
    self.file.write(b'PACK\0\0\0\2\0\0\0\0')
    self.idx = PackIdxV2Writer()
def _raw_write(self, datalist, sha):
    # in case we get interrupted (eg. KeyboardInterrupt), it's best if
    # the file never has a *partial* blob. So let's make sure it's
    # all-or-nothing. (The blob shouldn't be very big anyway, thanks
    # to our hashsplit algorithm.) f.write() does its own buffering,
    # but that's okay because we'll flush it in _end().
    oneblob = b''.join(datalist)
    # NOTE(review): the _open() call, the actual write, and the nw/count
    # bookkeeping appear elided in this view.
    crc = zlib.crc32(oneblob) & 0xffffffff
    self._update_idx(sha, crc, nw)

def _update_idx(self, sha, crc, size):
    # Record the object at its starting offset (current position - size).
    self.idx.add(sha, crc, self.file.tell() - size)

def _write(self, sha, type, content):
    # Encode and write one object; roll over to a new pack when the size
    # or object-count limit is reached.
    # NOTE(review): guard lines (e.g. around the sha recompute and the
    # breakpoint call) appear elided.
    sha = calc_hash(type, content)
    size, crc = self._raw_write(_encode_packobj(type, content,
                                                self.compression_level),
    if self.outbytes >= self.max_pack_size \
       or self.count >= self.max_pack_objects:
def _require_objcache(self):
    # Lazily build the objcache; fail when no maker was configured (e.g.
    # when PackWriter was opened for gc-style use without duplicate checks).
    if self.objcache is None and self.objcache_maker:
        self.objcache = self.objcache_maker()
    if self.objcache is None:
        # NOTE(review): the `raise GitError(` line appears elided here.
        "PackWriter not opened or can't check exists w/o objcache")
def exists(self, id, want_source=False):
    """Return non-empty if an object is found in the object cache."""
    self._require_objcache()
    cache = self.objcache
    return cache.exists(id, want_source=want_source)
def just_write(self, sha, type, content):
    """Write an object to the pack file without checking for duplication."""
    self._write(sha, type, content)
    # gc runs without an objcache; only record the sha when one exists.
    cache = self.objcache
    if cache is not None:
        cache.add(sha)
def maybe_write(self, type, content):
    """Write an object to the pack file if not present and return its id."""
    sha = calc_hash(type, content)
    if not self.exists(sha):
        self._require_objcache()
        self.just_write(sha, type, content)
    # The id must be returned either way -- callers (new_blob/new_tree/
    # new_commit) pass this value through; the visible code never returned.
    return sha
def new_blob(self, blob):
    """Create a blob object in the pack with the supplied content."""
    return self.maybe_write(b'blob', blob)

def new_tree(self, shalist):
    """Create a tree object in the pack from (mode,name,hash) tuples."""
    return self.maybe_write(b'tree', tree_encode(shalist))
def new_commit(self, tree, parent,
               author, adate_sec, adate_tz,
               committer, cdate_sec, cdate_tz,
    # NOTE(review): the trailing msg parameter/argument lines appear
    # elided in this view.
    """Create a commit object in the pack. The date_sec values must be
    epoch-seconds, and if a tz is None, the local timezone is assumed."""
    content = create_commit_blob(tree, parent,
                                 author, adate_sec, adate_tz,
                                 committer, cdate_sec, cdate_tz,
    return self.maybe_write(b'commit', content)
def _end(self, run_midx=True, abort=False):
    # Finish (or abort) the current pack: patch the object count into the
    # header, append the pack checksum, write the .idx, and rename both
    # into their final pack-<sha> names.
    # Ignores run_midx during abort
    # NOTE(review): many lines appear elided in this view (early return,
    # context-manager bodies, seek/write calls, sum init, final return).
    self.file, f = None, self.file
    self.idx, idx = None, self.idx
    self.parentfd, pfd, = None, self.parentfd
    with nullcontext_if_not(self.objcache), \
         finalized(pfd, lambda x: x is not None and os.close(x)), \
    # abort path: discard the temporary pack
    os.unlink(self.filename + b'.pack')
    # update object count
    cp = struct.pack('!i', self.count)
    # calculate the pack sha1sum
    for b in chunkyreader(f):
    packbin = sum.digest()
    fdatasync(f.fileno())
    idx.write(self.filename + b'.idx', packbin)
    nameprefix = os.path.join(self.repo_dir,
                              b'objects/pack/pack-' + hexlify(packbin))
    if os.path.exists(self.filename + b'.map'):
        os.unlink(self.filename + b'.map')
    os.rename(self.filename + b'.pack', nameprefix + b'.pack')
    os.rename(self.filename + b'.idx', nameprefix + b'.idx')
    auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
    if self.on_pack_finish:
        self.on_pack_finish(nameprefix)
    # Must be last -- some of the code above depends on it

# NOTE(review): the def line for abort() appears elided; its docstring
# and body follow.
    """Remove the pack file from disk."""
    self._end(abort=True)
def breakpoint(self):
    """Clear byte and object counts and return the last processed id."""
    id = self._end(self.run_midx)
    self.outbytes = self.count = 0
    # The finished pack's id must be returned per the docstring; the
    # visible code dropped it.
    return id
def close(self, run_midx=True):
    """Close the pack file and move it to its definitive path."""
    result = self._end(run_midx=run_midx)
    return result
class PackIdxV2Writer:
    # Builds a version-2 pack index (.idx) for a freshly written pack.
    # NOTE(review): several lines appear elided in this view (__init__ def,
    # count bookkeeping, ofs64 handling, seek calls, sha sum init/update,
    # and try/finally scaffolding).

    # NOTE(review): the line below looks like an elided __init__ body:
    # one bucket per possible first byte of a sha.
        self.idx = list(list() for i in range(256))

    def add(self, sha, crc, offs):
        self.idx[byte_int(sha[0])].append((sha, crc, offs))

    def write(self, filename, packbin):
        # Count entries needing 64-bit offsets (>= 2 GiB into the pack).
        for section in self.idx:
            for entry in section:
                if entry[2] >= 2**31:
        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        idx_f.truncate(index_len)
        fdatasync(idx_f.fileno())
        idx_map = mmap_readwrite(idx_f, close=False)
        # The C helper fills the mmapped region with the index contents.
        count = _helpers.write_idx(filename, idx_map, self.idx,
        assert(count == self.count)
        # Append the pack checksum, then the index's own trailing sha1.
        idx_f = open(filename, 'a+b')
        idx_f.write(packbin)
        b = idx_f.read(8 + 4*256)
        for b in chunkyreader(idx_f, 20 * self.count):
        for b in chunkyreader(idx_f):
        idx_f.write(idx_sum.digest())
        fdatasync(idx_f.fileno())
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.
    """
    # NOTE(review): the guard lines around --heads/--tags/patterns and the
    # empty-output check appear elided in this view.
    argv = [b'git', b'show-ref']
    argv.append(b'--heads')
    argv.append(b'--tags')
    argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    for d in out.split(b'\n'):
        sha, name = d.split(b' ', 1)
        yield name, unhexlify(sha)


def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Take at most two matches so ambiguity can be detected.
    # NOTE(review): the assert/return lines appear elided.
    l = tuple(islice(refs, 2))
def rev_list_invocation(ref_or_refs, format=None):
    # Build the argv for "git rev-list" over one or more refs.
    # NOTE(review): the else-branch, the --pretty guard, the per-ref loop
    # and the return appear elided in this view.
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    argv = [b'git', b'rev-list']
    argv.append(b'--pretty=format:' + format)
    # refuse anything that could be parsed as an option
    assert not ref.startswith(b'-')


def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.
    """
    # NOTE(review): both loop bodies (no-format and format cases) appear
    # partially elided in this view.
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                         env=_gitenv(repo_dir),
                         stdout = subprocess.PIPE,
    for line in p.stdout:
    line = p.stdout.readline()
    if not s.startswith(b'commit '):
        raise Exception('unexpected line ' + repr(s))
    yield s, parse(p.stdout)
    line = p.stdout.readline()
    rv = p.wait() # not fatal
    raise GitError('git rev-list returned error %d' % rv)
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish'
    does not correspond to anything.
    """
    # NOTE(review): several lines appear elided (the ref-hit return, the
    # try/except around unhexlify, and the pack lookup/return).
    head = read_ref(committish, repo_dir=repo_dir)
    debug2("resolved from ref: commit = %s\n" % hexlify(head))
    # Maybe it's a raw 40-char hex hash.
    if len(committish) == 40:
        hash = unhexlify(committish)
    # Confirm the hash actually exists in the local packs.
    with PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) as pL:
def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    # NOTE(review): the assertion on newval and the Popen close appear
    # elided in this view.
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir),
    _git_wait(b'git update-ref', p)


def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    # NOTE(review): the env=_gitenv() Popen kwargs appear elided here.
    # Also note the str (vs bytes) cmd label passed to _git_wait below --
    # harmless (only used via %r) but inconsistent with update_ref.
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # NOTE(review): the global declaration and the guards choosing between
    # path, BUP_DIR, and ~/.bup appear elided in this view.
    repodir = environ.get(b'BUP_DIR')
    repodir = os.path.expanduser(b'~/.bup')


def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    # NOTE(review): guess_repo(path) call, some raise-argument lines, and
    # the Popen env/close_fds kwargs for "git init" appear elided.
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    # NOTE(review): '2' here is a str among bytes args -- verify intended.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    # Enable the reflog so ref updates can be audited/recovered.
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    # NOTE(review): the guess_repo(path) call, `top = repo()`, and the
    # sys.exit calls appear elided in this view.
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
    top_st = stat_if_exists(top)
    log('error: repository %r does not exist (see "bup help init")\n'
    log('error: %s is not a repository\n' % path_msg(top))
def is_suitable_git(ver_str):
    # Classify a `git --version` string as 'unrecognized', 'insufficient',
    # or (presumably, in elided lines) 'suitable'.  bup requires >= 1.5.6.
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
    # NOTE(review): the 'suitable' returns and the final fallback appear
    # elided in this view.
    if re.match(br'[0-9]+(\.|$)?', ver_str):


def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.
    """
    # NOTE(review): the global _git_great bookkeeping and several returns
    # appear elided in this view.
    if _git_great is not None:
    # Escape hatch for users who know their git is fine.
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
    ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
    if status == 'suitable':
# NOTE(review): the CatPipe class statement appears elided; the docstring
# below belongs to it.
"""Link to 'git cat-file' that is used to retrieve blob data."""
def __init__(self, repo_dir = None):
    require_suitable_git()
    self.repo_dir = repo_dir
    # p is the long-running `git cat-file --batch` subprocess; inprogress
    # tracks the ref currently being streamed (at most one at a time).
    self.p = self.inprogress = None
def close(self, wait=False):
    # Tear down the subprocess; swap attributes out first so a failure
    # mid-close can't leave a half-valid object.
    # NOTE(review): the stdin/stdout closes and the wait handling appear
    # elided in this view.
    self.p, p = None, self.p
    self.inprogress = None
    # This will handle pending exceptions correctly once

# NOTE(review): the restart() def line appears elided; the Popen below
# (re)spawns the batch cat-file subprocess.
    self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              env=_gitenv(self.repo_dir))
# NOTE(review): the get(self, ref) def line appears elided; the docstring
# and body below belong to it.
    """Yield (oidx, type, size), followed by the data referred to by ref.
    If ref does not exist, only yield (None, None, None).
    """
    # (Re)start the subprocess if it has died or was never started.
    if not self.p or self.p.poll() != None:
    poll_result = self.p.poll()
    assert(poll_result == None)
    log('get: opening %r while %r is open\n' % (ref, self.inprogress))
    assert(not self.inprogress)
    # Refuse refs that could corrupt the batch protocol or be read as
    # options.
    assert ref.find(b'\n') < 0
    assert ref.find(b'\r') < 0
    assert not ref.startswith(b'-')
    self.inprogress = ref
    self.p.stdin.write(ref + b'\n')
    self.p.stdin.flush()
    hdr = self.p.stdout.readline()
    raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                   % (ref, self.p.poll() or 'none'))
    if hdr.endswith(b' missing\n'):
        self.inprogress = None
        yield None, None, None
    info = hdr.split(b' ')
    if len(info) != 3 or len(info[0]) != 40:
        raise GitError('expected object (id, type, size), got %r' % info)
    oidx, typ, size = info
    # NOTE(review): the int(size) conversion and try: line appear elided.
    it = chunkyreader(self.p.stdout, size)
    yield oidx, typ, size
    for blob in chunkyreader(self.p.stdout, size):
    # cat-file terminates each object with a single newline.
    readline_result = self.p.stdout.readline()
    assert readline_result == b'\n'
    self.inprogress = None
    except Exception as ex:
        with pending_raise(ex):
1341 def _join(self, it):
1342 _, typ, _ = next(it)
1346 elif typ == b'tree':
1347 treefile = b''.join(it)
1348 for (mode, name, sha) in tree_decode(treefile):
1349 for blob in self.join(hexlify(sha)):
1351 elif typ == b'commit':
1352 treeline = b''.join(it).split(b'\n')[0]
1353 assert treeline.startswith(b'tree ')
1354 for blob in self.join(treeline[5:]):
1357 raise GitError('invalid object type %r: expected blob/tree/commit'
        # NOTE(review): the `def join(self, id)` header for this generator
        # is missing from this chunk, as is the loop body (presumably
        # `yield d`) — confirm against the full file.
        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        # Delegate to _join, which dispatches on the object's type.
        for d in self._join(self.get(id)):
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    # NOTE(review): the guard, cache-insert, and return lines of this
    # function were truncated in this copy; restored here.
    if not repo_dir:
        # Fall back to the module default repository.
        repo_dir = repodir or repo()
    # Normalize so equivalent paths share one cache slot.
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp
def close_catpipes():
    """Close every cached CatPipe, draining the module-level cache."""
    # NOTE(review): the loop header and the close call were truncated in
    # this copy (popitem dangled on its own); restored here.
    # FIXME: chain exceptions
    while _cp:
        _, cp = _cp.popitem()
        cp.close(wait=True)
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    # NOTE(review): the accumulator setup, name slicing, and return were
    # truncated in this copy; restored here.
    tags = {}
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[10:]  # strip the b'refs/tags/' prefix
        if c not in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags
class MissingObject(KeyError):
    """Raised when a repository object that should exist cannot be found."""
    def __init__(self, oid):
        # Keep the binary object id so callers can tell which object
        # was missing (the dropped assignment left oid unused).
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
# One reachable object as yielded by walk_object().
WalkItem = namedtuple('WalkItem', [
    'oid',         # binary object id
    'type',        # object type (bytes)
    'mode',        # tree entry mode, when known
    'path',        # mangled path components
    'chunk_path',  # chunked-subtree path components (see below)
    'data',        # object payload, when requested
])
1412 # The path is the mangled path, and if an item represents a fragment
1413 # of a chunked file, the chunk_path will be the chunked subtree path
1414 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1415 # chunked file will have a chunk_path of ['']. So some chunk subtree
1416 # of the file '/foo/bar/baz' might look like this:
1418 # item.path = ['foo', 'bar', 'baz.bup']
1419 # item.chunk_path = ['', '2d3115e', '016b097']
1420 # item.type = 'tree'
1424 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1425 """Yield everything reachable from oidx via get_ref (which must behave
1426 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1427 returns true. Throw MissingObject if a hash encountered is
1428 missing from the repository, and don't read or return blob content
1429 in the data field unless include_data is set.
1432 # Maintain the pending stack on the heap to avoid stack overflow
1433 pending = [(oidx, [], [], None)]
1435 oidx, parent_path, chunk_path, mode = pending.pop()
1436 oid = unhexlify(oidx)
1437 if stop_at and stop_at(oidx):
1440 if (not include_data) and mode and stat.S_ISREG(mode):
1441 # If the object is a "regular file", then it's a leaf in
1442 # the graph, so we can skip reading the data if the caller
1443 # hasn't requested it.
1444 yield WalkItem(oid=oid, type=b'blob',
1445 chunk_path=chunk_path, path=parent_path,
1450 item_it = get_ref(oidx)
1451 get_oidx, typ, _ = next(item_it)
1453 raise MissingObject(unhexlify(oidx))
1454 if typ not in (b'blob', b'commit', b'tree'):
1455 raise Exception('unexpected repository object type %r' % typ)
1457 # FIXME: set the mode based on the type when the mode is None
1458 if typ == b'blob' and not include_data:
1459 # Dump data until we can ask cat_pipe not to fetch it
1460 for ignored in item_it:
1464 data = b''.join(item_it)
1466 yield WalkItem(oid=oid, type=typ,
1467 chunk_path=chunk_path, path=parent_path,
1469 data=(data if include_data else None))
1471 if typ == b'commit':
1472 commit_items = parse_commit(data)
1473 for pid in commit_items.parents:
1474 pending.append((pid, parent_path, chunk_path, mode))
1475 pending.append((commit_items.tree, parent_path, chunk_path,
1476 hashsplit.GIT_MODE_TREE))
1477 elif typ == b'tree':
1478 for mode, name, ent_id in tree_decode(data):
1479 demangled, bup_type = demangle_name(name, mode)
1481 sub_path = parent_path
1482 sub_chunk_path = chunk_path + [name]
1484 sub_path = parent_path + [name]
1485 if bup_type == BUP_CHUNKED:
1486 sub_chunk_path = [b'']
1488 sub_chunk_path = chunk_path
1489 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,