1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
from __future__ import absolute_import, print_function
import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
from array import array
from binascii import hexlify, unhexlify
from collections import namedtuple
from contextlib import ExitStack
from itertools import islice

from bup import _helpers, hashsplit, path, midx, bloom, xstat
from bup.compat import (buffer,
                        byte_int, bytes_from_byte, bytes_from_uint,
                        environ,
                        pending_raise)
from bup.io import path_msg
from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
                         exo,
                         fdatasync,
                         finalized,
                         log,
                         merge_dict,
                         merge_iter,
                         mmap_read, mmap_readwrite,
                         nullcontext_if_not,
                         progress, qprogress, stat_if_exists,
                         utc_offset_str)

repodir = None  # The default repository, once initialized

_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in _typemap.items()}

_total_searches = 0
_total_steps = 0

class GitError(Exception):
    pass

def _gitenv(repo_dir=None):
    if not repo_dir:
        repo_dir = repo()
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})

def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%r returned %d' % (cmd, rv))

def _git_exo(cmd, **kwargs):
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    _, _, proc = result
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
    return result

def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
    assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
    cmd = [b'git', b'config', b'--null']
    if cfg_file:
        cmd.extend([b'--file', cfg_file])
    if opttype == 'int':
        cmd.extend([b'--int'])
    elif opttype == 'bool':
        cmd.extend([b'--bool'])
    else:
        assert opttype is None
    cmd.extend([b'--get', option])
    env = _gitenv(repo_dir=repo_dir)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
                         close_fds=True)
    # with --null, git writes out a trailing \0 after the value
    r = p.stdout.read()[:-1]
    rc = p.wait()
    if rc == 0:
        if opttype == 'int':
            return int(r)
        elif opttype == 'bool':
            # git converts to 'true' or 'false'
            return r == b'true'
        return r
    if rc != 1:
        raise GitError('%r returned %d' % (cmd, rc))
    return None

def parse_tz_offset(s):
    """Parse a Git timezone field like b'+0130' into a UTC offset in seconds."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        return -tz_off
    return tz_off
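
# Illustrative sketch (not part of the original module): parse_tz_offset
# takes the raw timezone field from a commit, e.g. b'+0130' or b'-0500',
# and returns a signed offset in seconds.
def _example_parse_tz_offset():
    assert parse_tz_offset(b'+0130') == 90 * 60
    assert parse_tz_offset(b'-0500') == -5 * 60 * 60
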
def parse_commit_gpgsig(sig):
    """Return the original signature bytes.

    i.e. with the "gpgsig " header and the leading space character on
    each continuation line removed.

    """
    if not sig:
        return None
    assert sig.startswith(b'gpgsig ')
    sig = sig[7:]
    return sig.replace(b'\n ', b'\n')
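
# Illustrative sketch (not part of the original module): a hypothetical
# gpgsig header as stored in a commit object, and the original signature
# recovered from it.
def _example_parse_commit_gpgsig():
    stored = (b'gpgsig -----BEGIN PGP SIGNATURE-----\n'
              b' \n'
              b' aBcD1234\n'
              b' -----END PGP SIGNATURE-----\n')
    original = (b'-----BEGIN PGP SIGNATURE-----\n'
                b'\n'
                b'aBcD1234\n'
                b'-----END PGP SIGNATURE-----\n')
    assert parse_commit_gpgsig(stored) == original
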
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# See also
# https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
# The continuation lines have only one leading space.

_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'gpgsig', 'message'])

def parse_commit(content):
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      gpgsig=parse_commit_gpgsig(matches['gpgsig']),
                      message=matches['message'])
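
# Illustrative sketch (not part of the original module): parse a minimal,
# hypothetical commit blob (no parents, no mergetag, no gpgsig).
def _example_parse_commit():
    commit = (b'tree ' + b'0' * 40 + b'\n' +
              b'author A U Thor <a@example.com> 1234567890 +0000\n' +
              b'committer A U Thor <a@example.com> 1234567890 -0500\n' +
              b'\n' +
              b'Add a thing.\n')
    info = parse_commit(commit)
    assert info.tree == b'0' * 40
    assert info.parents == []
    assert info.author_sec == 1234567890
    assert info.committer_offset == -5 * 60 * 60
    assert info.gpgsig is None
    assert info.message == b'Add a thing.\n'
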
def get_cat_data(cat_iterator, expected_type):
    _, kind, _ = next(cat_iterator)
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    return b''.join(cat_iterator)

def get_commit_items(id, cp):
    return parse_commit(get_cat_data(cp.get(id), b'commit'))

def _local_git_date_str(epoch_sec):
    return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))

def _git_date_str(epoch_sec, tz_offset_sec):
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        % (epoch_sec,
           b'+' if offs >= 0 else b'-',
           abs(offs) // 60,
           abs(offs) % 60)
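
# Illustrative sketch (not part of the original module): _git_date_str
# renders epoch seconds plus a tz offset in git's "<sec> +HHMM" form.
def _example_git_date_str():
    assert _git_date_str(1234567890, 5 * 60 * 60 + 30 * 60) == b'1234567890 +0530'
    assert _git_date_str(0, -3600) == b'0 -0100'
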
def repo(sub=b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
        repo_dir = gd

    return os.path.join(repo_dir, sub)

_shorten_hash_rx = \
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')

def shorten_hash(s):
    return _shorten_hash_rx.sub(br'\1\2*\3', s)
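
# Illustrative sketch (not part of the original module): shorten_hash
# abbreviates any 40-char hex run to its first seven characters plus '*'.
def _example_shorten_hash():
    name = b'pack-' + b'1' * 40 + b'.idx'
    assert shorten_hash(name) == b'pack-1111111*.idx'
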
def repo_rel(path):
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
        fullrepo += b'/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)

def auto_midx(objdir):
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        raise
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        raise
    if rv:
        add_error('%r: returned %d' % (args, rv))

def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    else:
        return name

(BUP_NORMAL, BUP_CHUNKED) = (0, 1)

def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is
    one of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
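
# Illustrative sketch (not part of the original module): mangling round-trips
# through demangling, both for chunked files and for unlucky "*.bup" names.
def _example_name_mangling():
    reg = stat.S_IFREG | 0o644   # regular file in the filesystem
    tree = stat.S_IFDIR | 0o755  # stored as a git tree (i.e. chunked)
    assert mangle_name(b'data', reg, tree) == b'data.bup'
    assert demangle_name(b'data.bup', tree) == (b'data', BUP_CHUNKED)
    assert mangle_name(b'x.bup', reg, reg) == b'x.bup.bupl'
    assert demangle_name(b'x.bup.bupl', reg) == (b'x.bup', BUP_NORMAL)
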
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = b'%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
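
# Illustrative sketch (not part of the original module): calc_hash agrees
# with git's object ids; the empty blob is the well-known e69de29....
def _example_calc_hash():
    assert hexlify(calc_hash(b'blob', b'')) == \
        b'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
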
def shalist_item_sort_key(ent):
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
        return name + b'/'
    else:
        return name

def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key=shalist_item_sort_key)
    l = []
    for (mode, name, bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode, name, bin)
        assert s[0:1] != b'0'  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return b''.join(l)

def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        mode, name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)
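
# Illustrative sketch (not part of the original module): tree_decode
# inverts tree_encode.
def _example_tree_roundtrip():
    entry = (0o100644, b'file', b'\xaa' * 20)
    assert list(tree_decode(tree_encode([entry]))) == [entry]
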
def _encode_packobj(type, content, compression_level=1):
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = b''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type] << 4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += bytes_from_uint(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()

def _decode_packobj(buf):
    assert(buf)
    c = byte_int(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = byte_int(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
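
# Illustrative sketch (not part of the original module): a pack object
# decodes back to the type and content it was encoded from.
def _example_packobj_roundtrip():
    packed = b''.join(_encode_packobj(b'blob', b'hello'))
    assert _decode_packobj(packed) == (b'blob', b'hello')
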
class PackIdx(object):
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx is not None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) is not None):
            return want_source and os.path.basename(self.name) or True
        return None
    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1]  # range -1..254
        end = self.fanout[b1]      # range 0..255
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end - start) // 2
            v = self._idx_to_hash(mid)
            if v < hash:
                start = mid + 1
            elif v > hash:
                end = mid
            else:  # got it!
                return mid
        return None

class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        super(PackIdxV1, self).__init__()
        self.closed = False
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=0))
        self.fanout.append(0)  # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
            self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2
    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

    def close(self):
        self.closed = True
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None

class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        super(PackIdxV2, self).__init__()
        self.closed = False
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
            self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2
    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
        return ofs

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

    def close(self):
        self.closed = True
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None

_mpi_count = 0

class PackIdxList:
    def __init__(self, dir, ignore_midx=False):
        global _mpi_count
        # Q: was this also intended to prevent opening multiple repos?
        assert(_mpi_count == 0)  # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.open = True
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        try:
            self.refresh()
        except BaseException as ex:
            with pending_raise(ex):
                self.close()

    def close(self):
        global _mpi_count
        if not self.open:
            assert _mpi_count == 0
            return
        _mpi_count -= 1
        assert _mpi_count == 0
        self.open = False
        self.also = None
        self.bloom, bloom = None, self.bloom
        self.packs, packs = None, self.packs
        with ExitStack() as stack:
            for pack in packs:
                stack.enter_context(pack)
            if bloom:
                bloom.close()
    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
            self.close()

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None
    def refresh(self, skip_midx=False):
        """Refresh the index list.
        This method checks whether any .midx files were superseded (e.g. all
        of their contents are in another, bigger .midx file) and removes the
        superseded files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None  # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # the midx file is no longer there; drop it
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                        else:
                            midxl.append(mx)
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        os.unlink(ix.name)
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            new_packs = set(d.values())
            for p in self.packs:
                if not p in new_packs:
                    p.close()
            new_packs = list(new_packs)
            new_packs.sort(reverse=True, key=lambda x: len(x))
            self.packs = new_packs
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            try:
                if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                    self.do_bloom = True
                else:
                    if self.bloom:
                        self.bloom, bloom_tmp = None, self.bloom
                        bloom_tmp.close()
            except BaseException as ex:
                with pending_raise(ex):
                    if self.bloom:
                        self.bloom.close()

        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs) != 1 and 'es' or ''))
707 """Insert an additional object in the list."""
def open_idx(filename):
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')

def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)

def create_commit_blob(tree, parent,
                       author, adate_sec, adate_tz,
                       committer, cdate_sec, cdate_tz,
                       msg):
    if adate_tz is not None:
        adate_str = _git_date_str(adate_sec, adate_tz)
    else:
        adate_str = _local_git_date_str(adate_sec)
    if cdate_tz is not None:
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
    else:
        cdate_str = _local_git_date_str(cdate_sec)
    l = []
    if tree: l.append(b'tree %s' % hexlify(tree))
    if parent: l.append(b'parent %s' % hexlify(parent))
    if author: l.append(b'author %s %s' % (author, adate_str))
    if committer: l.append(b'committer %s %s' % (committer, cdate_str))
    l.append(b'')
    l.append(msg)
    return b'\n'.join(l)

def _make_objcache():
    return PackIdxList(repo(b'objects/pack'))

# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.

class PackWriter(object):
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.closed = False
        self.repo_dir = repo_dir or repo()
        self.file = None
        self.parentfd = None
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.idx = None
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level
        self.run_midx = run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir,
                                           opttype='int')
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)
    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):
            self.close()
    def _open(self):
        if not self.file:
            objdir = os.path.join(self.repo_dir, b'objects')
            fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                os.close(fd)
                raise
            try:
                self.parentfd = os.open(objdir, os.O_RDONLY)
            except:
                f = self.file
                self.file = None
                f.close()
                raise
            assert name.endswith(b'.pack')
            self.filename = name[:-5]
            self.file.write(b'PACK\0\0\0\2\0\0\0\0')
            self.idx = PackIdxV2Writer()
    def _raw_write(self, datalist, sha):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            raise GitError(e)
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        assert(sha)
        if self.idx:
            self.idx.add(sha, crc, self.file.tell() - size)
    def _write(self, sha, type, content):
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
            self.breakpoint()
        return sha
    def _require_objcache(self):
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                "PackWriter not opened or can't check exists w/o objcache")
    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def just_write(self, sha, type, content):
        """Write an object to the pack file without checking for duplication."""
        self._write(sha, type, content)
        # If nothing else, gc doesn't have/want an objcache
        if self.objcache is not None:
            self.objcache.add(sha)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
        return sha
    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write(b'blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write(b'tree', content)

    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
                   msg):
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        content = create_commit_blob(tree, parent,
                                     author, adate_sec, adate_tz,
                                     committer, cdate_sec, cdate_tz,
                                     msg)
        return self.maybe_write(b'commit', content)
    def _end(self, run_midx=True, abort=False):
        # Ignores run_midx during abort
        self.parentfd, pfd = None, self.parentfd
        self.file, f = None, self.file
        self.idx, idx = None, self.idx
        try:
            with nullcontext_if_not(self.objcache), \
                 finalized(pfd, lambda x: x is not None and os.close(x)), \
                 nullcontext_if_not(f):
                if not f:
                    return None
                if abort:
                    os.unlink(self.filename + b'.pack')
                    return None

                # update object count
                f.seek(8)
                cp = struct.pack('!i', self.count)
                assert len(cp) == 4
                f.write(cp)

                # calculate the pack sha1sum
                f.seek(0)
                sum = Sha1()
                for b in chunkyreader(f):
                    sum.update(b)
                packbin = sum.digest()
                f.write(packbin)
                f.flush()
                fdatasync(f.fileno())
                f.close()

                idx.write(self.filename + b'.idx', packbin)
                nameprefix = os.path.join(self.repo_dir,
                                          b'objects/pack/pack-' + hexlify(packbin))
                if os.path.exists(self.filename + b'.map'):
                    os.unlink(self.filename + b'.map')
                os.rename(self.filename + b'.pack', nameprefix + b'.pack')
                os.rename(self.filename + b'.idx', nameprefix + b'.idx')
                os.fsync(pfd)
                if run_midx:
                    auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
                if self.on_pack_finish:
                    self.on_pack_finish(nameprefix)
                return packbin
        finally:
            # Must be last -- some of the code above depends on it
            self.objcache = None

    def abort(self):
        """Remove the pack file from disk."""
        self.closed = True
        self._end(abort=True)
    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        return id

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        self.closed = True
        return self._end(run_midx=run_midx)

class PackIdxV2Writer:
    def __init__(self):
        self.idx = list(list() for i in range(256))
        self.count = 0

    def add(self, sha, crc, offs):
        assert(sha)
        self.count += 1
        self.idx[byte_int(sha[0])].append((sha, crc, offs))
    def write(self, filename, packbin):
        ofs64_count = 0
        for section in self.idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1
        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, self.idx,
                                           self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()
        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)
            for b in chunkyreader(idx_f, 20 * self.count):
                idx_sum.update(b)
            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
        finally:
            idx_f.close()

def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.

    """
    argv = [b'git', b'show-ref']
    if limit_to_heads:
        argv.append(b'--heads')
    if limit_to_tags:
        argv.append(b'--tags')
    argv.append(b'--')
    if patterns:
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
                         close_fds=True)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split(b'\n'):
            sha, name = d.split(b' ', 1)
            yield name, unhexlify(sha)

def read_ref(refname, repo_dir=None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    return None

def rev_list_invocation(ref_or_refs, format=None):
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    argv = [b'git', b'rev-list']
    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    argv.append(b'--')
    return argv

def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.

    """
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                                             format=format),
                         env=_gitenv(repo_dir),
                         stdout=subprocess.PIPE,
                         close_fds=True)
    if not format:
        for line in p.stdout:
            yield line.strip()
    else:
        line = p.stdout.readline()
        while line:
            s = line.strip()
            if not s.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(s))
            s = s[7:]
            assert len(s) == 40
            yield s, parse(p.stdout)
            line = p.stdout.readline()

    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)

def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    if len(committish) == 40:
        try:
            hash = unhexlify(committish)
        except (TypeError, ValueError):
            return None

        with PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) as pL:
            if pL.exists(hash):
                return hash

    return None

def update_ref(refname, newval, oldval, repo_dir=None, force=False):
    """Update a repository reference.

    With force=True, don't care about the previous ref (oldval);
    with force=False oldval must be either a sha1 or None (for an
    entirely new branch).
    """
    if force:
        assert oldval is None
        oldarg = []
    elif not oldval:
        oldarg = [b'']
    else:
        oldarg = [hexlify(oldval)]
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval)] + oldarg,
                         env=_gitenv(repo_dir),
                         close_fds=True)
    _git_wait(b'git update-ref', p)

def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait(b'git update-ref', p)

def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = environ.get(b'BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser(b'~/.bup')

def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)

def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % top)
            sys.exit(13)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)

def is_suitable_git(ver_str):
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    sys.stderr.write('Assuming version %r is suitable\n' % ver_str)
    return 'suitable'
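
# Illustrative sketch (not part of the original module): sample verdicts
# for the version strings handled above.
def _example_is_suitable_git():
    assert is_suitable_git(b'git version 2.39.2') == 'suitable'
    assert is_suitable_git(b'git version 1.5.4') == 'insufficient'
    assert is_suitable_git(b'something else') == 'unrecognized'
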
_git_great = None

def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.

    """
    global _git_great
    if _git_great is not None:
        return
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    if status == 'suitable':
        _git_great = True
        return
    assert False

1298 """Link to 'git cat-file' that is used to retrieve blob data."""
1299 def __init__(self, repo_dir = None):
1300 require_suitable_git()
1301 self.repo_dir = repo_dir
1302 self.p = self.inprogress = None
1304 def close(self, wait=False):
1305 self.p, p = None, self.p
1306 self.inprogress = None
1311 # This will handle pending exceptions correctly once
1321 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1322 stdin=subprocess.PIPE,
1323 stdout=subprocess.PIPE,
1326 env=_gitenv(self.repo_dir))
1329 """Yield (oidx, type, size), followed by the data referred to by ref.
1330 If ref does not exist, only yield (None, None, None).
1333 if not self.p or self.p.poll() != None:
1336 poll_result = self.p.poll()
1337 assert(poll_result == None)
1339 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1340 assert(not self.inprogress)
1341 assert ref.find(b'\n') < 0
1342 assert ref.find(b'\r') < 0
1343 assert not ref.startswith(b'-')
1344 self.inprogress = ref
1345 self.p.stdin.write(ref + b'\n')
1346 self.p.stdin.flush()
1347 hdr = self.p.stdout.readline()
1349 raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
1350 % (ref, self.p.poll() or 'none'))
1351 if hdr.endswith(b' missing\n'):
1352 self.inprogress = None
1353 yield None, None, None
1355 info = hdr.split(b' ')
1356 if len(info) != 3 or len(info[0]) != 40:
1357 raise GitError('expected object (id, type, size), got %r' % info)
1358 oidx, typ, size = info
1361 it = chunkyreader(self.p.stdout, size)
1362 yield oidx, typ, size
1363 for blob in chunkyreader(self.p.stdout, size):
1365 readline_result = self.p.stdout.readline()
1366 assert readline_result == b'\n'
1367 self.inprogress = None
1368 except Exception as ex:
1369 with pending_raise(ex):
    def _join(self, it):
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d

_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp

def close_catpipes():
    # FIXME: chain exceptions
    while _cp:
        _, cp = _cp.popitem()
        cp.close(wait=True)

def tags(repo_dir=None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir=repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[len(b'refs/tags/'):]
        if c not in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags

class MissingObject(KeyError):
    def __init__(self, oid):
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))

WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
# chunked file will have a chunk_path of ['']. So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...

def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true. Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))