1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
14 from bup.compat import (buffer,
15 byte_int, bytes_from_byte, bytes_from_uint,
21 from bup.io import path_msg
22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
29 mmap_read, mmap_readwrite,
30 progress, qprogress, stat_if_exists,
36 repodir = None # The default repository, once initialized
# Git pack object type codes: commit=1, tree=2, blob=3, tag=4.
_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
# Inverse mapping: type code -> git object type name.
_typermap = {v: k for k, v in items(_typemap)}
46 class GitError(Exception):
def _gitenv(repo_dir=None):
    """Return the process environment with GIT_DIR set to repo_dir.

    When repo_dir is not provided, fall back to the default repository
    (repo()).  Without this fallback os.path.abspath(None) would raise.
    """
    if not repo_dir:
        repo_dir = repo()
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
55 def _git_wait(cmd, p):
58 raise GitError('%r returned %d' % (cmd, rv))
def _git_exo(cmd, **kwargs):
    """Run cmd via helpers.exo() and raise GitError on nonzero exit.

    Presumably returns the exo() result tuple on success -- TODO confirm
    against bup.helpers.exo.
    """
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    # proc: the subprocess handle unpacked from result (assignment elided
    # in this view) -- confirm against the full source.
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
    """Return the value of the given git config option, as bytes.

    opttype may be 'int' or 'bool' to have git canonicalize the value
    (--int/--bool); otherwise it must be None and the raw value is used.
    repo_dir and cfg_file are mutually exclusive.
    """
    assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
    cmd = [b'git', b'config', b'--null']
        cmd.extend([b'--file', cfg_file])
        cmd.extend([b'--int'])
    elif opttype == 'bool':
        cmd.extend([b'--bool'])
        assert opttype is None
    cmd.extend([b'--get', option])
        env = _gitenv(repo_dir=repo_dir)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
    # with --null, git writes out a trailing \0 after the value
    r = p.stdout.read()[:-1]
        # presumably returns int(r) in the 'int' branch above -- confirm
    elif opttype == 'bool':
        # git converts to 'true' or 'false'
    # Any exit status other than "found"/"not found" is an error.
    raise GitError('%r returned %d' % (cmd, rc))
def parse_tz_offset(s):
    """UTC offset in seconds."""
    # s is a git timezone field like b'+0130' or b'-0500': sign, HH, MM.
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        # presumably returns -tz_off for negative offsets -- confirm
def parse_commit_gpgsig(sig):
    """Return the original signature bytes.

    i.e. with the "gpgsig " header and the leading space character on
    each continuation line removed.
    """
    assert sig.startswith(b'gpgsig ')
    # Strip the single leading space git adds to each continuation line.
    return sig.replace(b'\n ', b'\n')
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
# The continuation lines have only one leading space.

# Regex building blocks for parsing a commit object's header fields.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
               _start_end_char, _content_char, _start_end_char)
# Timezone field: sign, two digits of hours, two digits of minutes.
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
# NOTE(review): the field list presumably continues with 'gpgsig' and
# 'message' -- confirm against the full source.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse the raw bytes of a git commit object into a CommitInfo.

    Raises Exception when content does not match the expected commit
    layout (_commit_rx); the visible code raised unconditionally because
    the no-match guard was missing.
    """
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      gpgsig=parse_commit_gpgsig(matches['gpgsig']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume cat_iterator and return its concatenated data bytes.

    The iterator's first item must be an (oidx, type, size) header
    triple; an Exception is raised unless its type equals expected_type.
    The remaining items are data chunks, joined into one bytes value.
    """
    _, actual_type, _ = next(cat_iterator)
    if actual_type != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, actual_type))
    chunks = list(cat_iterator)
    return b''.join(chunks)
def get_commit_items(id, cp):
    """Return the parsed CommitInfo for commit id, read via cat-pipe cp."""
    commit_it = cp.get(id)
    raw = get_cat_data(commit_it, b'commit')
    return parse_commit(raw)
def _local_git_date_str(epoch_sec):
    """Return b'<epoch_sec> <offset>' using the local UTC offset."""
    offset = utc_offset_str(epoch_sec)
    return b'%d %s' % (epoch_sec, offset)
def _git_date_str(epoch_sec, tz_offset_sec):
    """Return b'<epoch_sec> <+/-HHMM>' for the given offset in seconds."""
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        b'+' if offs >= 0 else b'-',
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    # Fall back to the module-level default repository.
    repo_dir = repo_dir or repodir
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
    return os.path.join(repo_dir, sub)
213 re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
216 return _shorten_hash_rx.sub(br'\1\2*\3', s)
220 full = os.path.abspath(path)
221 fullrepo = os.path.abspath(repo(b''))
222 if not fullrepo.endswith(b'/'):
224 if full.startswith(fullrepo):
225 path = full[len(fullrepo):]
226 if path.startswith(b'index-cache/'):
227 path = path[len(b'index-cache/'):]
228 return shorten_hash(path)
def auto_midx(objdir):
    """Run 'bup midx --auto' then 'bup bloom' on objdir, reporting
    failures via add_error() instead of raising."""
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    # NOTE(review): the open(os.devnull) handle is never closed; consider
    # subprocess.DEVNULL or a with-statement.
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # A regular file stored as a git tree: it was chunked.
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        # name[:-1] check also catches names ending '.bup?' (e.g. '.bupm')
        # -- presumably to avoid colliding with mangled names; confirm.
        return name + b'.bupl'
# Demangle result modes (see demangle_name below).
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        # .bupm entries: chunked when they name a directory tree.
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes b'<type> <len>\0' + content.
    header = b'%s %d\0' % (type, len(content))
def shalist_item_sort_key(ent):
    """Sort key for (mode, name, id) tree entries, matching git's tree
    entry ordering (directories sort as if named with a trailing '/')."""
    (mode, name, id) = ent
    assert(mode+0 == mode)  # mode must be an integer
    if stat.S_ISDIR(mode):
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    # Git requires entries in its own sort order.
    shalist = sorted(shalist, key = shalist_item_sort_key)
    for (mode,name,bin) in shalist:
        assert(mode+0 == mode)   # integer mode
        assert(len(bin) == 20)   # raw sha1 bytes, not hex
        s = b'%o %s\0%s' % (mode,name,bin)
        # NOTE(review): on Python 3, s[0] is an int, so comparing it with
        # b'0' is always true -- this check may be ineffective; confirm.
        assert s[0] != b'0' # 0-padded octal is not acceptable in a git tree
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    while ofs < len(buf):
        # Each entry is b'<octal mode> <name>\0<20-byte sha>'.
        z = buf.find(b'\0', ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        sha = buf[z+1:z+1+20]
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    """Yield the pack encoding of content: a variable-length size header
    (type code in the high nibble of the first byte) followed by the
    zlib-compressed data."""
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
        szbits = (sz & 0x0f) | (_typemap[type]<<4)
        if sz: szbits |= 0x80   # continuation bit: more size bytes follow
        szout += bytes_from_uint(szbits)
    z = zlib.compressobj(compression_level)
    yield z.compress(content)
def _decode_packobj(buf):
    """Return (type, uncompressed content) for a pack-encoded object in buf."""
    # High nibble of the first byte holds the type code.
    type = _typermap[(c & 0x70) >> 4]
        # Accumulate 7 size bits per continuation byte.
        sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        # Presumably guarded by an idx-is-not-None check (elided here);
        # confirm against the full source.
        return self._ofs_from_idx(idx)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            # With want_source, return this index's basename (truthy).
            return want_source and os.path.basename(self.name) or True
    def _idx_from_hash(self, hash):
        """Binary-search the sha table for hash; return its entry index."""
        global _total_searches, _total_steps
        assert(len(hash) == 20)  # raw sha1 bytes, not hex
        b1 = byte_int(hash[0])
        # fanout[b] counts hashes whose first byte is <= b, so the
        # candidate range for first byte b1 is [fanout[b1-1], fanout[b1]).
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
            mid = start + (end - start) // 2
            v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack('!256I', self.map))
        self.fanout.append(0) # entry "-1"
        self.nsha = self.fanout[255]   # total object count
        self.sha_ofs = 256 * 4         # entries start right after the fanout
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):

        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        # V1 entries are 24 bytes: 4-byte offset then 20-byte sha.
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4   # skip the 4-byte offset
        return self.map[ofs : ofs + 20]

        # Iterate all shas; each entry's sha begins 4 bytes in.
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

        if self.map is not None:
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # V2 magic (\377tOc) plus version number 2.
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # V2 layout after the shas: 4-byte crcs, then 4-byte offsets,
        # then the 8-byte offset table for large packs.
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

    def __exit__(self, type, value, traceback):
        with pending_raise(value, rethrow=False):

        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
            # High bit set: value indexes the 64-bit offset table instead.
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

        if self.map is not None:
518 def __init__(self, dir, ignore_midx=False):
520 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
525 self.do_bloom = False
527 self.ignore_midx = ignore_midx
533 assert(_mpi_count == 0)
536 return iter(idxmerge(self.packs))
539 return sum(len(pack) for pack in self.packs)
541 def exists(self, hash, want_source=False):
542 """Return nonempty if the object exists in the index files."""
543 global _total_searches
545 if hash in self.also:
547 if self.do_bloom and self.bloom:
548 if self.bloom.exists(hash):
549 self.do_bloom = False
551 _total_searches -= 1 # was counted by bloom
553 for i in range(len(self.packs)):
555 _total_searches -= 1 # will be incremented by sub-pack
556 ix = p.exists(hash, want_source=want_source)
558 # reorder so most recently used packs are searched first
559 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
564 def refresh(self, skip_midx = False):
565 """Refresh the index list.
566 This method verifies if .midx files were superseded (e.g. all of its
567 contents are in another, bigger .midx file) and removes the superseded
570 If skip_midx is True, all work on .midx files will be skipped and .midx
571 files will be removed from the list.
573 The instance variable 'ignore_midx' can force this function to
574 always act as if skip_midx was True.
576 if self.bloom is not None:
578 self.bloom = None # Always reopen the bloom as it may have been relaced
579 self.do_bloom = False
580 skip_midx = skip_midx or self.ignore_midx
581 d = dict((p.name, p) for p in self.packs
582 if not skip_midx or not isinstance(p, midx.PackMidx))
583 if os.path.exists(self.dir):
586 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
587 # remove any *.midx files from our list that no longer exist
588 for ix in list(d.values()):
589 if not isinstance(ix, midx.PackMidx):
591 if ix.name in midxes:
596 self.packs.remove(ix)
597 for ix in self.packs:
598 if isinstance(ix, midx.PackMidx):
599 for name in ix.idxnames:
600 d[os.path.join(self.dir, name)] = ix
603 mx = midx.PackMidx(full)
604 (mxd, mxf) = os.path.split(mx.name)
606 for n in mx.idxnames:
607 if not os.path.exists(os.path.join(mxd, n)):
608 log(('warning: index %s missing\n'
610 % (path_msg(n), path_msg(mxf)))
618 midxl.sort(key=lambda ix:
619 (-len(ix), -xstat.stat(ix.name).st_mtime))
622 for sub in ix.idxnames:
623 found = d.get(os.path.join(self.dir, sub))
624 if not found or isinstance(found, PackIdx):
625 # doesn't exist, or exists but not in a midx
630 for name in ix.idxnames:
631 d[os.path.join(self.dir, name)] = ix
632 elif not ix.force_keep:
633 debug1('midx: removing redundant: %s\n'
634 % path_msg(os.path.basename(ix.name)))
637 for full in glob.glob(os.path.join(self.dir, b'*.idx')):
641 except GitError as e:
645 bfull = os.path.join(self.dir, b'bup.bloom')
646 self.packs = list(set(d.values()))
647 self.packs.sort(reverse=True, key=lambda x: len(x))
648 if self.bloom is None and os.path.exists(bfull):
649 self.bloom = bloom.ShaBloom(bfull)
651 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
655 self.bloom, bloom_tmp = None, self.bloom
657 except BaseException as ex:
658 with pending_raise(ex):
662 debug1('PackIdxList: using %d index%s.\n'
663 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
666 """Insert an additional object in the list."""
def open_idx(filename):
    """Open a pack index by name and return the appropriate reader:
    PackIdxV1/PackIdxV2 for .idx files, midx.PackMidx for .midx files."""
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
                return PackIdxV2(filename, f)
            raise GitError('%s: expected idx file version 2, got %d'
                           % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            # No V2 magic: assume a version 1 index.
            return PackIdxV1(filename, f)
        raise GitError('%s: unrecognized idx file header'
                       % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        # Per-step progress callback for merge_iter.
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # Final progress callback (presumably gated on final_progress).
        progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                 % (100, total, total))
    # 10024: merge_iter batch size.
    return merge_iter(idxlist, 10024, pfunc, pfinal)
# Build the raw bytes of a commit object; date_sec values are epoch
# seconds, and a None tz selects the local timezone.
def create_commit_blob(tree, parent,
                       author, adate_sec, adate_tz,
                       committer, cdate_sec, cdate_tz,
    if adate_tz is not None:
        adate_str = _git_date_str(adate_sec, adate_tz)
        adate_str = _local_git_date_str(adate_sec)
    if cdate_tz is not None:
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
        cdate_str = _local_git_date_str(cdate_sec)
    # Assemble header lines; empty values are simply omitted.
    if tree: l.append(b'tree %s' % hexlify(tree))
    if parent: l.append(b'parent %s' % hexlify(parent))
    if author: l.append(b'author %s %s' % (author, adate_str))
    if committer: l.append(b'committer %s %s' % (committer, cdate_str))
726 def _make_objcache():
727 return PackIdxList(repo(b'objects/pack'))
729 # bup-gc assumes that it can disable all PackWriter activities
730 # (bloom/midx/cache) via the constructor and close() arguments.
733 """Writes Git objects inside a pack file."""
734 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
735 run_midx=True, on_pack_finish=None,
736 max_pack_size=None, max_pack_objects=None, repo_dir=None):
737 self.repo_dir = repo_dir or repo()
744 self.objcache_maker = objcache_maker
746 self.compression_level = compression_level
747 self.run_midx=run_midx
748 self.on_pack_finish = on_pack_finish
749 if not max_pack_size:
750 max_pack_size = git_config_get(b'pack.packSizeLimit',
751 repo_dir=self.repo_dir,
753 if not max_pack_size:
754 # larger packs slow down pruning
755 max_pack_size = 1000 * 1000 * 1000
756 self.max_pack_size = max_pack_size
757 # cache memory usage is about 83 bytes per object
758 self.max_pack_objects = max_pack_objects if max_pack_objects \
759 else max(1, self.max_pack_size // 5000)
764 def __exit__(self, type, value, traceback):
765 with pending_raise(value, rethrow=False):
770 objdir = dir = os.path.join(self.repo_dir, b'objects')
771 fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
773 self.file = os.fdopen(fd, 'w+b')
778 self.parentfd = os.open(objdir, os.O_RDONLY)
784 assert name.endswith(b'.pack')
785 self.filename = name[:-5]
786 self.file.write(b'PACK\0\0\0\2\0\0\0\0')
787 self.idx = PackIdxV2Writer()
789 def _raw_write(self, datalist, sha):
792 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
793 # the file never has a *partial* blob. So let's make sure it's
794 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
795 # to our hashsplit algorithm.) f.write() does its own buffering,
796 # but that's okay because we'll flush it in _end().
797 oneblob = b''.join(datalist)
803 crc = zlib.crc32(oneblob) & 0xffffffff
804 self._update_idx(sha, crc, nw)
809 def _update_idx(self, sha, crc, size):
812 self.idx.add(sha, crc, self.file.tell() - size)
814 def _write(self, sha, type, content):
818 sha = calc_hash(type, content)
819 size, crc = self._raw_write(_encode_packobj(type, content,
820 self.compression_level),
822 if self.outbytes >= self.max_pack_size \
823 or self.count >= self.max_pack_objects:
827 def _require_objcache(self):
828 if self.objcache is None and self.objcache_maker:
829 self.objcache = self.objcache_maker()
830 if self.objcache is None:
832 "PackWriter not opened or can't check exists w/o objcache")
834 def exists(self, id, want_source=False):
835 """Return non-empty if an object is found in the object cache."""
836 self._require_objcache()
837 return self.objcache.exists(id, want_source=want_source)
839 def just_write(self, sha, type, content):
840 """Write an object to the pack file without checking for duplication."""
841 self._write(sha, type, content)
842 # If nothing else, gc doesn't have/want an objcache
843 if self.objcache is not None:
844 self.objcache.add(sha)
    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
854 def new_blob(self, blob):
855 """Create a blob object in the pack with the supplied content."""
856 return self.maybe_write(b'blob', blob)
858 def new_tree(self, shalist):
859 """Create a tree object in the pack."""
860 content = tree_encode(shalist)
861 return self.maybe_write(b'tree', content)
863 def new_commit(self, tree, parent,
864 author, adate_sec, adate_tz,
865 committer, cdate_sec, cdate_tz,
867 """Create a commit object in the pack. The date_sec values must be
868 epoch-seconds, and if a tz is None, the local timezone is assumed."""
869 content = create_commit_blob(tree, parent,
870 author, adate_sec, adate_tz,
871 committer, cdate_sec, cdate_tz,
873 return self.maybe_write(b'commit', content)
875 def _end(self, run_midx=True, abort=False):
876 # Ignores run_midx during abort
879 self.file, f = None, self.file
880 self.idx, idx = None, self.idx
881 self.parentfd, pfd, = None, self.parentfd
884 with finalized(pfd, lambda x: x is not None and os.close(x)), \
888 os.unlink(self.filename + b'.pack')
891 # update object count
893 cp = struct.pack('!i', self.count)
897 # calculate the pack sha1sum
900 for b in chunkyreader(f):
902 packbin = sum.digest()
905 fdatasync(f.fileno())
908 idx.write(self.filename + b'.idx', packbin)
909 nameprefix = os.path.join(self.repo_dir,
910 b'objects/pack/pack-' + hexlify(packbin))
911 if os.path.exists(self.filename + b'.map'):
912 os.unlink(self.filename + b'.map')
913 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
914 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
917 auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
918 if self.on_pack_finish:
919 self.on_pack_finish(nameprefix)
923 """Remove the pack file from disk."""
924 self._end(abort=True)
926 def breakpoint(self):
927 """Clear byte and object counts and return the last processed id."""
928 id = self._end(self.run_midx)
929 self.outbytes = self.count = 0
932 def close(self, run_midx=True):
933 """Close the pack file and move it to its definitive path."""
934 return self._end(run_midx=run_midx)
937 class PackIdxV2Writer:
939 self.idx = list(list() for i in range(256))
942 def add(self, sha, crc, offs):
945 self.idx[byte_int(sha[0])].append((sha, crc, offs))
947 def write(self, filename, packbin):
949 for section in self.idx:
950 for entry in section:
951 if entry[2] >= 2**31:
954 # Length: header + fan-out + shas-and-crcs + overflow-offsets
955 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
957 idx_f = open(filename, 'w+b')
959 idx_f.truncate(index_len)
960 fdatasync(idx_f.fileno())
961 idx_map = mmap_readwrite(idx_f, close=False)
963 count = _helpers.write_idx(filename, idx_map, self.idx,
965 assert(count == self.count)
972 idx_f = open(filename, 'a+b')
977 b = idx_f.read(8 + 4*256)
980 for b in chunkyreader(idx_f, 20 * self.count):
983 for b in chunkyreader(idx_f):
985 idx_f.write(idx_sum.digest())
986 fdatasync(idx_f.fileno())
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.
    """
    argv = [b'git', b'show-ref']
        argv.append(b'--heads')
        argv.append(b'--tags')
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    # Each output line is b'<hex sha> <refname>'.
    for d in out.split(b'\n'):
        sha, name = d.split(b' ', 1)
        yield name, unhexlify(sha)
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Take at most two to detect ambiguity without consuming everything.
    l = tuple(islice(refs, 2))
1031 def rev_list_invocation(ref_or_refs, format=None):
1032 if isinstance(ref_or_refs, bytes):
1033 refs = (ref_or_refs,)
1036 argv = [b'git', b'rev-list']
1039 argv.append(b'--pretty=format:' + format)
1041 assert not ref.startswith(b'-')
1047 def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
1048 """Yield information about commits as per "git rev-list". If a format
1049 is not provided, yield one hex hash at a time. If a format is
1050 provided, pass it to rev-list and call parse(git_stdout) for each
1051 commit with the stream positioned just after the rev-list "commit
1052 HASH" header line. When a format is provided yield (oidx,
1053 parse(git_stdout)) for each commit.
1056 assert bool(parse) == bool(format)
1057 p = subprocess.Popen(rev_list_invocation(ref_or_refs,
1059 env=_gitenv(repo_dir),
1060 stdout = subprocess.PIPE,
1063 for line in p.stdout:
1066 line = p.stdout.readline()
1069 if not s.startswith(b'commit '):
1070 raise Exception('unexpected line ' + repr(s))
1073 yield s, parse(p.stdout)
1074 line = p.stdout.readline()
1076 rv = p.wait() # not fatal
1078 raise GitError('git rev-list returned error %d' % rv)
1081 def rev_parse(committish, repo_dir=None):
1082 """Resolve the full hash for 'committish', if it exists.
1084 Should be roughly equivalent to 'git rev-parse'.
1086 Returns the hex value of the hash if it is found, None if 'committish' does
1087 not correspond to anything.
1089 head = read_ref(committish, repo_dir=repo_dir)
1091 debug2("resolved from ref: commit = %s\n" % hexlify(head))
1094 pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))
1096 if len(committish) == 40:
1098 hash = unhexlify(committish)
1108 def update_ref(refname, newval, oldval, repo_dir=None):
1109 """Update a repository reference."""
1112 assert refname.startswith(b'refs/heads/') \
1113 or refname.startswith(b'refs/tags/')
1114 p = subprocess.Popen([b'git', b'update-ref', refname,
1115 hexlify(newval), hexlify(oldval)],
1116 env=_gitenv(repo_dir),
1118 _git_wait(b'git update-ref', p)
1121 def delete_ref(refname, oldvalue=None):
1122 """Delete a repository reference (see git update-ref(1))."""
1123 assert refname.startswith(b'refs/')
1124 oldvalue = [] if not oldvalue else [oldvalue]
1125 p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
1128 _git_wait('git update-ref', p)
1131 def guess_repo(path=None):
1132 """Set the path value in the global variable "repodir".
1133 This makes bup look for an existing bup repository, but not fail if a
1134 repository doesn't exist. Usually, if you are interacting with a bup
1135 repository, you would not be calling this function but using
1136 check_repo_or_die().
1142 repodir = environ.get(b'BUP_DIR')
1144 repodir = os.path.expanduser(b'~/.bup')
1147 def init_repo(path=None):
1148 """Create the Git bare repository for bup in a given path."""
1150 d = repo() # appends a / to the path
1151 parent = os.path.dirname(os.path.dirname(d))
1152 if parent and not os.path.exists(parent):
1153 raise GitError('parent directory "%s" does not exist\n'
1155 if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
1156 raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
1157 p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
1160 _git_wait('git init', p)
1161 # Force the index version configuration in order to ensure bup works
1162 # regardless of the version of the installed Git binary.
1163 p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
1164 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1165 _git_wait('git config', p)
1167 p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
1168 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1169 _git_wait('git config', p)
1172 def check_repo_or_die(path=None):
1173 """Check to see if a bup repository probably exists, and abort if not."""
1176 pst = stat_if_exists(top + b'/objects/pack')
1177 if pst and stat.S_ISDIR(pst.st_mode):
1180 top_st = stat_if_exists(top)
1182 log('error: repository %r does not exist (see "bup help init")\n'
1185 log('error: %s is not a repository\n' % path_msg(top))
def is_suitable_git(ver_str):
    """Classify a 'git --version' output string.

    Returns 'unrecognized' or 'insufficient' for unusable versions;
    presumably returns 'suitable' for acceptable ones (final branches
    not shown here -- confirm against full source).
    """
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        # Anything before 1.5.6 is too old.
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
1211 def require_suitable_git(ver_str=None):
1212 """Raise GitError if the version of git isn't suitable.
1214 Rely on ver_str when provided, rather than invoking the git in the
1219 if _git_great is not None:
1221 if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
1222 in (b'yes', b'true', b'1'):
1226 ver_str, _, _ = _git_exo([b'git', b'--version'])
1227 status = is_suitable_git(ver_str)
1228 if status == 'unrecognized':
1229 raise GitError('Unexpected git --version output: %r' % ver_str)
1230 if status == 'insufficient':
1231 log('error: git version must be at least 1.5.6\n')
1233 if status == 'suitable':
1239 class _AbortableIter:
1240 def __init__(self, it, onabort = None):
1242 self.onabort = onabort
1250 return next(self.it)
1251 except StopIteration as e:
1261 """Abort iteration and call the abortion callback, if needed."""
1272 """Link to 'git cat-file' that is used to retrieve blob data."""
1273 def __init__(self, repo_dir = None):
1274 require_suitable_git()
1275 self.repo_dir = repo_dir
1276 self.p = self.inprogress = None
1278 def close(self, wait=False):
1284 self.inprogress = None
1292 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1293 stdin=subprocess.PIPE,
1294 stdout=subprocess.PIPE,
1297 env=_gitenv(self.repo_dir))
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).
        """
        # Lazily (re)start the 'git cat-file --batch' child if it has never
        # been started or has exited (poll() is None only while it is alive).
        if not self.p or self.p.poll() != None:
        poll_result = self.p.poll()
        assert(poll_result == None)
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        # The batch protocol allows only one outstanding request at a time.
        assert(not self.inprogress)
        # Reject refs that would desynchronize the line-oriented protocol or
        # that git could mistake for an option.
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        # One request: the ref followed by a newline; git replies with a
        # header line, then the raw object bytes.
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        # A header ending in ' missing' means the ref resolves to no object.
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
        # Normal header shape: b'<40-hex-id> <type> <size>'.
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        # Wrap the body reader so an abandoned iteration can trigger cleanup
        # (the abort callback argument is elided in this excerpt).
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
            yield oidx, typ, size
            # git terminates each object's data with a single newline.
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception as e:
    def _join(self, it):
        """Yield the content of every blob reachable from the object in it.

        it must be a stream shaped like get()'s output: an (oidx, type,
        size) header followed by the raw object data.
        """
        # Consume the header that get() yields first; only the type matters.
        _, typ, _ = next(it)
        elif typ == b'tree':
            treefile = b''.join(it)
            # Recurse into each tree entry; join() yields every blob below it.
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
        elif typ == b'commit':
            # The first header line of a commit is b'tree <hex>'; follow it.
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
            raise GitError('invalid object type %r: expected blob/tree/commit'
        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        # _join does the type dispatch and recursion; get() produces the
        # header + data stream it consumes.
        for d in self._join(self.get(id)):
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
        # Fall back to the default (global) repository.
        repo_dir = repodir or repo()
    # Normalize the path so equivalent spellings share one cache entry.
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)  # _cp caches one CatPipe per repository path
        cp = CatPipe(repo_dir)
def close_catpipes():
    """Remove every cached CatPipe from _cp, disposing of each entry."""
    # FIXME: chain exceptions
        # popitem removes the entry even if its teardown then fails.
        _, cp = _cp.popitem()
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        # Only tag refs were requested, so every name carries this prefix.
        assert n.startswith(b'refs/tags/')
        tags[c].append(name) # more than one tag can point at 'c'
class MissingObject(KeyError):
    """Raised when a requested object id is absent from the repository."""
    def __init__(self, oid):
        # oid is the binary (not hex) object id; hexlify it for the message.
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
# Description of a single object encountered while walking a reachability
# graph (see walk_object below).
WalkItem = namedtuple('WalkItem',
                      ['oid', 'type', 'mode', 'path', 'chunk_path', 'data'])
1415 # The path is the mangled path, and if an item represents a fragment
1416 # of a chunked file, the chunk_path will be the chunked subtree path
1417 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1418 # chunked file will have a chunk_path of ['']. So some chunk subtree
1419 # of the file '/foo/bar/baz' might look like this:
#   item.path = [b'foo', b'bar', b'baz.bup']
#   item.chunk_path = [b'', b'2d3115e', b'016b097']
#   item.type = b'tree'
1427 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1428 """Yield everything reachable from oidx via get_ref (which must behave
1429 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1430 returns true. Throw MissingObject if a hash encountered is
1431 missing from the repository, and don't read or return blob content
1432 in the data field unless include_data is set.
1435 # Maintain the pending stack on the heap to avoid stack overflow
1436 pending = [(oidx, [], [], None)]
1438 oidx, parent_path, chunk_path, mode = pending.pop()
1439 oid = unhexlify(oidx)
1440 if stop_at and stop_at(oidx):
1443 if (not include_data) and mode and stat.S_ISREG(mode):
1444 # If the object is a "regular file", then it's a leaf in
1445 # the graph, so we can skip reading the data if the caller
1446 # hasn't requested it.
1447 yield WalkItem(oid=oid, type=b'blob',
1448 chunk_path=chunk_path, path=parent_path,
1453 item_it = get_ref(oidx)
1454 get_oidx, typ, _ = next(item_it)
1456 raise MissingObject(unhexlify(oidx))
1457 if typ not in (b'blob', b'commit', b'tree'):
1458 raise Exception('unexpected repository object type %r' % typ)
1460 # FIXME: set the mode based on the type when the mode is None
1461 if typ == b'blob' and not include_data:
1462 # Dump data until we can ask cat_pipe not to fetch it
1463 for ignored in item_it:
1467 data = b''.join(item_it)
1469 yield WalkItem(oid=oid, type=typ,
1470 chunk_path=chunk_path, path=parent_path,
1472 data=(data if include_data else None))
1474 if typ == b'commit':
1475 commit_items = parse_commit(data)
1476 for pid in commit_items.parents:
1477 pending.append((pid, parent_path, chunk_path, mode))
1478 pending.append((commit_items.tree, parent_path, chunk_path,
1479 hashsplit.GIT_MODE_TREE))
1480 elif typ == b'tree':
1481 for mode, name, ent_id in tree_decode(data):
1482 demangled, bup_type = demangle_name(name, mode)
1484 sub_path = parent_path
1485 sub_chunk_path = chunk_path + [name]
1487 sub_path = parent_path + [name]
1488 if bup_type == BUP_CHUNKED:
1489 sub_chunk_path = [b'']
1491 sub_chunk_path = chunk_path
1492 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,