1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
14 from bup.compat import (buffer,
15 byte_int, bytes_from_byte, bytes_from_uint,
20 from bup.io import path_msg
21 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
27 mmap_read, mmap_readwrite,
28 progress, qprogress, stat_if_exists,

verbose = 0
repodir = None  # The default repository, once initialized

_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}

_total_searches = 0
_total_steps = 0

class GitError(Exception):
    pass

def _gitenv(repo_dir=None):
    if not repo_dir:
        repo_dir = repo()
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})

def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%r returned %d' % (cmd, rv))

def _git_exo(cmd, **kwargs):
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    _, _, proc = result
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
    return result

def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
    assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
    cmd = [b'git', b'config', b'--null']
    if cfg_file:
        cmd.extend([b'--file', cfg_file])
    if opttype == 'int':
        cmd.extend([b'--int'])
    elif opttype == 'bool':
        cmd.extend([b'--bool'])
    else:
        assert opttype is None
    cmd.extend([b'--get', option])
    env = None
    if repo_dir:
        env = _gitenv(repo_dir=repo_dir)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
                         close_fds=True)
    # with --null, git writes out a trailing \0 after the value
    r = p.stdout.read()[:-1]
    rc = p.wait()
    if rc == 0:
        if opttype == 'int':
            return int(r)
        elif opttype == 'bool':
            # git converts to 'true' or 'false'
            return r == b'true'
        return r
    if rc != 1:
        raise GitError('%r returned %d' % (cmd, rc))
    return None
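
# For example (a sketch; assumes an initialized repository is
# available, e.g. after check_repo_or_die()):
#
#   git_config_get(b'pack.packSizeLimit', opttype='int')
#
# returns the value as an int, or None when the option isn't set
# (git config exits with status 1 in that case).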

def parse_tz_offset(s):
    """UTC offset in seconds."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        return -tz_off
    return tz_off
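
# For example, b'-0530' (5 hours 30 minutes west of UTC) parses as
#
#   parse_tz_offset(b'-0530') == -((5 * 60 * 60) + (30 * 60)) == -19800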

def parse_commit_gpgsig(sig):
    """Return the original signature bytes.

    i.e. with the "gpgsig " header and the leading space character on
    each continuation line removed.

    """
    if not sig:
        return None
    assert sig.startswith(b'gpgsig ')
    sig = sig[7:]
    return sig.replace(b'\n ', b'\n')
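
# For example, a stored header like
#
#   b'gpgsig -----BEGIN PGP SIGNATURE-----\n <base64 lines...>\n -----END PGP SIGNATURE-----\n'
#
# comes back with the b'gpgsig ' prefix dropped and the single leading
# space removed from each continuation line.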

# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# https://github.com/git/git/blob/master/Documentation/technical/signature-format.txt
# The continuation lines have only one leading space.

_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<gpgsig>gpgsig .*\n(?: .*\n)*)?
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'gpgsig',
                                       'message'])

def parse_commit(content):
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      gpgsig=parse_commit_gpgsig(matches['gpgsig']),
                      message=matches['message'])
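
# For example (b'4b825d...' is git's well-known empty tree id):
#
#   ci = parse_commit(b'tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\n'
#                     b'author A <a@example.com> 1234567890 +0000\n'
#                     b'committer A <a@example.com> 1234567890 +0000\n'
#                     b'\n'
#                     b'message\n')
#   assert ci.tree == b'4b825dc642cb6eb9a060e54bf8d69288fbee4904'
#   assert ci.parents == []
#   assert ci.author_sec == 1234567890 and ci.author_offset == 0
#   assert ci.message == b'message\n'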

def get_cat_data(cat_iterator, expected_type):
    _, kind, _ = next(cat_iterator)
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    return b''.join(cat_iterator)

def get_commit_items(id, cp):
    return parse_commit(get_cat_data(cp.get(id), b'commit'))

def _local_git_date_str(epoch_sec):
    return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))

def _git_date_str(epoch_sec, tz_offset_sec):
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        % (epoch_sec,
           b'+' if offs >= 0 else b'-',
           abs(offs) // 60,
           abs(offs) % 60)
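
# For example, epoch 1234567890 (2009-02-13T23:31:30Z) at UTC-5:
#
#   _git_date_str(1234567890, -(5 * 60 * 60)) == b'1234567890 -0500'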

def repo(sub=b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
        repo_dir = gd

    return os.path.join(repo_dir, sub)

_shorten_hash_rx = \
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')

def shorten_hash(s):
    return _shorten_hash_rx.sub(br'\1\2*\3', s)
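
# For example, a 40-character hash embedded in a name is abbreviated to
# its first 7 characters plus b'*':
#
#   shorten_hash(b'pack-0123456789abcdef0123456789abcdef01234567.idx')
#       == b'pack-0123456*.idx'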

def repo_rel(path):
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
        fullrepo += b'/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)

def auto_midx(objdir):
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 1
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 1
    if rv:
        add_error('%r: returned %d' % (args, rv))

def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    else:
        return name

(BUP_NORMAL, BUP_CHUNKED) = (0, 1)

def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is
    one of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
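
# Round-trip examples (0o100644 is a regular file, 0o40000 a tree):
#
#   mangle_name(b'foo', 0o100644, 0o40000) == b'foo.bup'
#   demangle_name(b'foo.bup', 0o40000) == (b'foo', BUP_CHUNKED)
#   mangle_name(b'foo.bup', 0o100644, 0o100644) == b'foo.bup.bupl'
#   demangle_name(b'foo.bup.bupl', 0o100644) == (b'foo.bup', BUP_NORMAL)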

def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = b'%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
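
# This produces the same object ids as git; for example, the empty blob
# gets git's well-known hash:
#
#   hexlify(calc_hash(b'blob', b'')) == b'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'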

def shalist_item_sort_key(ent):
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
        # git sorts tree entries as if directory names ended with '/'
        return name + b'/'
    return name

def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key=shalist_item_sort_key)
    l = []
    for (mode, name, bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode, name, bin)
        # (s[0:1], not s[0]: on Python 3, s[0] is an int, never equal to b'0')
        assert s[0:1] != b'0'  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return b''.join(l)

def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        mode, name = spl
        sha = buf[z+1:z+1+20]
        ofs = z + 1 + 20
        yield (int(mode, 8), name, sha)
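
# tree_encode() and tree_decode() are inverses, modulo git's sort order
# (directories sort as if their names ended with b'/'):
#
#   ents = [(0o40000, b'sub', b'\xbb' * 20), (0o100644, b'a', b'\xaa' * 20)]
#   list(tree_decode(tree_encode(ents))) == \
#       [(0o100644, b'a', b'\xaa' * 20), (0o40000, b'sub', b'\xbb' * 20)]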

def _encode_packobj(type, content, compression_level=1):
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = b''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += bytes_from_uint(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()

def _decode_packobj(buf):
    assert(buf)
    c = byte_int(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = byte_int(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
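
# The packed header keeps the type in bits 4-6 of the first byte and the
# size as a base-128 varint, so the two functions round-trip:
#
#   _decode_packobj(b''.join(_encode_packobj(b'blob', b'hello')))
#       == (b'blob', b'hello')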

class PackIdx:
    def __init__(self):
        assert(0)

    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx is not None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) is not None):
            return want_source and os.path.basename(self.name) or True
        return None

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1]  # range -1..254
        end = self.fanout[b1]      # range 0..255
        want = hash
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end - start) // 2
            v = self._idx_to_hash(mid)
            if v < want:
                start = mid + 1
            elif v > want:
                end = mid
            else:  # got it!
                return mid
        return None
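
# The fanout table narrows the binary search: fanout[b] is the number of
# object ids whose first byte is <= b, so a hash starting with 0xa3 only
# has to be searched for within
#
#   fanout[0xa2] <= idx < fanout[0xa3]
#
# (the appended 0 serves as the fanout[-1] entry for first byte 0x00).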

class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=0))
        self.fanout.append(0)  # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None

class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)  # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
        return ofs

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None

_mpi_count = 0

class PackIdxList:
    def __init__(self, dir, ignore_midx=False):
        global _mpi_count
        assert(_mpi_count == 0)  # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx=False):
        """Refresh the index list.
        This method checks whether any .midx files have been superseded
        (e.g. if all of their contents are in another, bigger .midx file)
        and removes the superseded files.

        If skip_midx is True, all work on .midx files will be skipped and
        .midx files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None  # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in glob.glob(os.path.join(self.dir, b'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs) != 1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)

def open_idx(filename):
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')
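
# For example (a sketch; the pack name is hypothetical):
#
#   with open_idx(b'objects/pack/pack-0123456.idx') as ix:
#       if ix.exists(sha):            # sha: a 20-byte binary object id
#           print(ix.find_offset(sha))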

def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)

def create_commit_blob(tree, parent,
                       author, adate_sec, adate_tz,
                       committer, cdate_sec, cdate_tz,
                       msg):
    if adate_tz is not None:
        adate_str = _git_date_str(adate_sec, adate_tz)
    else:
        adate_str = _local_git_date_str(adate_sec)
    if cdate_tz is not None:
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
    else:
        cdate_str = _local_git_date_str(cdate_sec)
    l = []
    if tree: l.append(b'tree %s' % hexlify(tree))
    if parent: l.append(b'parent %s' % hexlify(parent))
    if author: l.append(b'author %s %s' % (author, adate_str))
    if committer: l.append(b'committer %s %s' % (committer, cdate_str))
    l.append(b'')
    l.append(msg)
    return b'\n'.join(l)
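
# The result is laid out the way parse_commit() expects, e.g. (with a
# hypothetical author and ids):
#
#   tree <40 hex digits>
#   parent <40 hex digits>
#   author A <a@example.com> 1234567890 +0000
#   committer A <a@example.com> 1234567890 +0000
#
#   <msg>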

def _make_objcache():
    return PackIdxList(repo(b'objects/pack'))

# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.

class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.repo_dir = repo_dir or repo()
        self.file = None
        self.parentfd = None
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.idx = None
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level
        self.run_midx = run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir,
                                           opttype='int')
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def _open(self):
        if not self.file:
            objdir = dir = os.path.join(self.repo_dir, b'objects')
            fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                os.close(fd)
                raise
            try:
                self.parentfd = os.open(objdir, os.O_RDONLY)
            except:
                f = self.file
                self.file = None
                f.close()
                raise
            assert name.endswith(b'.pack')
            self.filename = name[:-5]
            self.file.write(b'PACK\0\0\0\2\0\0\0\0')
            self.idx = PackIdxV2Writer()

    def _raw_write(self, datalist, sha):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            reraise(GitError(e))
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        assert(sha)
        if self.idx:
            self.idx.add(sha, crc, self.file.tell() - size)

    def _write(self, sha, type, content):
        if verbose: log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def just_write(self, sha, type, content):
        """Write an object to the pack file without checking for duplication."""
        self._write(sha, type, content)
        # If nothing else, gc doesn't have/want an objcache
        if self.objcache is not None:
            self.objcache.add(sha)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write(b'blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write(b'tree', content)

    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
                   msg):
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        content = create_commit_blob(tree, parent,
                                     author, adate_sec, adate_tz,
                                     committer, cdate_sec, cdate_tz,
                                     msg)
        return self.maybe_write(b'commit', content)
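
    # A minimal usage sketch (assumes check_repo_or_die() has been
    # called, so the pack goes to the default repository):
    #
    #   with PackWriter() as w:
    #       blob_id = w.new_blob(b'hello')
    #       tree_id = w.new_tree([(0o100644, b'hello.txt', blob_id)])
    #   # close() (via __exit__) finishes the pack and renames it to
    #   # objects/pack/pack-<sha>.{pack,idx}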
872 """Remove the pack file from disk."""
881 os.unlink(self.filename + b'.pack')

    def _end(self, run_midx=True):
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' + hexlify(packbin))
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return packbin

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

class PackIdxV2Writer:
    def __init__(self):
        self.idx = [[] for _ in range(256)]
        self.count = 0

    def add(self, sha, crc, offs):
        assert(sha)
        self.count += 1
        self.idx[byte_int(sha[0])].append((sha, crc, offs))

    def write(self, filename, packbin):
        ofs64_count = 0
        for section in self.idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, self.idx,
                                           self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            for b in chunkyreader(idx_f, 20 * self.count):
                idx_sum.update(b)

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
        finally:
            idx_f.close()
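
# The index_len arithmetic above mirrors the v2 .idx layout: an 8-byte
# header, a 256-entry fan-out table, 20 (sha) + 4 (crc) + 4 (offset) = 28
# bytes per object, and 8 bytes per offset that overflows 31 bits. For
# example, a 2-object pack with small offsets needs
# 8 + (4 * 256) + (28 * 2) + 0 == 1088 bytes, before the trailing sha1s.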

def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.

    """
    argv = [b'git', b'show-ref']
    if limit_to_heads:
        argv.append(b'--heads')
    if limit_to_tags:
        argv.append(b'--tags')
    argv.append(b'--')
    if patterns:
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
                         close_fds=True)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split(b'\n'):
            sha, name = d.split(b' ', 1)
            yield name, unhexlify(sha)
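
# For example (a sketch; assumes check_repo_or_die() has been called):
#
#   for name, sha in list_refs(limit_to_heads=True):
#       print(path_msg(name), hexlify(sha).decode('ascii'))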

def read_ref(refname, repo_dir=None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    else:
        return None

def rev_list_invocation(ref_or_refs, format=None):
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    argv = [b'git', b'rev-list']

    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    argv.append(b'--')
    return argv

def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.

    """
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                                             format=format),
                         env=_gitenv(repo_dir),
                         stdout=subprocess.PIPE,
                         close_fds=True)
    if not format:
        for line in p.stdout:
            yield line.strip()
    else:
        line = p.stdout.readline()
        while line:
            s = line.strip()
            if not s.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(s))
            s = s[7:]
            assert len(s) == 40
            yield s, parse(p.stdout)
            line = p.stdout.readline()

    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)
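
# For example (a sketch; b'refs/heads/main' is a hypothetical ref):
#
#   for oidx in rev_list(b'refs/heads/main'):
#       ...  # oidx is a 40-character hex commit id, newest first
#
# With format/parse supplied, each item is (oidx, parse(git_stdout)).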

def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            hash = unhexlify(committish)
        except TypeError:
            return None

        if pL.exists(hash):
            return hash

    return None

def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    if not oldval:
        oldval = b''
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir),
                         close_fds=True)
    _git_wait(b'git update-ref', p)

def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git update-ref', p)

def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = environ.get(b'BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser(b'~/.bup')

def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    # Enable the reflog
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)

def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % top)
            sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)

def is_suitable_git(ver_str):
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    sys.exit(13)

_git_great = None

def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.

    """
    global _git_great
    if _git_great is not None:
        return
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    if status == 'suitable':
        _git_great = True
        return
    assert False

class _AbortableIter:
    def __init__(self, it, onabort=None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.it)
        except StopIteration as e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    next = __next__

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()
1274 """Link to 'git cat-file' that is used to retrieve blob data."""
1275 def __init__(self, repo_dir = None):
1276 require_suitable_git()
1277 self.repo_dir = repo_dir
1278 self.p = self.inprogress = None
1280 def close(self, wait=False):
1286 self.inprogress = None
1294 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1295 stdin=subprocess.PIPE,
1296 stdout=subprocess.PIPE,
1299 env=_gitenv(self.repo_dir))
1302 """Yield (oidx, type, size), followed by the data referred to by ref.
1303 If ref does not exist, only yield (None, None, None).
1306 if not self.p or self.p.poll() != None:
1309 poll_result = self.p.poll()
1310 assert(poll_result == None)
1312 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1313 assert(not self.inprogress)
1314 assert ref.find(b'\n') < 0
1315 assert ref.find(b'\r') < 0
1316 assert not ref.startswith(b'-')
1317 self.inprogress = ref
1318 self.p.stdin.write(ref + b'\n')
1319 self.p.stdin.flush()
1320 hdr = self.p.stdout.readline()
1322 raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
1323 % (ref, self.p.poll() or 'none'))
1324 if hdr.endswith(b' missing\n'):
1325 self.inprogress = None
1326 yield None, None, None
1328 info = hdr.split(b' ')
1329 if len(info) != 3 or len(info[0]) != 40:
1330 raise GitError('expected object (id, type, size), got %r' % info)
1331 oidx, typ, size = info
1333 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1336 yield oidx, typ, size
1339 readline_result = self.p.stdout.readline()
1340 assert readline_result == b'\n'
1341 self.inprogress = None
1342 except Exception as e:

    def _join(self, it):
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.

        """
        for d in self._join(self.get(id)):
            yield d
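
    # For example (a sketch; `ref` is any ref name or hex object id
    # present in the repository):
    #
    #   for blob in cp().join(ref):
    #       out.write(blob)  # streams the content of every reachable blob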

_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp

def close_catpipes():
    # FIXME: chain exceptions
    while _cp:
        _, cp = _cp.popitem()
        cp.close(wait=True)

def tags(repo_dir=None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir=repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[len(b'refs/tags/'):]
        if c not in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags

class MissingObject(KeyError):
    def __init__(self, oid):
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))


WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
# chunked file will have a chunk_path of ['']. So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...
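
# A usage sketch (assumes `oidx` is the hex id of a commit in the
# current repository):
#
#   catpipe = cp()
#   for item in walk_object(catpipe.get, oidx, include_data=True):
#       print(item.type, hexlify(item.oid))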

def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true. Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))