1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
12 from numbers import Integral
14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
15 from bup.compat import (buffer,
16 byte_int, bytes_from_byte, bytes_from_uint,
21 from bup.io import path_msg
22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
25 hostname, localtime, log,
28 mmap_read, mmap_readwrite,
30 progress, qprogress, stat_if_exists,
33 from bup.pwdgrp import username, userfullname
37 repodir = None # The default repository, once initialized
39 _typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
40 _typermap = {v: k for k, v in items(_typemap)}
47 class GitError(Exception):
51 def _gitenv(repo_dir=None):
54 return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
56 def _git_wait(cmd, p):
59 raise GitError('%r returned %d' % (cmd, rv))
61 def _git_exo(cmd, **kwargs):
62 kwargs['check'] = False
63 result = exo(cmd, **kwargs)
65 if proc.returncode != 0:
66 raise GitError('%r returned %d' % (cmd, proc.returncode))
69 def git_config_get(option, repo_dir=None):
70 cmd = (b'git', b'config', b'--get', option)
71 p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
72 env=_gitenv(repo_dir=repo_dir),
79 raise GitError('%r returned %d' % (cmd, rc))
83 def parse_tz_offset(s):
84 """UTC offset in seconds."""
85 tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
86 if bytes_from_byte(s[0]) == b'-':
91 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
92 # Make sure that's authoritative.
93 _start_end_char = br'[^ .,:;<>"\'\0\n]'
94 _content_char = br'[^\0\n<>]'
95 _safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
97 _start_end_char, _content_char, _start_end_char)
98 _tz_rx = br'[-+]\d\d[0-5]\d'
99 _parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
100 # Assumes every following line starting with a space is part of the
101 # mergetag. Is there a formal commit blob spec?
102 _mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
103 _commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
104 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
105 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
107 (?P<message>(?:.|\n)*)''' % (_parent_rx,
108 _safe_str_rx, _safe_str_rx, _tz_rx,
109 _safe_str_rx, _safe_str_rx, _tz_rx,
111 _parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
113 # Note that the author_sec and committer_sec values are (UTC) epoch
114 # seconds, and for now the mergetag is not included.
115 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
116 'author_name', 'author_mail',
117 'author_sec', 'author_offset',
118 'committer_name', 'committer_mail',
119 'committer_sec', 'committer_offset',
122 def parse_commit(content):
123 commit_match = re.match(_commit_rx, content)
125 raise Exception('cannot parse commit %r' % content)
126 matches = commit_match.groupdict()
127 return CommitInfo(tree=matches['tree'],
128 parents=re.findall(_parent_hash_rx, matches['parents']),
129 author_name=matches['author_name'],
130 author_mail=matches['author_mail'],
131 author_sec=int(matches['asec']),
132 author_offset=parse_tz_offset(matches['atz']),
133 committer_name=matches['committer_name'],
134 committer_mail=matches['committer_mail'],
135 committer_sec=int(matches['csec']),
136 committer_offset=parse_tz_offset(matches['ctz']),
137 message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume a cat-pipe style iterator and return the object's data as bytes.

    The iterator's first item must be an (oidx, type, size) header
    triple; raise Exception when the header's type differs from
    expected_type.  The remaining items are data chunks, which are
    concatenated and returned.
    """
    header = next(cat_iterator)
    kind = header[1]
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    return b''.join(chunk for chunk in cat_iterator)
def get_commit_items(id, cp):
    """Return the parsed commit information for the given commit id.

    cp must behave like a CatPipe; its get() stream is validated to be
    a commit object before being handed to parse_commit.
    """
    commit_data = get_cat_data(cp.get(id), b'commit')
    return parse_commit(commit_data)
def _local_git_date_str(epoch_sec):
    """Format epoch_sec as git's "<seconds> <tzoffset>" using the local zone."""
    offset = utc_offset_str(epoch_sec)
    return b'%d %s' % (epoch_sec, offset)
153 def _git_date_str(epoch_sec, tz_offset_sec):
154 offs = tz_offset_sec // 60
155 return b'%d %s%02d%02d' \
157 b'+' if offs >= 0 else b'-',
162 def repo(sub = b'', repo_dir=None):
163 """Get the path to the git repository or one of its subdirectories."""
164 repo_dir = repo_dir or repodir
166 raise GitError('You should call check_repo_or_die()')
168 # If there's a .git subdirectory, then the actual repo is in there.
169 gd = os.path.join(repo_dir, b'.git')
170 if os.path.exists(gd):
173 return os.path.join(repo_dir, sub)
177 re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
180 return _shorten_hash_rx.sub(br'\1\2*\3', s)
184 full = os.path.abspath(path)
185 fullrepo = os.path.abspath(repo(b''))
186 if not fullrepo.endswith(b'/'):
188 if full.startswith(fullrepo):
189 path = full[len(fullrepo):]
190 if path.startswith(b'index-cache/'):
191 path = path[len(b'index-cache/'):]
192 return shorten_hash(path)
195 def auto_midx(objdir):
196 args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
198 rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
200 # make sure 'args' gets printed to help with debugging
201 add_error('%r: exception: %s' % (args, e))
204 add_error('%r: returned %d' % (args, rv))
206 args = [path.exe(), b'bloom', b'--dir', objdir]
208 rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
210 # make sure 'args' gets printed to help with debugging
211 add_error('%r: exception: %s' % (args, e))
214 add_error('%r: returned %d' % (args, rv))
217 def mangle_name(name, mode, gitmode):
218 """Mangle a file name to present an abstract name for segmented files.
219 Mangled file names will have the ".bup" extension added to them. If a
220 file's name already ends with ".bup", a ".bupl" extension is added to
221 disambiguate normal files from segmented ones.
223 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
224 assert(stat.S_ISDIR(gitmode))
225 return name + b'.bup'
226 elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
227 return name + b'.bupl'
232 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
233 def demangle_name(name, mode):
234 """Remove name mangling from a file name, if necessary.
236 The return value is a tuple (demangled_filename,mode), where mode is one of
239 * BUP_NORMAL : files that should be read as-is from the repository
240 * BUP_CHUNKED : files that were chunked and need to be reassembled
242 For more information on the name mangling algorithm, see mangle_name()
244 if name.endswith(b'.bupl'):
245 return (name[:-5], BUP_NORMAL)
246 elif name.endswith(b'.bup'):
247 return (name[:-4], BUP_CHUNKED)
248 elif name.endswith(b'.bupm'):
250 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
252 return (name, BUP_NORMAL)
255 def calc_hash(type, content):
256 """Calculate some content's hash in the Git fashion."""
257 header = b'%s %d\0' % (type, len(content))
263 def shalist_item_sort_key(ent):
264 (mode, name, id) = ent
265 assert(mode+0 == mode)
266 if stat.S_ISDIR(mode):
272 def tree_encode(shalist):
273 """Generate a git tree object from (mode,name,hash) tuples."""
274 shalist = sorted(shalist, key = shalist_item_sort_key)
276 for (mode,name,bin) in shalist:
278 assert(mode+0 == mode)
280 assert(len(bin) == 20)
281 s = b'%o %s\0%s' % (mode,name,bin)
282 assert s[0] != b'0' # 0-padded octal is not acceptable in a git tree
287 def tree_decode(buf):
288 """Generate a list of (mode,name,hash) from the git tree object in buf."""
290 while ofs < len(buf):
291 z = buf.find(b'\0', ofs)
293 spl = buf[ofs:z].split(b' ', 1)
294 assert(len(spl) == 2)
296 sha = buf[z+1:z+1+20]
298 yield (int(mode, 8), name, sha)
301 def _encode_packobj(type, content, compression_level=1):
302 if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
303 raise ValueError('invalid compression level %s' % compression_level)
306 szbits = (sz & 0x0f) | (_typemap[type]<<4)
309 if sz: szbits |= 0x80
310 szout += bytes_from_uint(szbits)
315 z = zlib.compressobj(compression_level)
317 yield z.compress(content)
321 def _decode_packobj(buf):
324 type = _typermap[(c & 0x70) >> 4]
331 sz |= (c & 0x7f) << shift
335 return (type, zlib.decompress(buf[i+1:]))
342 def find_offset(self, hash):
343 """Get the offset of an object inside the index file."""
344 idx = self._idx_from_hash(hash)
346 return self._ofs_from_idx(idx)
349 def exists(self, hash, want_source=False):
350 """Return nonempty if the object exists in this index."""
351 if hash and (self._idx_from_hash(hash) != None):
352 return want_source and os.path.basename(self.name) or True
355 def _idx_from_hash(self, hash):
356 global _total_searches, _total_steps
358 assert(len(hash) == 20)
359 b1 = byte_int(hash[0])
360 start = self.fanout[b1-1] # range -1..254
361 end = self.fanout[b1] # range 0..255
363 _total_steps += 1 # lookup table is a step
366 mid = start + (end - start) // 2
367 v = self._idx_to_hash(mid)
377 class PackIdxV1(PackIdx):
378 """Object representation of a Git pack index (version 1) file."""
379 def __init__(self, filename, f):
381 self.idxnames = [self.name]
382 self.map = mmap_read(f)
383 # Min size for 'L' is 4, which is sufficient for struct's '!I'
384 self.fanout = array('L', struct.unpack('!256I', self.map))
385 self.fanout.append(0) # entry "-1"
386 self.nsha = self.fanout[255]
387 self.sha_ofs = 256 * 4
388 # Avoid slicing shatable for individual hashes (very high overhead)
389 self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
394 def __exit__(self, type, value, traceback):
398 return int(self.nsha) # int() from long for python 2
400 def _ofs_from_idx(self, idx):
401 if idx >= self.nsha or idx < 0:
402 raise IndexError('invalid pack index index %d' % idx)
403 ofs = self.sha_ofs + idx * 24
404 return struct.unpack_from('!I', self.map, offset=ofs)[0]
406 def _idx_to_hash(self, idx):
407 if idx >= self.nsha or idx < 0:
408 raise IndexError('invalid pack index index %d' % idx)
409 ofs = self.sha_ofs + idx * 24 + 4
410 return self.map[ofs : ofs + 20]
413 start = self.sha_ofs + 4
414 for ofs in range(start, start + 24 * self.nsha, 24):
415 yield self.map[ofs : ofs + 20]
418 if self.map is not None:
424 class PackIdxV2(PackIdx):
425 """Object representation of a Git pack index (version 2) file."""
426 def __init__(self, filename, f):
428 self.idxnames = [self.name]
429 self.map = mmap_read(f)
430 assert self.map[0:8] == b'\377tOc\0\0\0\2'
431 # Min size for 'L' is 4, which is sufficient for struct's '!I'
432 self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
433 self.fanout.append(0)
434 self.nsha = self.fanout[255]
435 self.sha_ofs = 8 + 256*4
436 self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
437 self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
438 # Avoid slicing this for individual hashes (very high overhead)
439 self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
444 def __exit__(self, type, value, traceback):
448 return int(self.nsha) # int() from long for python 2
450 def _ofs_from_idx(self, idx):
451 if idx >= self.nsha or idx < 0:
452 raise IndexError('invalid pack index index %d' % idx)
453 ofs_ofs = self.ofstable_ofs + idx * 4
454 ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
456 idx64 = ofs & 0x7fffffff
457 ofs64_ofs = self.ofs64table_ofs + idx64 * 8
458 ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
461 def _idx_to_hash(self, idx):
462 if idx >= self.nsha or idx < 0:
463 raise IndexError('invalid pack index index %d' % idx)
464 ofs = self.sha_ofs + idx * 20
465 return self.map[ofs : ofs + 20]
469 for ofs in range(start, start + 20 * self.nsha, 20):
470 yield self.map[ofs : ofs + 20]
473 if self.map is not None:
481 def __init__(self, dir, ignore_midx=False):
483 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
488 self.do_bloom = False
490 self.ignore_midx = ignore_midx
496 assert(_mpi_count == 0)
499 return iter(idxmerge(self.packs))
502 return sum(len(pack) for pack in self.packs)
504 def exists(self, hash, want_source=False):
505 """Return nonempty if the object exists in the index files."""
506 global _total_searches
508 if hash in self.also:
510 if self.do_bloom and self.bloom:
511 if self.bloom.exists(hash):
512 self.do_bloom = False
514 _total_searches -= 1 # was counted by bloom
516 for i in range(len(self.packs)):
518 _total_searches -= 1 # will be incremented by sub-pack
519 ix = p.exists(hash, want_source=want_source)
521 # reorder so most recently used packs are searched first
522 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
527 def refresh(self, skip_midx = False):
528 """Refresh the index list.
529 This method verifies if .midx files were superseded (e.g. all of its
530 contents are in another, bigger .midx file) and removes the superseded
533 If skip_midx is True, all work on .midx files will be skipped and .midx
534 files will be removed from the list.
536 The instance variable 'ignore_midx' can force this function to
537 always act as if skip_midx was True.
539 if self.bloom is not None:
541             self.bloom = None # Always reopen the bloom as it may have been replaced
542 self.do_bloom = False
543 skip_midx = skip_midx or self.ignore_midx
544 d = dict((p.name, p) for p in self.packs
545 if not skip_midx or not isinstance(p, midx.PackMidx))
546 if os.path.exists(self.dir):
549 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
550 # remove any *.midx files from our list that no longer exist
551 for ix in list(d.values()):
552 if not isinstance(ix, midx.PackMidx):
554 if ix.name in midxes:
559 self.packs.remove(ix)
560 for ix in self.packs:
561 if isinstance(ix, midx.PackMidx):
562 for name in ix.idxnames:
563 d[os.path.join(self.dir, name)] = ix
566 mx = midx.PackMidx(full)
567 (mxd, mxf) = os.path.split(mx.name)
569 for n in mx.idxnames:
570 if not os.path.exists(os.path.join(mxd, n)):
571 log(('warning: index %s missing\n'
573 % (path_msg(n), path_msg(mxf)))
581 midxl.sort(key=lambda ix:
582 (-len(ix), -xstat.stat(ix.name).st_mtime))
585 for sub in ix.idxnames:
586 found = d.get(os.path.join(self.dir, sub))
587 if not found or isinstance(found, PackIdx):
588 # doesn't exist, or exists but not in a midx
593 for name in ix.idxnames:
594 d[os.path.join(self.dir, name)] = ix
595 elif not ix.force_keep:
596 debug1('midx: removing redundant: %s\n'
597 % path_msg(os.path.basename(ix.name)))
600 for full in glob.glob(os.path.join(self.dir, b'*.idx')):
604 except GitError as e:
608 bfull = os.path.join(self.dir, b'bup.bloom')
609 if self.bloom is None and os.path.exists(bfull):
610 self.bloom = bloom.ShaBloom(bfull)
611 self.packs = list(set(d.values()))
612 self.packs.sort(reverse=True, key=lambda x: len(x))
613 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
617 debug1('PackIdxList: using %d index%s.\n'
618 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
621 """Insert an additional object in the list."""
625 def open_idx(filename):
626 if filename.endswith(b'.idx'):
627 f = open(filename, 'rb')
629 if header[0:4] == b'\377tOc':
630 version = struct.unpack('!I', header[4:8])[0]
632 return PackIdxV2(filename, f)
634 raise GitError('%s: expected idx file version 2, got %d'
635 % (path_msg(filename), version))
636 elif len(header) == 8 and header[0:4] < b'\377tOc':
637 return PackIdxV1(filename, f)
639 raise GitError('%s: unrecognized idx file header'
640 % path_msg(filename))
641 elif filename.endswith(b'.midx'):
642 return midx.PackMidx(filename)
644 raise GitError('idx filenames must end with .idx or .midx')
647 def idxmerge(idxlist, final_progress=True):
648 """Generate a list of all the objects reachable in a PackIdxList."""
649 def pfunc(count, total):
650 qprogress('Reading indexes: %.2f%% (%d/%d)\r'
651 % (count*100.0/total, count, total))
652 def pfinal(count, total):
654 progress('Reading indexes: %.2f%% (%d/%d), done.\n'
655 % (100, total, total))
656 return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Build the default PackWriter object cache over the repo's pack indexes."""
    pack_dir = repo(b'objects/pack')
    return PackIdxList(pack_dir)
662 # bup-gc assumes that it can disable all PackWriter activities
663 # (bloom/midx/cache) via the constructor and close() arguments.
666 """Writes Git objects inside a pack file."""
667 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
668 run_midx=True, on_pack_finish=None,
669 max_pack_size=None, max_pack_objects=None, repo_dir=None):
670 self.repo_dir = repo_dir or repo()
677 self.objcache_maker = objcache_maker
679 self.compression_level = compression_level
680 self.run_midx=run_midx
681 self.on_pack_finish = on_pack_finish
682 if not max_pack_size:
683 max_pack_size = git_config_get(b'pack.packSizeLimit',
684 repo_dir=self.repo_dir)
685 if max_pack_size is not None:
686 max_pack_size = parse_num(max_pack_size)
687 if not max_pack_size:
688 # larger packs slow down pruning
689 max_pack_size = 1000 * 1000 * 1000
690 self.max_pack_size = max_pack_size
691 # cache memory usage is about 83 bytes per object
692 self.max_pack_objects = max_pack_objects if max_pack_objects \
693 else max(1, self.max_pack_size // 5000)
701 def __exit__(self, type, value, traceback):
706 objdir = dir = os.path.join(self.repo_dir, b'objects')
707 fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
709 self.file = os.fdopen(fd, 'w+b')
714 self.parentfd = os.open(objdir, os.O_RDONLY)
720 assert name.endswith(b'.pack')
721 self.filename = name[:-5]
722 self.file.write(b'PACK\0\0\0\2\0\0\0\0')
723 self.idx = PackIdxV2Writer()
725 def _raw_write(self, datalist, sha):
728 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
729 # the file never has a *partial* blob. So let's make sure it's
730 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
731 # to our hashsplit algorithm.) f.write() does its own buffering,
732 # but that's okay because we'll flush it in _end().
733 oneblob = b''.join(datalist)
739 crc = zlib.crc32(oneblob) & 0xffffffff
740 self._update_idx(sha, crc, nw)
745 def _update_idx(self, sha, crc, size):
748 self.idx.add(sha, crc, self.file.tell() - size)
750 def _write(self, sha, type, content):
754 sha = calc_hash(type, content)
755 size, crc = self._raw_write(_encode_packobj(type, content,
756 self.compression_level),
758 if self.outbytes >= self.max_pack_size \
759 or self.count >= self.max_pack_objects:
763 def breakpoint(self):
764 """Clear byte and object counts and return the last processed id."""
765 id = self._end(self.run_midx)
766 self.outbytes = self.count = 0
769 def _require_objcache(self):
770 if self.objcache is None and self.objcache_maker:
771 self.objcache = self.objcache_maker()
772 if self.objcache is None:
774 "PackWriter not opened or can't check exists w/o objcache")
776 def exists(self, id, want_source=False):
777 """Return non-empty if an object is found in the object cache."""
778 self._require_objcache()
779 return self.objcache.exists(id, want_source=want_source)
781 def just_write(self, sha, type, content):
782 """Write an object to the pack file without checking for duplication."""
783 self._write(sha, type, content)
784 # If nothing else, gc doesn't have/want an objcache
785 if self.objcache is not None:
786 self.objcache.add(sha)
788 def maybe_write(self, type, content):
789 """Write an object to the pack file if not present and return its id."""
790 sha = calc_hash(type, content)
791 if not self.exists(sha):
792 self._require_objcache()
793 self.just_write(sha, type, content)
796 def new_blob(self, blob):
797 """Create a blob object in the pack with the supplied content."""
798 return self.maybe_write(b'blob', blob)
800 def new_tree(self, shalist):
801 """Create a tree object in the pack."""
802 content = tree_encode(shalist)
803 return self.maybe_write(b'tree', content)
805 def new_commit(self, tree, parent,
806 author, adate_sec, adate_tz,
807 committer, cdate_sec, cdate_tz,
809 """Create a commit object in the pack. The date_sec values must be
810 epoch-seconds, and if a tz is None, the local timezone is assumed."""
811 if adate_tz is not None:
812 adate_str = _git_date_str(adate_sec, adate_tz)
814 adate_str = _local_git_date_str(adate_sec)
815 if cdate_tz is not None:
816 cdate_str = _git_date_str(cdate_sec, cdate_tz)
818 cdate_str = _local_git_date_str(cdate_sec)
820 if tree: l.append(b'tree %s' % hexlify(tree))
821 if parent: l.append(b'parent %s' % hexlify(parent))
822 if author: l.append(b'author %s %s' % (author, adate_str))
823 if committer: l.append(b'committer %s %s' % (committer, cdate_str))
826 return self.maybe_write(b'commit', b'\n'.join(l))
829 """Remove the pack file from disk."""
838 os.unlink(self.filename + b'.pack')
845 def _end(self, run_midx=True):
847 if not f: return None
854 # update object count
856 cp = struct.pack('!i', self.count)
860 # calculate the pack sha1sum
863 for b in chunkyreader(f):
865 packbin = sum.digest()
867 fdatasync(f.fileno())
871 obj_list_sha = idx.write(self.filename + b'.idx', packbin)
872 nameprefix = os.path.join(self.repo_dir,
873 b'objects/pack/pack-' + obj_list_sha)
874 if os.path.exists(self.filename + b'.map'):
875 os.unlink(self.filename + b'.map')
876 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
877 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
879 os.fsync(self.parentfd)
881 os.close(self.parentfd)
884 auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
886 if self.on_pack_finish:
887 self.on_pack_finish(nameprefix)
891 def close(self, run_midx=True):
892 """Close the pack file and move it to its definitive path."""
893 return self._end(run_midx=run_midx)
896 class PackIdxV2Writer:
898 self.idx = list(list() for i in range(256))
901 def add(self, sha, crc, offs):
904 self.idx[byte_int(sha[0])].append((sha, crc, offs))
906 def write(self, filename, packbin):
908 for section in self.idx:
909 for entry in section:
910 if entry[2] >= 2**31:
913 # Length: header + fan-out + shas-and-crcs + overflow-offsets
914 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
916 idx_f = open(filename, 'w+b')
918 idx_f.truncate(index_len)
919 fdatasync(idx_f.fileno())
920 idx_map = mmap_readwrite(idx_f, close=False)
922 count = _helpers.write_idx(filename, idx_map, self.idx,
924 assert(count == self.count)
931 idx_f = open(filename, 'a+b')
936 b = idx_f.read(8 + 4*256)
939 obj_list_sum = Sha1()
940 for b in chunkyreader(idx_f, 20 * self.count):
942 obj_list_sum.update(b)
943 namebase = hexlify(obj_list_sum.digest())
945 for b in chunkyreader(idx_f):
947 idx_f.write(idx_sum.digest())
948 fdatasync(idx_f.fileno())
954 def list_refs(patterns=None, repo_dir=None,
955 limit_to_heads=False, limit_to_tags=False):
956 """Yield (refname, hash) tuples for all repository refs unless
957 patterns are specified. In that case, only include tuples for
958 refs matching those patterns (cf. git-show-ref(1)). The limits
959 restrict the result items to refs/heads or refs/tags. If both
960 limits are specified, items from both sources will be included.
963 argv = [b'git', b'show-ref']
965 argv.append(b'--heads')
967 argv.append(b'--tags')
970 argv.extend(patterns)
971 p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
973 out = p.stdout.read().strip()
974 rv = p.wait() # not fatal
978 for d in out.split(b'\n'):
979 sha, name = d.split(b' ', 1)
980 yield name, unhexlify(sha)
983 def read_ref(refname, repo_dir = None):
984 """Get the commit id of the most recent commit made on a given ref."""
985 refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
986 l = tuple(islice(refs, 2))
994 def rev_list_invocation(ref_or_refs, format=None):
995 if isinstance(ref_or_refs, bytes):
996 refs = (ref_or_refs,)
999 argv = [b'git', b'rev-list']
1002 argv.append(b'--pretty=format:' + format)
1004 assert not ref.startswith(b'-')
1010 def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
1011 """Yield information about commits as per "git rev-list". If a format
1012 is not provided, yield one hex hash at a time. If a format is
1013 provided, pass it to rev-list and call parse(git_stdout) for each
1014 commit with the stream positioned just after the rev-list "commit
1015 HASH" header line. When a format is provided yield (oidx,
1016 parse(git_stdout)) for each commit.
1019 assert bool(parse) == bool(format)
1020 p = subprocess.Popen(rev_list_invocation(ref_or_refs,
1022 env=_gitenv(repo_dir),
1023 stdout = subprocess.PIPE,
1026 for line in p.stdout:
1029 line = p.stdout.readline()
1032 if not s.startswith(b'commit '):
1033 raise Exception('unexpected line ' + repr(s))
1036 yield s, parse(p.stdout)
1037 line = p.stdout.readline()
1039 rv = p.wait() # not fatal
1041 raise GitError('git rev-list returned error %d' % rv)
1044 def get_commit_dates(refs, repo_dir=None):
1045 """Get the dates for the specified commit refs. For now, every unique
1046 string in refs must resolve to a different commit or this
1047 function will fail."""
1050 commit = get_commit_items(ref, cp(repo_dir))
1051 result.append(commit.author_sec)
1055 def rev_parse(committish, repo_dir=None):
1056 """Resolve the full hash for 'committish', if it exists.
1058 Should be roughly equivalent to 'git rev-parse'.
1060 Returns the hex value of the hash if it is found, None if 'committish' does
1061 not correspond to anything.
1063 head = read_ref(committish, repo_dir=repo_dir)
1065 debug2("resolved from ref: commit = %s\n" % hexlify(head))
1068 pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))
1070 if len(committish) == 40:
1072 hash = unhexlify(committish)
1082 def update_ref(refname, newval, oldval, repo_dir=None):
1083 """Update a repository reference."""
1086 assert refname.startswith(b'refs/heads/') \
1087 or refname.startswith(b'refs/tags/')
1088 p = subprocess.Popen([b'git', b'update-ref', refname,
1089 hexlify(newval), hexlify(oldval)],
1090 env=_gitenv(repo_dir),
1092 _git_wait(b'git update-ref', p)
1095 def delete_ref(refname, oldvalue=None):
1096 """Delete a repository reference (see git update-ref(1))."""
1097 assert refname.startswith(b'refs/')
1098 oldvalue = [] if not oldvalue else [oldvalue]
1099 p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
1102 _git_wait('git update-ref', p)
1105 def guess_repo(path=None):
1106 """Set the path value in the global variable "repodir".
1107 This makes bup look for an existing bup repository, but not fail if a
1108 repository doesn't exist. Usually, if you are interacting with a bup
1109 repository, you would not be calling this function but using
1110 check_repo_or_die().
1116 repodir = environ.get(b'BUP_DIR')
1118 repodir = os.path.expanduser(b'~/.bup')
1121 def init_repo(path=None):
1122 """Create the Git bare repository for bup in a given path."""
1124 d = repo() # appends a / to the path
1125 parent = os.path.dirname(os.path.dirname(d))
1126 if parent and not os.path.exists(parent):
1127 raise GitError('parent directory "%s" does not exist\n'
1129 if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
1130 raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
1131 p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
1134 _git_wait('git init', p)
1135 # Force the index version configuration in order to ensure bup works
1136 # regardless of the version of the installed Git binary.
1137 p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
1138 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1139 _git_wait('git config', p)
1141 p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
1142 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1143 _git_wait('git config', p)
1146 def check_repo_or_die(path=None):
1147 """Check to see if a bup repository probably exists, and abort if not."""
1150 pst = stat_if_exists(top + b'/objects/pack')
1151 if pst and stat.S_ISDIR(pst.st_mode):
1154 top_st = stat_if_exists(top)
1156 log('error: repository %r does not exist (see "bup help init")\n'
1159 log('error: %s is not a repository\n' % path_msg(top))
1163 def is_suitable_git(ver_str):
1164 if not ver_str.startswith(b'git version '):
1165 return 'unrecognized'
1166 ver_str = ver_str[len(b'git version '):]
1167 if ver_str.startswith(b'0.'):
1168 return 'insufficient'
1169 if ver_str.startswith(b'1.'):
1170 if re.match(br'1\.[012345]rc', ver_str):
1171 return 'insufficient'
1172 if re.match(br'1\.[01234]\.', ver_str):
1173 return 'insufficient'
1174 if re.match(br'1\.5\.[012345]($|\.)', ver_str):
1175 return 'insufficient'
1176 if re.match(br'1\.5\.6-rc', ver_str):
1177 return 'insufficient'
1179 if re.match(br'[0-9]+(\.|$)?', ver_str):
1185 def require_suitable_git(ver_str=None):
1186 """Raise GitError if the version of git isn't suitable.
1188 Rely on ver_str when provided, rather than invoking the git in the
1193 if _git_great is not None:
1195 if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
1196 in (b'yes', b'true', b'1'):
1200 ver_str, _, _ = _git_exo([b'git', b'--version'])
1201 status = is_suitable_git(ver_str)
1202 if status == 'unrecognized':
1203 raise GitError('Unexpected git --version output: %r' % ver_str)
1204 if status == 'insufficient':
1205 log('error: git version must be at least 1.5.6\n')
1207 if status == 'suitable':
1213 class _AbortableIter:
1214 def __init__(self, it, onabort = None):
1216 self.onabort = onabort
1224 return next(self.it)
1225 except StopIteration as e:
1235 """Abort iteration and call the abortion callback, if needed."""
1246 """Link to 'git cat-file' that is used to retrieve blob data."""
1247 def __init__(self, repo_dir = None):
1248 require_suitable_git()
1249 self.repo_dir = repo_dir
1250 self.p = self.inprogress = None
1252 def close(self, wait=False):
1258 self.inprogress = None
1265 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1266 stdin=subprocess.PIPE,
1267 stdout=subprocess.PIPE,
1270 env=_gitenv(self.repo_dir))
1273 """Yield (oidx, type, size), followed by the data referred to by ref.
1274 If ref does not exist, only yield (None, None, None).
1277 if not self.p or self.p.poll() != None:
1280 poll_result = self.p.poll()
1281 assert(poll_result == None)
1283 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1284 assert(not self.inprogress)
1285 assert ref.find(b'\n') < 0
1286 assert ref.find(b'\r') < 0
1287 assert not ref.startswith(b'-')
1288 self.inprogress = ref
1289 self.p.stdin.write(ref + b'\n')
1290 self.p.stdin.flush()
1291 hdr = self.p.stdout.readline()
1292 if hdr.endswith(b' missing\n'):
1293 self.inprogress = None
1294 yield None, None, None
1296 info = hdr.split(b' ')
1297 if len(info) != 3 or len(info[0]) != 40:
1298 raise GitError('expected object (id, type, size), got %r' % info)
1299 oidx, typ, size = info
1301 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1304 yield oidx, typ, size
1307 readline_result = self.p.stdout.readline()
1308 assert readline_result == b'\n'
1309 self.inprogress = None
1310 except Exception as e:
1314 def _join(self, it):
1315 _, typ, _ = next(it)
1319 elif typ == b'tree':
1320 treefile = b''.join(it)
1321 for (mode, name, sha) in tree_decode(treefile):
1322 for blob in self.join(hexlify(sha)):
1324 elif typ == b'commit':
1325 treeline = b''.join(it).split(b'\n')[0]
1326 assert treeline.startswith(b'tree ')
1327 for blob in self.join(treeline[5:]):
1330 raise GitError('invalid object type %r: expected blob/tree/commit'
1334 """Generate a list of the content of all blobs that can be reached
1335 from an object. The hash given in 'id' must point to a blob, a tree
1336 or a commit. The content of all blobs that can be seen from trees or
1337 commits will be added to the list.
1339 for d in self._join(self.get(id)):
1345 def cp(repo_dir=None):
1346 """Create a CatPipe object or reuse the already existing one."""
1349 repo_dir = repodir or repo()
1350 repo_dir = os.path.abspath(repo_dir)
1351 cp = _cp.get(repo_dir)
1353 cp = CatPipe(repo_dir)
1358 def close_catpipes():
1359 # FIXME: chain exceptions
1361 _, cp = _cp.popitem()
1365 def tags(repo_dir = None):
1366 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1368 for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
1369 assert n.startswith(b'refs/tags/')
1373 tags[c].append(name) # more than one tag can point at 'c'
1377 class MissingObject(KeyError):
1378 def __init__(self, oid):
1380 KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
1383 WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
1384 'path', 'chunk_path', 'data'])
1385 # The path is the mangled path, and if an item represents a fragment
1386 # of a chunked file, the chunk_path will be the chunked subtree path
1387 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1388 # chunked file will have a chunk_path of ['']. So some chunk subtree
1389 # of the file '/foo/bar/baz' might look like this:
1391 # item.path = ['foo', 'bar', 'baz.bup']
1392 # item.chunk_path = ['', '2d3115e', '016b097']
1393 # item.type = 'tree'
1397 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1398 """Yield everything reachable from oidx via get_ref (which must behave
1399 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1400 returns true. Throw MissingObject if a hash encountered is
1401 missing from the repository, and don't read or return blob content
1402 in the data field unless include_data is set.
1405 # Maintain the pending stack on the heap to avoid stack overflow
1406 pending = [(oidx, [], [], None)]
1408 oidx, parent_path, chunk_path, mode = pending.pop()
1409 oid = unhexlify(oidx)
1410 if stop_at and stop_at(oidx):
1413 if (not include_data) and mode and stat.S_ISREG(mode):
1414 # If the object is a "regular file", then it's a leaf in
1415 # the graph, so we can skip reading the data if the caller
1416 # hasn't requested it.
1417 yield WalkItem(oid=oid, type=b'blob',
1418 chunk_path=chunk_path, path=parent_path,
1423 item_it = get_ref(oidx)
1424 get_oidx, typ, _ = next(item_it)
1426 raise MissingObject(unhexlify(oidx))
1427 if typ not in (b'blob', b'commit', b'tree'):
1428 raise Exception('unexpected repository object type %r' % typ)
1430 # FIXME: set the mode based on the type when the mode is None
1431 if typ == b'blob' and not include_data:
1432 # Dump data until we can ask cat_pipe not to fetch it
1433 for ignored in item_it:
1437 data = b''.join(item_it)
1439 yield WalkItem(oid=oid, type=typ,
1440 chunk_path=chunk_path, path=parent_path,
1442 data=(data if include_data else None))
1444 if typ == b'commit':
1445 commit_items = parse_commit(data)
1446 for pid in commit_items.parents:
1447 pending.append((pid, parent_path, chunk_path, mode))
1448 pending.append((commit_items.tree, parent_path, chunk_path,
1449 hashsplit.GIT_MODE_TREE))
1450 elif typ == b'tree':
1451 for mode, name, ent_id in tree_decode(data):
1452 demangled, bup_type = demangle_name(name, mode)
1454 sub_path = parent_path
1455 sub_chunk_path = chunk_path + [name]
1457 sub_path = parent_path + [name]
1458 if bup_type == BUP_CHUNKED:
1459 sub_chunk_path = [b'']
1461 sub_chunk_path = chunk_path
1462 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,