1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
12 from numbers import Integral
14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
15 from bup.compat import (buffer,
16 byte_int, bytes_from_byte, bytes_from_uint,
21 from bup.io import path_msg
22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
25 hostname, localtime, log,
28 mmap_read, mmap_readwrite,
30 progress, qprogress, stat_if_exists,
33 from bup.pwdgrp import username, userfullname
37 repodir = None # The default repository, once initialized
39 _typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
40 _typermap = {v: k for k, v in items(_typemap)}
47 class GitError(Exception):
51 def _gitenv(repo_dir=None):
54 return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
56 def _git_wait(cmd, p):
59 raise GitError('%r returned %d' % (cmd, rv))
61 def _git_exo(cmd, **kwargs):
62 kwargs['check'] = False
63 result = exo(cmd, **kwargs)
65 if proc.returncode != 0:
66 raise GitError('%r returned %d' % (cmd, proc.returncode))
69 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
70 assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
71 cmd = [b'git', b'config', b'--null']
73 cmd.extend([b'--file', cfg_file])
75 cmd.extend([b'--int'])
76 elif opttype == 'bool':
77 cmd.extend([b'--bool'])
79 assert opttype is None
80 cmd.extend([b'--get', option])
83 env = _gitenv(repo_dir=repo_dir)
84 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
86 # with --null, git writes out a trailing \0 after the value
87 r = p.stdout.read()[:-1]
92 elif opttype == 'bool':
93 # git converts to 'true' or 'false'
97 raise GitError('%r returned %d' % (cmd, rc))
def parse_tz_offset(s):
    """UTC offset in seconds.

    s is a git-style timezone field like b'+0130' or b'-0500'
    (sign byte, two hour digits, two minute digits).
    """
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    # Compare a one-byte slice so this works whether s is indexed as
    # ints (Python 3 bytes) or chars (Python 2 str).
    if s[0:1] == b'-':
        return - tz_off
    return tz_off
109 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
110 # Make sure that's authoritative.
111 _start_end_char = br'[^ .,:;<>"\'\0\n]'
112 _content_char = br'[^\0\n<>]'
113 _safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
115 _start_end_char, _content_char, _start_end_char)
116 _tz_rx = br'[-+]\d\d[0-5]\d'
117 _parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
118 # Assumes every following line starting with a space is part of the
119 # mergetag. Is there a formal commit blob spec?
120 _mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
121 _commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
122 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
123 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
125 (?P<message>(?:.|\n)*)''' % (_parent_rx,
126 _safe_str_rx, _safe_str_rx, _tz_rx,
127 _safe_str_rx, _safe_str_rx, _tz_rx,
129 _parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
131 # Note that the author_sec and committer_sec values are (UTC) epoch
132 # seconds, and for now the mergetag is not included.
133 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
134 'author_name', 'author_mail',
135 'author_sec', 'author_offset',
136 'committer_name', 'committer_mail',
137 'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse a raw git commit blob and return a CommitInfo namedtuple.

    The author/committer seconds are returned as ints (UTC epoch
    seconds) and the timezone fields as signed offsets in seconds via
    parse_tz_offset().  Raises Exception if content does not match the
    commit grammar in _commit_rx.
    """
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Drain a cat-pipe style iterator and return its concatenated data.

    The first item produced by cat_iterator must be an
    (oidx, type, size) header whose type equals expected_type; every
    remaining item is a bytes chunk of the object's content.
    """
    oidx, kind, size = next(cat_iterator)
    if kind == expected_type:
        return b''.join(cat_iterator)
    raise Exception('expected %r, saw %r' % (expected_type, kind))
def get_commit_items(id, cp):
    """Fetch commit 'id' through cat-pipe 'cp' and return its CommitInfo."""
    commit_it = cp.get(id)
    raw = get_cat_data(commit_it, b'commit')
    return parse_commit(raw)
def _local_git_date_str(epoch_sec):
    """Return b'<epoch_sec> <offset>' using the local timezone offset."""
    offset = utc_offset_str(epoch_sec)
    return b'%d %s' % (epoch_sec, offset)
171 def _git_date_str(epoch_sec, tz_offset_sec):
172 offs = tz_offset_sec // 60
173 return b'%d %s%02d%02d' \
175 b'+' if offs >= 0 else b'-',
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
        repo_dir = gd

    return os.path.join(repo_dir, sub)
195 re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
198 return _shorten_hash_rx.sub(br'\1\2*\3', s)
202 full = os.path.abspath(path)
203 fullrepo = os.path.abspath(repo(b''))
204 if not fullrepo.endswith(b'/'):
206 if full.startswith(fullrepo):
207 path = full[len(fullrepo):]
208 if path.startswith(b'index-cache/'):
209 path = path[len(b'index-cache/'):]
210 return shorten_hash(path)
213 def auto_midx(objdir):
214 args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
216 rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
218 # make sure 'args' gets printed to help with debugging
219 add_error('%r: exception: %s' % (args, e))
222 add_error('%r: returned %d' % (args, rv))
224 args = [path.exe(), b'bloom', b'--dir', objdir]
226 rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
228 # make sure 'args' gets printed to help with debugging
229 add_error('%r: exception: %s' % (args, e))
232 add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # A regular file stored as a git tree means it was chunked.
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    else:
        return name
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one
    of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        # .bupm holds metadata; whether its data is chunked depends on
        # whether the entry itself is a (chunked) tree.
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    else:
        return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion.

    The hash is sha1 over b'<type> <len>\\0' followed by the content,
    and the binary (not hex) digest is returned.
    """
    header = b'%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
def shalist_item_sort_key(ent):
    """Return the git sort key for a (mode, name, hash) tree entry.

    Git sorts tree entries as though directory names carried a
    trailing '/'.
    """
    (mode, name, id) = ent
    assert(mode+0 == mode)  # must be an integer mode
    if stat.S_ISDIR(mode):
        return name + b'/'
    else:
        return name


def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode,name,bin)
        # 0-padded octal is not acceptable in a git tree.  Compare a
        # one-byte slice: on Python 3, s[0] is an int and would never
        # equal b'0', silently disabling this check.
        assert s[0:1] != b'0'
        l.append(s)
    return b''.join(l)
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        # Each entry is b'<octal mode> <name>\0' followed by a 20-byte
        # binary sha1.
        z = buf.find(b'\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        mode,name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)
319 def _encode_packobj(type, content, compression_level=1):
320 if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
321 raise ValueError('invalid compression level %s' % compression_level)
324 szbits = (sz & 0x0f) | (_typemap[type]<<4)
327 if sz: szbits |= 0x80
328 szout += bytes_from_uint(szbits)
333 z = zlib.compressobj(compression_level)
335 yield z.compress(content)
339 def _decode_packobj(buf):
342 type = _typermap[(c & 0x70) >> 4]
349 sz |= (c & 0x7f) << shift
353 return (type, zlib.decompress(buf[i+1:]))
360 def find_offset(self, hash):
361 """Get the offset of an object inside the index file."""
362 idx = self._idx_from_hash(hash)
364 return self._ofs_from_idx(idx)
367 def exists(self, hash, want_source=False):
368 """Return nonempty if the object exists in this index."""
369 if hash and (self._idx_from_hash(hash) != None):
370 return want_source and os.path.basename(self.name) or True
373 def _idx_from_hash(self, hash):
374 global _total_searches, _total_steps
376 assert(len(hash) == 20)
377 b1 = byte_int(hash[0])
378 start = self.fanout[b1-1] # range -1..254
379 end = self.fanout[b1] # range 0..255
381 _total_steps += 1 # lookup table is a step
384 mid = start + (end - start) // 2
385 v = self._idx_to_hash(mid)
395 class PackIdxV1(PackIdx):
396 """Object representation of a Git pack index (version 1) file."""
397 def __init__(self, filename, f):
399 self.idxnames = [self.name]
400 self.map = mmap_read(f)
401 # Min size for 'L' is 4, which is sufficient for struct's '!I'
402 self.fanout = array('L', struct.unpack('!256I', self.map))
403 self.fanout.append(0) # entry "-1"
404 self.nsha = self.fanout[255]
405 self.sha_ofs = 256 * 4
406 # Avoid slicing shatable for individual hashes (very high overhead)
407 self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
412 def __exit__(self, type, value, traceback):
416 return int(self.nsha) # int() from long for python 2
418 def _ofs_from_idx(self, idx):
419 if idx >= self.nsha or idx < 0:
420 raise IndexError('invalid pack index index %d' % idx)
421 ofs = self.sha_ofs + idx * 24
422 return struct.unpack_from('!I', self.map, offset=ofs)[0]
424 def _idx_to_hash(self, idx):
425 if idx >= self.nsha or idx < 0:
426 raise IndexError('invalid pack index index %d' % idx)
427 ofs = self.sha_ofs + idx * 24 + 4
428 return self.map[ofs : ofs + 20]
431 start = self.sha_ofs + 4
432 for ofs in range(start, start + 24 * self.nsha, 24):
433 yield self.map[ofs : ofs + 20]
436 if self.map is not None:
442 class PackIdxV2(PackIdx):
443 """Object representation of a Git pack index (version 2) file."""
444 def __init__(self, filename, f):
446 self.idxnames = [self.name]
447 self.map = mmap_read(f)
448 assert self.map[0:8] == b'\377tOc\0\0\0\2'
449 # Min size for 'L' is 4, which is sufficient for struct's '!I'
450 self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
451 self.fanout.append(0)
452 self.nsha = self.fanout[255]
453 self.sha_ofs = 8 + 256*4
454 self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
455 self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
456 # Avoid slicing this for individual hashes (very high overhead)
457 self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
462 def __exit__(self, type, value, traceback):
466 return int(self.nsha) # int() from long for python 2
468 def _ofs_from_idx(self, idx):
469 if idx >= self.nsha or idx < 0:
470 raise IndexError('invalid pack index index %d' % idx)
471 ofs_ofs = self.ofstable_ofs + idx * 4
472 ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
474 idx64 = ofs & 0x7fffffff
475 ofs64_ofs = self.ofs64table_ofs + idx64 * 8
476 ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
479 def _idx_to_hash(self, idx):
480 if idx >= self.nsha or idx < 0:
481 raise IndexError('invalid pack index index %d' % idx)
482 ofs = self.sha_ofs + idx * 20
483 return self.map[ofs : ofs + 20]
487 for ofs in range(start, start + 20 * self.nsha, 20):
488 yield self.map[ofs : ofs + 20]
491 if self.map is not None:
499 def __init__(self, dir, ignore_midx=False):
501 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
506 self.do_bloom = False
508 self.ignore_midx = ignore_midx
514 assert(_mpi_count == 0)
517 return iter(idxmerge(self.packs))
520 return sum(len(pack) for pack in self.packs)
522 def exists(self, hash, want_source=False):
523 """Return nonempty if the object exists in the index files."""
524 global _total_searches
526 if hash in self.also:
528 if self.do_bloom and self.bloom:
529 if self.bloom.exists(hash):
530 self.do_bloom = False
532 _total_searches -= 1 # was counted by bloom
534 for i in range(len(self.packs)):
536 _total_searches -= 1 # will be incremented by sub-pack
537 ix = p.exists(hash, want_source=want_source)
539 # reorder so most recently used packs are searched first
540 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
545 def refresh(self, skip_midx = False):
546 """Refresh the index list.
547 This method verifies if .midx files were superseded (e.g. all of its
548 contents are in another, bigger .midx file) and removes the superseded
551 If skip_midx is True, all work on .midx files will be skipped and .midx
552 files will be removed from the list.
554 The instance variable 'ignore_midx' can force this function to
555 always act as if skip_midx was True.
557 if self.bloom is not None:
559 self.bloom = None # Always reopen the bloom as it may have been relaced
560 self.do_bloom = False
561 skip_midx = skip_midx or self.ignore_midx
562 d = dict((p.name, p) for p in self.packs
563 if not skip_midx or not isinstance(p, midx.PackMidx))
564 if os.path.exists(self.dir):
567 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
568 # remove any *.midx files from our list that no longer exist
569 for ix in list(d.values()):
570 if not isinstance(ix, midx.PackMidx):
572 if ix.name in midxes:
577 self.packs.remove(ix)
578 for ix in self.packs:
579 if isinstance(ix, midx.PackMidx):
580 for name in ix.idxnames:
581 d[os.path.join(self.dir, name)] = ix
584 mx = midx.PackMidx(full)
585 (mxd, mxf) = os.path.split(mx.name)
587 for n in mx.idxnames:
588 if not os.path.exists(os.path.join(mxd, n)):
589 log(('warning: index %s missing\n'
591 % (path_msg(n), path_msg(mxf)))
599 midxl.sort(key=lambda ix:
600 (-len(ix), -xstat.stat(ix.name).st_mtime))
603 for sub in ix.idxnames:
604 found = d.get(os.path.join(self.dir, sub))
605 if not found or isinstance(found, PackIdx):
606 # doesn't exist, or exists but not in a midx
611 for name in ix.idxnames:
612 d[os.path.join(self.dir, name)] = ix
613 elif not ix.force_keep:
614 debug1('midx: removing redundant: %s\n'
615 % path_msg(os.path.basename(ix.name)))
618 for full in glob.glob(os.path.join(self.dir, b'*.idx')):
622 except GitError as e:
626 bfull = os.path.join(self.dir, b'bup.bloom')
627 if self.bloom is None and os.path.exists(bfull):
628 self.bloom = bloom.ShaBloom(bfull)
629 self.packs = list(set(d.values()))
630 self.packs.sort(reverse=True, key=lambda x: len(x))
631 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
635 debug1('PackIdxList: using %d index%s.\n'
636 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
639 """Insert an additional object in the list."""
def open_idx(filename):
    """Open a pack index by name and return the matching reader object.

    Returns a PackIdxV2 or PackIdxV1 for *.idx files (depending on the
    header), or a midx.PackMidx for *.midx files.  Raises GitError for
    unrecognized headers, unsupported versions, or other extensions.
    """
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == b'\377tOc':
            # Version >= 2 indexes start with this magic; v1 has no header.
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList.

    Progress is reported while reading; the final "done" line is only
    printed when final_progress is true.
    """
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def create_commit_blob(tree, parent,
                       author, adate_sec, adate_tz,
                       committer, cdate_sec, cdate_tz,
                       msg):
    """Return the raw bytes of a git commit object.

    The date_sec values are epoch seconds; when the matching tz value
    is None the local timezone offset is used.  Falsy tree/parent/
    author/committer values cause the corresponding header line to be
    omitted.
    """
    if adate_tz is not None:
        adate_str = _git_date_str(adate_sec, adate_tz)
    else:
        adate_str = _local_git_date_str(adate_sec)
    if cdate_tz is not None:
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
    else:
        cdate_str = _local_git_date_str(cdate_sec)
    l = []
    if tree: l.append(b'tree %s' % hexlify(tree))
    if parent: l.append(b'parent %s' % hexlify(parent))
    if author: l.append(b'author %s %s' % (author, adate_str))
    if committer: l.append(b'committer %s %s' % (committer, cdate_str))
    l.append(b'')
    l.append(msg)
    return b'\n'.join(l)
699 def _make_objcache():
700 return PackIdxList(repo(b'objects/pack'))
702 # bup-gc assumes that it can disable all PackWriter activities
703 # (bloom/midx/cache) via the constructor and close() arguments.
706 """Writes Git objects inside a pack file."""
707 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
708 run_midx=True, on_pack_finish=None,
709 max_pack_size=None, max_pack_objects=None, repo_dir=None):
710 self.repo_dir = repo_dir or repo()
717 self.objcache_maker = objcache_maker
719 self.compression_level = compression_level
720 self.run_midx=run_midx
721 self.on_pack_finish = on_pack_finish
722 if not max_pack_size:
723 max_pack_size = git_config_get(b'pack.packSizeLimit',
724 repo_dir=self.repo_dir,
726 if not max_pack_size:
727 # larger packs slow down pruning
728 max_pack_size = 1000 * 1000 * 1000
729 self.max_pack_size = max_pack_size
730 # cache memory usage is about 83 bytes per object
731 self.max_pack_objects = max_pack_objects if max_pack_objects \
732 else max(1, self.max_pack_size // 5000)
740 def __exit__(self, type, value, traceback):
745 objdir = dir = os.path.join(self.repo_dir, b'objects')
746 fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
748 self.file = os.fdopen(fd, 'w+b')
753 self.parentfd = os.open(objdir, os.O_RDONLY)
759 assert name.endswith(b'.pack')
760 self.filename = name[:-5]
761 self.file.write(b'PACK\0\0\0\2\0\0\0\0')
762 self.idx = PackIdxV2Writer()
764 def _raw_write(self, datalist, sha):
767 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
768 # the file never has a *partial* blob. So let's make sure it's
769 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
770 # to our hashsplit algorithm.) f.write() does its own buffering,
771 # but that's okay because we'll flush it in _end().
772 oneblob = b''.join(datalist)
778 crc = zlib.crc32(oneblob) & 0xffffffff
779 self._update_idx(sha, crc, nw)
784 def _update_idx(self, sha, crc, size):
787 self.idx.add(sha, crc, self.file.tell() - size)
789 def _write(self, sha, type, content):
793 sha = calc_hash(type, content)
794 size, crc = self._raw_write(_encode_packobj(type, content,
795 self.compression_level),
797 if self.outbytes >= self.max_pack_size \
798 or self.count >= self.max_pack_objects:
802 def breakpoint(self):
803 """Clear byte and object counts and return the last processed id."""
804 id = self._end(self.run_midx)
805 self.outbytes = self.count = 0
808 def _require_objcache(self):
809 if self.objcache is None and self.objcache_maker:
810 self.objcache = self.objcache_maker()
811 if self.objcache is None:
813 "PackWriter not opened or can't check exists w/o objcache")
815 def exists(self, id, want_source=False):
816 """Return non-empty if an object is found in the object cache."""
817 self._require_objcache()
818 return self.objcache.exists(id, want_source=want_source)
820 def just_write(self, sha, type, content):
821 """Write an object to the pack file without checking for duplication."""
822 self._write(sha, type, content)
823 # If nothing else, gc doesn't have/want an objcache
824 if self.objcache is not None:
825 self.objcache.add(sha)
827 def maybe_write(self, type, content):
828 """Write an object to the pack file if not present and return its id."""
829 sha = calc_hash(type, content)
830 if not self.exists(sha):
831 self._require_objcache()
832 self.just_write(sha, type, content)
835 def new_blob(self, blob):
836 """Create a blob object in the pack with the supplied content."""
837 return self.maybe_write(b'blob', blob)
839 def new_tree(self, shalist):
840 """Create a tree object in the pack."""
841 content = tree_encode(shalist)
842 return self.maybe_write(b'tree', content)
844 def new_commit(self, tree, parent,
845 author, adate_sec, adate_tz,
846 committer, cdate_sec, cdate_tz,
848 """Create a commit object in the pack. The date_sec values must be
849 epoch-seconds, and if a tz is None, the local timezone is assumed."""
850 content = create_commit_blob(tree, parent,
851 author, adate_sec, adate_tz,
852 committer, cdate_sec, cdate_tz,
854 return self.maybe_write(b'commit', content)
857 """Remove the pack file from disk."""
866 os.unlink(self.filename + b'.pack')
873 def _end(self, run_midx=True):
875 if not f: return None
882 # update object count
884 cp = struct.pack('!i', self.count)
888 # calculate the pack sha1sum
891 for b in chunkyreader(f):
893 packbin = sum.digest()
895 fdatasync(f.fileno())
899 idx.write(self.filename + b'.idx', packbin)
900 nameprefix = os.path.join(self.repo_dir,
901 b'objects/pack/pack-' + hexlify(packbin))
902 if os.path.exists(self.filename + b'.map'):
903 os.unlink(self.filename + b'.map')
904 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
905 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
907 os.fsync(self.parentfd)
909 os.close(self.parentfd)
912 auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
914 if self.on_pack_finish:
915 self.on_pack_finish(nameprefix)
919 def close(self, run_midx=True):
920 """Close the pack file and move it to its definitive path."""
921 return self._end(run_midx=run_midx)
924 class PackIdxV2Writer:
926 self.idx = list(list() for i in range(256))
929 def add(self, sha, crc, offs):
932 self.idx[byte_int(sha[0])].append((sha, crc, offs))
934 def write(self, filename, packbin):
936 for section in self.idx:
937 for entry in section:
938 if entry[2] >= 2**31:
941 # Length: header + fan-out + shas-and-crcs + overflow-offsets
942 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
944 idx_f = open(filename, 'w+b')
946 idx_f.truncate(index_len)
947 fdatasync(idx_f.fileno())
948 idx_map = mmap_readwrite(idx_f, close=False)
950 count = _helpers.write_idx(filename, idx_map, self.idx,
952 assert(count == self.count)
959 idx_f = open(filename, 'a+b')
964 b = idx_f.read(8 + 4*256)
967 for b in chunkyreader(idx_f, 20 * self.count):
970 for b in chunkyreader(idx_f):
972 idx_f.write(idx_sum.digest())
973 fdatasync(idx_f.fileno())
978 def list_refs(patterns=None, repo_dir=None,
979 limit_to_heads=False, limit_to_tags=False):
980 """Yield (refname, hash) tuples for all repository refs unless
981 patterns are specified. In that case, only include tuples for
982 refs matching those patterns (cf. git-show-ref(1)). The limits
983 restrict the result items to refs/heads or refs/tags. If both
984 limits are specified, items from both sources will be included.
987 argv = [b'git', b'show-ref']
989 argv.append(b'--heads')
991 argv.append(b'--tags')
994 argv.extend(patterns)
995 p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
997 out = p.stdout.read().strip()
998 rv = p.wait() # not fatal
1002 for d in out.split(b'\n'):
1003 sha, name = d.split(b' ', 1)
1004 yield name, unhexlify(sha)
1007 def read_ref(refname, repo_dir = None):
1008 """Get the commit id of the most recent commit made on a given ref."""
1009 refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
1010 l = tuple(islice(refs, 2))
def rev_list_invocation(ref_or_refs, format=None):
    """Return the git rev-list argv for the given ref or refs.

    A single bytes ref is treated as a one-element sequence.  Refs may
    not start with b'-' so they cannot be mistaken for options.
    """
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    argv = [b'git', b'rev-list']
    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    return argv
1034 def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
1035 """Yield information about commits as per "git rev-list". If a format
1036 is not provided, yield one hex hash at a time. If a format is
1037 provided, pass it to rev-list and call parse(git_stdout) for each
1038 commit with the stream positioned just after the rev-list "commit
1039 HASH" header line. When a format is provided yield (oidx,
1040 parse(git_stdout)) for each commit.
1043 assert bool(parse) == bool(format)
1044 p = subprocess.Popen(rev_list_invocation(ref_or_refs,
1046 env=_gitenv(repo_dir),
1047 stdout = subprocess.PIPE,
1050 for line in p.stdout:
1053 line = p.stdout.readline()
1056 if not s.startswith(b'commit '):
1057 raise Exception('unexpected line ' + repr(s))
1060 yield s, parse(p.stdout)
1061 line = p.stdout.readline()
1063 rv = p.wait() # not fatal
1065 raise GitError('git rev-list returned error %d' % rv)
1068 def rev_parse(committish, repo_dir=None):
1069 """Resolve the full hash for 'committish', if it exists.
1071 Should be roughly equivalent to 'git rev-parse'.
1073 Returns the hex value of the hash if it is found, None if 'committish' does
1074 not correspond to anything.
1076 head = read_ref(committish, repo_dir=repo_dir)
1078 debug2("resolved from ref: commit = %s\n" % hexlify(head))
1081 pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))
1083 if len(committish) == 40:
1085 hash = unhexlify(committish)
1095 def update_ref(refname, newval, oldval, repo_dir=None):
1096 """Update a repository reference."""
1099 assert refname.startswith(b'refs/heads/') \
1100 or refname.startswith(b'refs/tags/')
1101 p = subprocess.Popen([b'git', b'update-ref', refname,
1102 hexlify(newval), hexlify(oldval)],
1103 env=_gitenv(repo_dir),
1105 _git_wait(b'git update-ref', p)
1108 def delete_ref(refname, oldvalue=None):
1109 """Delete a repository reference (see git update-ref(1))."""
1110 assert refname.startswith(b'refs/')
1111 oldvalue = [] if not oldvalue else [oldvalue]
1112 p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
1115 _git_wait('git update-ref', p)
1118 def guess_repo(path=None):
1119 """Set the path value in the global variable "repodir".
1120 This makes bup look for an existing bup repository, but not fail if a
1121 repository doesn't exist. Usually, if you are interacting with a bup
1122 repository, you would not be calling this function but using
1123 check_repo_or_die().
1129 repodir = environ.get(b'BUP_DIR')
1131 repodir = os.path.expanduser(b'~/.bup')
1134 def init_repo(path=None):
1135 """Create the Git bare repository for bup in a given path."""
1137 d = repo() # appends a / to the path
1138 parent = os.path.dirname(os.path.dirname(d))
1139 if parent and not os.path.exists(parent):
1140 raise GitError('parent directory "%s" does not exist\n'
1142 if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
1143 raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
1144 p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
1147 _git_wait('git init', p)
1148 # Force the index version configuration in order to ensure bup works
1149 # regardless of the version of the installed Git binary.
1150 p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
1151 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1152 _git_wait('git config', p)
1154 p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
1155 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1156 _git_wait('git config', p)
1159 def check_repo_or_die(path=None):
1160 """Check to see if a bup repository probably exists, and abort if not."""
1163 pst = stat_if_exists(top + b'/objects/pack')
1164 if pst and stat.S_ISDIR(pst.st_mode):
1167 top_st = stat_if_exists(top)
1169 log('error: repository %r does not exist (see "bup help init")\n'
1172 log('error: %s is not a repository\n' % path_msg(top))
def is_suitable_git(ver_str):
    """Classify a 'git --version' output line.

    Returns 'suitable', 'insufficient' (older than 1.5.6), or
    'unrecognized' when the string doesn't look like git version
    output at all.
    """
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    sys.exit(1)
1198 def require_suitable_git(ver_str=None):
1199 """Raise GitError if the version of git isn't suitable.
1201 Rely on ver_str when provided, rather than invoking the git in the
1206 if _git_great is not None:
1208 if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
1209 in (b'yes', b'true', b'1'):
1213 ver_str, _, _ = _git_exo([b'git', b'--version'])
1214 status = is_suitable_git(ver_str)
1215 if status == 'unrecognized':
1216 raise GitError('Unexpected git --version output: %r' % ver_str)
1217 if status == 'insufficient':
1218 log('error: git version must be at least 1.5.6\n')
1220 if status == 'suitable':
1226 class _AbortableIter:
1227 def __init__(self, it, onabort = None):
1229 self.onabort = onabort
1237 return next(self.it)
1238 except StopIteration as e:
1248 """Abort iteration and call the abortion callback, if needed."""
1259 """Link to 'git cat-file' that is used to retrieve blob data."""
1260 def __init__(self, repo_dir = None):
1261 require_suitable_git()
1262 self.repo_dir = repo_dir
1263 self.p = self.inprogress = None
1265 def close(self, wait=False):
1271 self.inprogress = None
1278 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1279 stdin=subprocess.PIPE,
1280 stdout=subprocess.PIPE,
1283 env=_gitenv(self.repo_dir))
1286 """Yield (oidx, type, size), followed by the data referred to by ref.
1287 If ref does not exist, only yield (None, None, None).
1290 if not self.p or self.p.poll() != None:
1293 poll_result = self.p.poll()
1294 assert(poll_result == None)
1296 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1297 assert(not self.inprogress)
1298 assert ref.find(b'\n') < 0
1299 assert ref.find(b'\r') < 0
1300 assert not ref.startswith(b'-')
1301 self.inprogress = ref
1302 self.p.stdin.write(ref + b'\n')
1303 self.p.stdin.flush()
1304 hdr = self.p.stdout.readline()
1305 if hdr.endswith(b' missing\n'):
1306 self.inprogress = None
1307 yield None, None, None
1309 info = hdr.split(b' ')
1310 if len(info) != 3 or len(info[0]) != 40:
1311 raise GitError('expected object (id, type, size), got %r' % info)
1312 oidx, typ, size = info
1314 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1317 yield oidx, typ, size
1320 readline_result = self.p.stdout.readline()
1321 assert readline_result == b'\n'
1322 self.inprogress = None
1323 except Exception as e:
1327 def _join(self, it):
1328 _, typ, _ = next(it)
1332 elif typ == b'tree':
1333 treefile = b''.join(it)
1334 for (mode, name, sha) in tree_decode(treefile):
1335 for blob in self.join(hexlify(sha)):
1337 elif typ == b'commit':
1338 treeline = b''.join(it).split(b'\n')[0]
1339 assert treeline.startswith(b'tree ')
1340 for blob in self.join(treeline[5:]):
1343 raise GitError('invalid object type %r: expected blob/tree/commit'
1347 """Generate a list of the content of all blobs that can be reached
1348 from an object. The hash given in 'id' must point to a blob, a tree
1349 or a commit. The content of all blobs that can be seen from trees or
1350 commits will be added to the list.
1352 for d in self._join(self.get(id)):
1358 def cp(repo_dir=None):
1359 """Create a CatPipe object or reuse the already existing one."""
1362 repo_dir = repodir or repo()
1363 repo_dir = os.path.abspath(repo_dir)
1364 cp = _cp.get(repo_dir)
1366 cp = CatPipe(repo_dir)
1371 def close_catpipes():
1372 # FIXME: chain exceptions
1374 _, cp = _cp.popitem()
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[10:]  # strip the b'refs/tags/' prefix
        if not c in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags
class MissingObject(KeyError):
    """Raised when a repository object id cannot be found."""
    def __init__(self, oid):
        self.oid = oid  # binary (not hex) object id
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
1396 WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
1397 'path', 'chunk_path', 'data'])
1398 # The path is the mangled path, and if an item represents a fragment
1399 # of a chunked file, the chunk_path will be the chunked subtree path
1400 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1401 # chunked file will have a chunk_path of ['']. So some chunk subtree
1402 # of the file '/foo/bar/baz' might look like this:
1404 # item.path = ['foo', 'bar', 'baz.bup']
1405 # item.chunk_path = ['', '2d3115e', '016b097']
1406 # item.type = 'tree'
1410 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1411 """Yield everything reachable from oidx via get_ref (which must behave
1412 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1413 returns true. Throw MissingObject if a hash encountered is
1414 missing from the repository, and don't read or return blob content
1415 in the data field unless include_data is set.
1418 # Maintain the pending stack on the heap to avoid stack overflow
1419 pending = [(oidx, [], [], None)]
1421 oidx, parent_path, chunk_path, mode = pending.pop()
1422 oid = unhexlify(oidx)
1423 if stop_at and stop_at(oidx):
1426 if (not include_data) and mode and stat.S_ISREG(mode):
1427 # If the object is a "regular file", then it's a leaf in
1428 # the graph, so we can skip reading the data if the caller
1429 # hasn't requested it.
1430 yield WalkItem(oid=oid, type=b'blob',
1431 chunk_path=chunk_path, path=parent_path,
1436 item_it = get_ref(oidx)
1437 get_oidx, typ, _ = next(item_it)
1439 raise MissingObject(unhexlify(oidx))
1440 if typ not in (b'blob', b'commit', b'tree'):
1441 raise Exception('unexpected repository object type %r' % typ)
1443 # FIXME: set the mode based on the type when the mode is None
1444 if typ == b'blob' and not include_data:
1445 # Dump data until we can ask cat_pipe not to fetch it
1446 for ignored in item_it:
1450 data = b''.join(item_it)
1452 yield WalkItem(oid=oid, type=typ,
1453 chunk_path=chunk_path, path=parent_path,
1455 data=(data if include_data else None))
1457 if typ == b'commit':
1458 commit_items = parse_commit(data)
1459 for pid in commit_items.parents:
1460 pending.append((pid, parent_path, chunk_path, mode))
1461 pending.append((commit_items.tree, parent_path, chunk_path,
1462 hashsplit.GIT_MODE_TREE))
1463 elif typ == b'tree':
1464 for mode, name, ent_id in tree_decode(data):
1465 demangled, bup_type = demangle_name(name, mode)
1467 sub_path = parent_path
1468 sub_chunk_path = chunk_path + [name]
1470 sub_path = parent_path + [name]
1471 if bup_type == BUP_CHUNKED:
1472 sub_chunk_path = [b'']
1474 sub_chunk_path = chunk_path
1475 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,