1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.

from __future__ import absolute_import, print_function
import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
from array import array
from binascii import hexlify, unhexlify
from collections import namedtuple
from itertools import islice
from numbers import Integral

from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
from bup.compat import (buffer,
                        byte_int, bytes_from_byte, bytes_from_uint,
                        environ,
                        items,
                        range,
                        reraise)
from bup.io import path_msg
from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
                         exo,
                         fdatasync,
                         hostname, localtime, log,
                         merge_dict,
                         merge_iter,
                         mmap_read, mmap_readwrite,
                         parse_num,
                         progress, qprogress, stat_if_exists,
                         unlink,
                         utc_offset_str)
from bup.pwdgrp import username, userfullname

repodir = None  # The default repository, once initialized

_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}
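
# These numeric values are the object type codes git uses in packfile entry
# headers (commit=1, tree=2, blob=3, tag=4); _encode_packobj() below packs
# them into bits 4-6 of the first header byte.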


class GitError(Exception):
    pass


def _gitenv(repo_dir=None):
    if not repo_dir:
        repo_dir = repo()
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})


def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%r returned %d' % (cmd, rv))


def _git_exo(cmd, **kwargs):
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    _, _, proc = result
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
    return result
69 def git_config_get(option, repo_dir=None):
70 cmd = (b'git', b'config', b'--get', option)
71 p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
72 env=_gitenv(repo_dir=repo_dir),
79 raise GitError('%r returned %d' % (cmd, rc))


def parse_tz_offset(s):
    """UTC offset in seconds."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        return - tz_off
    return tz_off


# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])


def parse_commit(content):
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
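
# For illustration, a commit blob of the following shape (ids abbreviated
# here; real ones are 40 hex digits):
#
#   tree 9528...
#   parent c1b2...
#   author A U Thor <a@example.com> 1288123212 -0500
#   committer C O Mitter <c@example.com> 1288123212 -0500
#
#   Initial import
#
# parses to a CommitInfo with author_sec=1288123212 and
# author_offset=-18000 (b'-0500' as parsed by parse_tz_offset() above).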


def get_cat_data(cat_iterator, expected_type):
    _, kind, _ = next(cat_iterator)
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    return b''.join(cat_iterator)


def get_commit_items(id, cp):
    return parse_commit(get_cat_data(cp.get(id), b'commit'))


def _local_git_date_str(epoch_sec):
    return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))


def _git_date_str(epoch_sec, tz_offset_sec):
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        % (epoch_sec,
           b'+' if offs >= 0 else b'-',
           abs(offs) // 60,
           abs(offs) % 60)
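
# For example:
#   _git_date_str(1234567890, 19800) == b'1234567890 +0530'
#   _git_date_str(0, -3600) == b'0 -0100'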


def repo(sub=b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
        repo_dir = gd
    return os.path.join(repo_dir, sub)


_shorten_hash_rx = \
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')

def shorten_hash(s):
    return _shorten_hash_rx.sub(br'\1\2*\3', s)


def repo_rel(path):
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
        fullrepo += b'/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)


def all_packdirs():
    paths = [repo(b'objects/pack')]
    paths += glob.glob(repo(b'index-cache/*/.'))
    return paths


def auto_midx(objdir):
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 1
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 1
    if rv:
        add_error('%r: returned %d' % (args, rv))


def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    else:
        return name


(BUP_NORMAL, BUP_CHUNKED) = (0, 1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is
    one of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    else:
        return (name, BUP_NORMAL)
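
# So, for example, a chunked file stored as 'foo.bup' demangles to
# (b'foo', BUP_CHUNKED), while a regular file that was really named
# 'foo.bup' is stored as 'foo.bup.bupl' and demangles to
# (b'foo.bup', BUP_NORMAL).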


def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = b'%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
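
# This matches git's object ids, e.g. calc_hash(b'blob', b'') is
# unhexlify(b'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'), the well-known
# id of the empty blob.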


def shalist_item_sort_key(ent):
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
        return name + b'/'
    else:
        return name


def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key=shalist_item_sort_key)
    l = []
    for (mode, name, bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode, name, bin)
        # Compare a one-byte slice; s[0] is an int under Python 3, so the
        # former "s[0] != b'0'" check could never fail there.
        assert s[0:1] != b'0'  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return b''.join(l)


def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        mode, name = spl
        sha = buf[z+1:z+1+20]
        ofs = z + 1 + 20
        yield (int(mode, 8), name, sha)
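
# On the wire each tree entry is b'<octal mode> <name>\0' immediately
# followed by the 20-byte binary object id, with no padding between
# entries; tree_encode() and tree_decode() are inverses of that encoding.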


def _encode_packobj(type, content, compression_level=1):
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = b''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
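    # The header is git's little-endian base-128 varint: bits 4-6 of the
    # first byte hold the type, bits 0-3 the low size bits, and each
    # continuation byte (flagged by 0x80) contributes 7 more size bits.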
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += bytes_from_uint(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()


def _encode_looseobj(type, content, compression_level=1):
    z = zlib.compressobj(compression_level)
    yield z.compress(b'%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()


def _decode_looseobj(buf):
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find(b'\0')
    assert(i > 0)
    l = s[:i].split(b' ')
    assert(len(l) == 2)
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)


def _decode_packobj(buf):
    assert(buf)
    c = byte_int(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = byte_int(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
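
# _decode_packobj() inverts _encode_packobj(), e.g.:
#   _decode_packobj(b''.join(_encode_packobj(b'blob', b'x'))) == (b'blob', b'x')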


_total_searches = 0
_total_steps = 0


class PackIdx:
    def __init__(self):
        assert(0)

    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx != None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True
        return None

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1]  # range -1..254
        end = self.fanout[b1]  # range 0..255
        while start < end:
            _total_steps += 1  # lookup table is a step
            mid = start + (end - start) // 2
            v = self._idx_to_hash(mid)
            if v < hash:
                start = mid + 1
            elif v > hash:
                end = mid
            else:  # got it!
                return mid
        return None


class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I';
        # unpack_from avoids requiring the buffer to be exactly 1024 bytes.
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=0))
        self.fanout.append(0)  # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None


class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
        return ofs

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None


_mpi_count = 0


class PackIdxList:
    def __init__(self, dir, ignore_midx=False):
        global _mpi_count
        assert(_mpi_count == 0)  # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx=False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of
        their contents are in another, bigger .midx file) and removes the
        superseded files.

        If skip_midx is True, all work on .midx files will be skipped and
        .midx files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None  # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs) != 1 and 'es' or ''))
648 """Insert an additional object in the list."""


def open_idx(filename):
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')


def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)


def _make_objcache():
    return PackIdxList(repo(b'objects/pack'))


# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.
class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.repo_dir = repo_dir or repo()
        self.file = None
        self.parentfd = None
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.idx = None
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level
        self.run_midx = run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir)
            if max_pack_size is not None:
                max_pack_size = parse_num(max_pack_size)
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def _open(self):
        if not self.file:
            objdir = dir = os.path.join(self.repo_dir, b'objects')
            fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                os.close(fd)
                raise
            try:
                self.parentfd = os.open(objdir, os.O_RDONLY)
            except:
                f = self.file
                self.file = None
                f.close()
                raise
            assert name.endswith(b'.pack')
            self.filename = name[:-5]
            self.file.write(b'PACK\0\0\0\2\0\0\0\0')
            self.idx = PackIdxV2Writer()

    def _raw_write(self, datalist, sha):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            reraise(GitError(e))
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        assert(sha)
        if self.idx:
            self.idx.add(sha, crc, self.file.tell() - size)

    def _write(self, sha, type, content):
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def just_write(self, sha, type, content):
        """Write an object to the pack file without checking for duplication."""
        self._write(sha, type, content)
        # If nothing else, gc doesn't have/want an objcache
        if self.objcache is not None:
            self.objcache.add(sha)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write(b'blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write(b'tree', content)

    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
                   msg):
        """Create a commit object in the pack.  The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        if adate_tz is not None:
            adate_str = _git_date_str(adate_sec, adate_tz)
        else:
            adate_str = _local_git_date_str(adate_sec)
        if cdate_tz is not None:
            cdate_str = _git_date_str(cdate_sec, cdate_tz)
        else:
            cdate_str = _local_git_date_str(cdate_sec)
        l = []
        if tree: l.append(b'tree %s' % hexlify(tree))
        if parent: l.append(b'parent %s' % hexlify(parent))
        if author: l.append(b'author %s %s' % (author, adate_str))
        if committer: l.append(b'committer %s %s' % (committer, cdate_str))
        l.append(b'')
        l.append(msg)
        return self.maybe_write(b'commit', b'\n'.join(l))
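
    # The text assembled above is exactly the layout _commit_rx expects:
    # 'tree <hex>', zero or more 'parent <hex>' lines, the author and
    # committer lines, a blank line, and then the message.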
856 """Remove the pack file from disk."""
865 os.unlink(self.filename + b'.pack')

    def _end(self, run_midx=True):
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None
        idx = self.idx
        self.idx = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        for b in chunkyreader(f):
            sum.update(b)
        packbin = sum.digest()
        f.write(packbin)
        fdatasync(f.fileno())
        f.close()

        obj_list_sha = idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' + obj_list_sha)
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)


class PackIdxV2Writer:
    def __init__(self):
        self.idx = list(list() for i in range(256))
        self.count = 0

    def add(self, sha, crc, offs):
        assert(sha)
        self.count += 1
        self.idx[byte_int(sha[0])].append((sha, crc, offs))

    def write(self, filename, packbin):
        ofs64_count = 0
        for section in self.idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1
        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, self.idx,
                                           self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20 * self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = hexlify(obj_list_sum.digest())
            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()


def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.

    """
    argv = [b'git', b'show-ref']
    if limit_to_heads:
        argv.append(b'--heads')
    if limit_to_tags:
        argv.append(b'--tags')
    argv.append(b'--')
    if patterns:
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
                         close_fds=True)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split(b'\n'):
            sha, name = d.split(b' ', 1)
            yield name, unhexlify(sha)
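
# For example, list_refs(limit_to_heads=True) yields one
# (b'refs/heads/...', <20-byte binary id>) pair per branch head.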


def read_ref(refname, repo_dir=None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    return None


def rev_list_invocation(ref_or_refs, format=None):
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    argv = [b'git', b'rev-list']
    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    argv.append(b'--')
    return argv


def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".  If a format
    is not provided, yield one hex hash at a time.  If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line.  When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.

    """
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                                             format=format),
                         env=_gitenv(repo_dir),
                         stdout=subprocess.PIPE,
                         close_fds=True)
    if not format:
        for line in p.stdout:
            yield line.strip()
    else:
        line = p.stdout.readline()
        while line:
            s = line.strip()
            if not s.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(s))
            s = s[7:]
            assert len(s) == 40
            yield s, parse(p.stdout)
            line = p.stdout.readline()

    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)


def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs.  For now, every unique
    string in refs must resolve to a different commit or this
    function will fail."""
    result = []
    for ref in refs:
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
    return result


def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish'
    does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            hash = unhexlify(committish)
        except (TypeError, ValueError):
            # Python 2 raises TypeError for bad hex; Python 3 raises
            # binascii.Error, a ValueError subclass.
            return None

        if pL.exists(hash):
            return hash

    return None


def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    if not oldval:
        oldval = b''
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir),
                         close_fds=True)
    _git_wait(b'git update-ref', p)


def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait(b'git update-ref', p)


def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die()."""
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = environ.get(b'BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser(b'~/.bup')


def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    # (b'2', not '2': mixing bytes and str in one argv breaks on Python 3.)
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)


def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % top)
            sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)


def is_suitable_git(ver_str):
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    sys.exit(13)


_git_great = None

def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.

    """
    global _git_great
    if _git_great is not None:
        return
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    if status == 'suitable':
        _git_great = True
        return
    assert False


class _AbortableIter:
    def __init__(self, it, onabort=None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.it)
        except StopIteration as e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    next = __next__

    def abort(self):
        """Abort iteration and call the onabort callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()
1273 """Link to 'git cat-file' that is used to retrieve blob data."""
1274 def __init__(self, repo_dir = None):
1275 require_suitable_git()
1276 self.repo_dir = repo_dir
1277 self.p = self.inprogress = None
1281 self.p.stdout.close()
1282 self.p.stdin.close()
1284 self.inprogress = None
1288 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1289 stdin=subprocess.PIPE,
1290 stdout=subprocess.PIPE,
1293 env=_gitenv(self.repo_dir))
1296 """Yield (oidx, type, size), followed by the data referred to by ref.
1297 If ref does not exist, only yield (None, None, None).
1300 if not self.p or self.p.poll() != None:
1303 poll_result = self.p.poll()
1304 assert(poll_result == None)
1306 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1307 assert(not self.inprogress)
1308 assert ref.find(b'\n') < 0
1309 assert ref.find(b'\r') < 0
1310 assert not ref.startswith(b'-')
1311 self.inprogress = ref
1312 self.p.stdin.write(ref + b'\n')
1313 self.p.stdin.flush()
1314 hdr = self.p.stdout.readline()
1315 if hdr.endswith(b' missing\n'):
1316 self.inprogress = None
1317 yield None, None, None
1319 info = hdr.split(b' ')
1320 if len(info) != 3 or len(info[0]) != 40:
1321 raise GitError('expected object (id, type, size), got %r' % info)
1322 oidx, typ, size = info
1324 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1325 onabort=self._abort)
1327 yield oidx, typ, size
1330 readline_result = self.p.stdout.readline()
1331 assert readline_result == b'\n'
1332 self.inprogress = None
1333 except Exception as e:

    def _join(self, it):
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)
1357 """Generate a list of the content of all blobs that can be reached
1358 from an object. The hash given in 'id' must point to a blob, a tree
1359 or a commit. The content of all blobs that can be seen from trees or
1360 commits will be added to the list.
1362 for d in self._join(self.get(id)):


_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp, repodir
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp


def tags(repo_dir=None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir=repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[len(b'refs/tags/'):]
        if not c in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags


class MissingObject(KeyError):
    def __init__(self, oid):
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))


WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...


def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))