1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
12 from numbers import Integral
14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
15 from bup.compat import (buffer,
16 byte_int, bytes_from_byte, bytes_from_uint,
21 from bup.io import path_msg
22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
25 hostname, localtime, log,
28 mmap_read, mmap_readwrite,
30 progress, qprogress, stat_if_exists,
33 from bup.pwdgrp import username, userfullname


repodir = None  # The default repository, once initialized

_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}

_total_searches = 0
_total_steps = 0


class GitError(Exception):
    pass


def _gitenv(repo_dir=None):
    if not repo_dir:
        repo_dir = repo()
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})


def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%r returned %d' % (cmd, rv))


def _git_exo(cmd, **kwargs):
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    _, _, proc = result
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
    return result


def git_config_get(option, repo_dir=None):
    cmd = (b'git', b'config', b'--get', option)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         env=_gitenv(repo_dir=repo_dir),
                         close_fds=True)
    r = p.stdout.read()
    rc = p.wait()
    if rc == 0:
        return r
    elif rc != 1:
        raise GitError('%r returned %d' % (cmd, rc))
    return None


def parse_tz_offset(s):
    """UTC offset in seconds."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        return - tz_off
    return tz_off
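
# For example (an illustrative sketch of the tz strings matched below):
#   parse_tz_offset(b'+0130') ->  5400   (1.5 hours east of UTC)
#   parse_tz_offset(b'-0500') -> -18000  (5 hours west of UTC)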


# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])

def parse_commit(content):
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
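
# Illustrative sketch (synthetic commit text; the tree id is a fake
# placeholder and the names are hypothetical):
def _example_parse_commit():
    text = (b'tree ' + b'1' * 40 + b'\n'
            + b'author A U Thor <a@example.com> 1234567890 +0000\n'
            + b'committer A U Thor <a@example.com> 1234567890 -0500\n'
            + b'\n'
            + b'Example message\n')
    ci = parse_commit(text)
    assert ci.tree == b'1' * 40
    assert ci.author_sec == 1234567890
    assert ci.committer_offset == -18000  # seconds, via parse_tz_offset
    return ci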


def get_cat_data(cat_iterator, expected_type):
    _, kind, _ = next(cat_iterator)
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    return b''.join(cat_iterator)


def get_commit_items(id, cp):
    return parse_commit(get_cat_data(cp.get(id), b'commit'))


def _local_git_date_str(epoch_sec):
    return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))


def _git_date_str(epoch_sec, tz_offset_sec):
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        % (epoch_sec,
           b'+' if offs >= 0 else b'-',
           abs(offs) // 60,
           abs(offs) % 60)
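
# Worked example (sketch): _git_date_str(1234567890, -18000) yields
# b'1234567890 -0500', and _git_date_str(1234567890, 5400) yields
# b'1234567890 +0130'.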


def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
        repo_dir = gd

    return os.path.join(repo_dir, sub)


_shorten_hash_rx = \
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')

def shorten_hash(s):
    return _shorten_hash_rx.sub(br'\1\2*\3', s)
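
# E.g. (sketch): a 40-digit hex id embedded in a message is displayed as
# its first seven digits plus '*'.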


def repo_rel(path):
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
        fullrepo += b'/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)


def auto_midx(objdir):
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 1
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 1
    if rv:
        add_error('%r: returned %d' % (args, rv))


def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    else:
        return name


(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of
    the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    else:
        return (name, BUP_NORMAL)
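
# Round-trip sketch (0o40000 is git's tree mode, used when a regular file
# has been stored chunked as a subtree):
#   mangle_name(b'foo', 0o100644, 0o40000)      -> b'foo.bup'
#   demangle_name(b'foo.bup', 0o40000)          -> (b'foo', BUP_CHUNKED)
#   mangle_name(b'foo.bup', 0o100644, 0o100644) -> b'foo.bup.bupl'
#   demangle_name(b'foo.bup.bupl', 0o100644)    -> (b'foo.bup', BUP_NORMAL)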


def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = b'%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
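
# Sanity-check sketch: calc_hash(b'blob', b'hello\n') equals
# unhexlify(b'ce013625030ba8dba906f756967f9e9ca394464a'), the same id
# `git hash-object` assigns to a blob containing "hello\n".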


def shalist_item_sort_key(ent):
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
        return name + b'/'
    else:
        return name


def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode,name,bin)
        # Under Python 3, s[0] is an int, so test a prefix, not s[0] == b'0'.
        assert not s.startswith(b'0')  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return b''.join(l)


def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        mode,name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)
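
# Round-trip sketch (the 20-byte id here is fake, for illustration only):
def _example_tree_roundtrip():
    fake_oid = b'\xaa' * 20
    buf = tree_encode([(0o100644, b'file', fake_oid),
                       (0o40000, b'dir', fake_oid)])
    # Git sort order compares b'dir' as b'dir/', so the tree lists it first.
    return list(tree_decode(buf))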


def _encode_packobj(type, content, compression_level=1):
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = b''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += bytes_from_uint(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()


def _encode_looseobj(type, content, compression_level=1):
    z = zlib.compressobj(compression_level)
    yield z.compress(b'%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()


def _decode_looseobj(buf):
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find(b'\0')
    assert(i > 0)
    l = s[:i].split(b' ')
    assert(len(l) == 2)
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)


def _decode_packobj(buf):
    assert(buf)
    c = byte_int(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = byte_int(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
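
# Round-trip sketch: the header varint carries the type and size; the
# payload is just zlib-compressed content.
def _example_packobj_roundtrip():
    data = b'x' * 1000
    encoded = b''.join(_encode_packobj(b'blob', data))
    assert _decode_packobj(encoded) == (b'blob', data)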


class PackIdx:
    def __init__(self):
        assert(0)

    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx != None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True
        return None

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        want = hash
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end - start) // 2
            v = self._idx_to_hash(mid)
            if v < want:
                start = mid+1
            elif v > want:
                end = mid
            else: # got it!
                return mid
        return None


class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'.
        # unpack_from() reads exactly 256 entries from the (larger) mmap.
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=0))
        self.fanout.append(0)  # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
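
    # On-disk v1 layout assumed above: 256 big-endian uint32 fanout
    # entries, then nsha records of (uint32 pack offset, 20-byte sha),
    # then two trailing 20-byte checksums.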

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None


class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
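
    # On-disk v2 layout assumed above: an 8-byte header (magic + version),
    # 256 big-endian uint32 fanout entries, nsha 20-byte shas, nsha uint32
    # crc32s, nsha uint32 offsets (a set high bit means the low 31 bits
    # index the following table of 8-byte offsets), then trailing checksums.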

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
        return ofs

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None


_mpi_count = 0
class PackIdxList:
    def __init__(self, dir, ignore_midx=False):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None  # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
642 """Insert an additional object in the list."""


def open_idx(filename):
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')
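
# Usage sketch (the path is hypothetical):
#   ix = open_idx(b'/repo/objects/pack/pack-deadbee.idx')
#   if ix.exists(some_20_byte_oid):
#       offset = ix.find_offset(some_20_byte_oid)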


def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)


def _make_objcache():
    return PackIdxList(repo(b'objects/pack'))

# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.
687 """Writes Git objects inside a pack file."""
688 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
689 run_midx=True, on_pack_finish=None,
690 max_pack_size=None, max_pack_objects=None, repo_dir=None):
691 self.repo_dir = repo_dir or repo()
698 self.objcache_maker = objcache_maker
700 self.compression_level = compression_level
701 self.run_midx=run_midx
702 self.on_pack_finish = on_pack_finish
703 if not max_pack_size:
704 max_pack_size = git_config_get(b'pack.packSizeLimit',
705 repo_dir=self.repo_dir)
706 if max_pack_size is not None:
707 max_pack_size = parse_num(max_pack_size)
708 if not max_pack_size:
709 # larger packs slow down pruning
710 max_pack_size = 1000 * 1000 * 1000
711 self.max_pack_size = max_pack_size
712 # cache memory usage is about 83 bytes per object
713 self.max_pack_objects = max_pack_objects if max_pack_objects \
714 else max(1, self.max_pack_size // 5000)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def _open(self):
        if not self.file:
            objdir = dir = os.path.join(self.repo_dir, b'objects')
            fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                os.close(fd)
                raise
            try:
                self.parentfd = os.open(objdir, os.O_RDONLY)
            except:
                f = self.file
                self.file = None
                f.close()
                raise
            assert name.endswith(b'.pack')
            self.filename = name[:-5]
            self.file.write(b'PACK\0\0\0\2\0\0\0\0')
            self.idx = PackIdxV2Writer()

    def _raw_write(self, datalist, sha):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            reraise(GitError(e))
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        assert(sha)
        if self.idx:
            self.idx.add(sha, crc, self.file.tell() - size)

    def _write(self, sha, type, content):
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def just_write(self, sha, type, content):
        """Write an object to the pack file without checking for duplication."""
        self._write(sha, type, content)
        # If nothing else, gc doesn't have/want an objcache
        if self.objcache is not None:
            self.objcache.add(sha)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write(b'blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write(b'tree', content)

    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
                   msg):
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        if adate_tz is not None:
            adate_str = _git_date_str(adate_sec, adate_tz)
        else:
            adate_str = _local_git_date_str(adate_sec)
        if cdate_tz is not None:
            cdate_str = _git_date_str(cdate_sec, cdate_tz)
        else:
            cdate_str = _local_git_date_str(cdate_sec)
        l = []
        if tree: l.append(b'tree %s' % hexlify(tree))
        if parent: l.append(b'parent %s' % hexlify(parent))
        if author: l.append(b'author %s %s' % (author, adate_str))
        if committer: l.append(b'committer %s %s' % (committer, cdate_str))
        l.append(b'')
        l.append(msg or b'')
        return self.maybe_write(b'commit', b'\n'.join(l))
850 """Remove the pack file from disk."""
859 os.unlink(self.filename + b'.pack')

    def _end(self, run_midx=True):
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None
        idx = self.idx
        self.idx = None
        try:
            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        obj_list_sha = idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' + obj_list_sha)
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)
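

# Usage sketch (assumes check_repo_or_die() has succeeded; the author and
# committer strings and the timestamp are illustrative placeholders):
def _example_pack_writer():
    with PackWriter() as w:
        blob_id = w.new_blob(b'content')
        tree_id = w.new_tree([(0o100644, b'file', blob_id)])
        return w.new_commit(tree_id, None,
                            b'A U Thor <a@example.com>', 1234567890, 0,
                            b'A U Thor <a@example.com>', 1234567890, 0,
                            b'example commit')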


class PackIdxV2Writer:
    def __init__(self):
        self.idx = list(list() for i in range(256))
        self.count = 0

    def add(self, sha, crc, offs):
        assert(sha)
        self.count += 1
        self.idx[byte_int(sha[0])].append((sha, crc, offs))

    def write(self, filename, packbin):
        ofs64_count = 0
        for section in self.idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, self.idx,
                                           self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20 * self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = hexlify(obj_list_sum.digest())

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()


def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.

    """
    argv = [b'git', b'show-ref']
    if limit_to_heads:
        argv.append(b'--heads')
    if limit_to_tags:
        argv.append(b'--tags')
    argv.append(b'--')
    if patterns:
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
                         close_fds=True)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split(b'\n'):
            sha, name = d.split(b' ', 1)
            yield name, unhexlify(sha)
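
# Usage sketch: enumerate the local branch heads.
#   for name, oid in list_refs(limit_to_heads=True):
#       print(path_msg(name), hexlify(oid))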


def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    return None


def rev_list_invocation(ref_or_refs, format=None):
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    argv = [b'git', b'rev-list']
    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    return argv


def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.

    """
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                                             format=format),
                         env=_gitenv(repo_dir),
                         stdout = subprocess.PIPE,
                         close_fds=True)
    if not format:
        for line in p.stdout:
            yield line.strip()
    else:
        line = p.stdout.readline()
        while line:
            s = line.strip()
            if not s.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(s))
            s = s[7:]
            assert len(s) == 40
            yield s, parse(p.stdout)
            line = p.stdout.readline()

    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)


def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs. For now, every unique
       string in refs must resolve to a different commit or this
       function will fail."""
    result = []
    for ref in refs:
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
    return result


def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            hash = unhexlify(committish)
        except TypeError:
            return None

        if pL.exists(hash):
            return hash

    return None


def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    if not oldval:
        oldval = b''
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir),
                         close_fds=True)
    _git_wait(b'git update-ref', p)


def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git update-ref', p)
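
# Usage sketch (hypothetical branch): move refs/heads/example to new_oid
# (a 20-byte binary id), failing if the ref changed in the meantime:
#   old_oid = read_ref(b'refs/heads/example')
#   update_ref(b'refs/heads/example', new_oid, old_oid)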


def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die()."""
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = environ.get(b'BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser(b'~/.bup')


def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)


def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % top)
            sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)


def is_suitable_git(ver_str):
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    sys.exit(13)


_git_great = None

def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.

    """
    global _git_great
    if _git_great is not None:
        return
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    if status == 'suitable':
        _git_great = True
        return
    assert False


class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.it)
        except StopIteration as e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    next = __next__

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()
1267 """Link to 'git cat-file' that is used to retrieve blob data."""
1268 def __init__(self, repo_dir = None):
1269 require_suitable_git()
1270 self.repo_dir = repo_dir
1271 self.p = self.inprogress = None
1273 def close(self, wait=False):
1279 self.inprogress = None
1286 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1287 stdin=subprocess.PIPE,
1288 stdout=subprocess.PIPE,
1291 env=_gitenv(self.repo_dir))
1294 """Yield (oidx, type, size), followed by the data referred to by ref.
1295 If ref does not exist, only yield (None, None, None).
1298 if not self.p or self.p.poll() != None:
1301 poll_result = self.p.poll()
1302 assert(poll_result == None)
1304 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1305 assert(not self.inprogress)
1306 assert ref.find(b'\n') < 0
1307 assert ref.find(b'\r') < 0
1308 assert not ref.startswith(b'-')
1309 self.inprogress = ref
1310 self.p.stdin.write(ref + b'\n')
1311 self.p.stdin.flush()
1312 hdr = self.p.stdout.readline()
1313 if hdr.endswith(b' missing\n'):
1314 self.inprogress = None
1315 yield None, None, None
1317 info = hdr.split(b' ')
1318 if len(info) != 3 or len(info[0]) != 40:
1319 raise GitError('expected object (id, type, size), got %r' % info)
1320 oidx, typ, size = info
1322 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1325 yield oidx, typ, size
1328 readline_result = self.p.stdout.readline()
1329 assert readline_result == b'\n'
1330 self.inprogress = None
1331 except Exception as e:

    def _join(self, it):
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d


_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp, repodir
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp
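
# Usage sketch (hypothetical ref): fetch and parse a commit through the
# shared CatPipe for the default repository.
#   data = get_cat_data(cp().get(b'refs/heads/example'), b'commit')
#   ci = parse_commit(data)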


def close_catpipes():
    # FIXME: chain exceptions
    while _cp:
        _, cp = _cp.popitem()
        cp.close(wait=True)


def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[len(b'refs/tags/'):]
        if not c in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags


class MissingObject(KeyError):
    def __init__(self, oid):
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))


WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
# chunked file will have a chunk_path of ['']. So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...


def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true. Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))
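
# Usage sketch (hypothetical branch name): enumerate everything reachable
# from a ref without fetching blob contents.
#   oid = rev_parse(b'refs/heads/example')
#   for item in walk_object(cp().get, hexlify(oid)):
#       debug1('%r %r\n' % (item.type, item.path))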