"""Git interaction library.
bup repositories are in Git format. This library allows us to
interact with the Git data structures.
"""

from __future__ import absolute_import, print_function
import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
from array import array
from binascii import hexlify, unhexlify
from collections import namedtuple
from itertools import islice

from bup import _helpers, hashsplit, path, midx, bloom, xstat
from bup.compat import (buffer,
                        byte_int, bytes_from_byte, bytes_from_uint,
                        environ,
                        items,
                        range,
                        reraise)
from bup.io import path_msg
from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
                         exo,
                         fdatasync,
                         log,
                         merge_dict,
                         merge_iter,
                         mmap_read, mmap_readwrite,
                         progress, qprogress, stat_if_exists,
                         unlink,
                         utc_offset_str)

repodir = None  # The default repository, once initialized

_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}

class GitError(Exception):
    pass

def _gitenv(repo_dir=None):
    if not repo_dir:
        repo_dir = repo()
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})

def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%r returned %d' % (cmd, rv))

def _git_exo(cmd, **kwargs):
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    _, _, proc = result
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
    return result

def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
    assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
    cmd = [b'git', b'config', b'--null']
    if cfg_file:
        cmd.extend([b'--file', cfg_file])
    if opttype == 'int':
        cmd.extend([b'--int'])
    elif opttype == 'bool':
        cmd.extend([b'--bool'])
    else:
        assert opttype is None
    cmd.extend([b'--get', option])
    env = None
    if repo_dir:
        env = _gitenv(repo_dir=repo_dir)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
                         close_fds=True)
    # with --null, git writes out a trailing \0 after the value
    r = p.stdout.read()[:-1]
    rc = p.wait()
    if rc == 0:
        if opttype == 'int':
            return int(r)
        elif opttype == 'bool':
            # git converts to 'true' or 'false'
            return r == b'true'
        return r
    if rc != 1:
        raise GitError('%r returned %d' % (cmd, rc))
    return None

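# Usage sketch (illustrative values; assumes the options are actually set
# in the relevant config):
#   git_config_get(b'pack.packSizeLimit', opttype='int')   # e.g. 1000000000
#   git_config_get(b'core.bare', opttype='bool')           # True or False
#   git_config_get(b'no.such.option')                      # None
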
def parse_tz_offset(s):
    """UTC offset in seconds."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        return - tz_off
    return tz_off

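# For example, a git timezone field of b'-0730' is seven hours thirty
# minutes west of UTC:
#   parse_tz_offset(b'-0730') == -((7 * 60 * 60) + (30 * 60)) == -27000
#   parse_tz_offset(b'+0530') == 19800
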
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])

def parse_commit(content):
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])

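# A minimal, illustrative example (the tree id here is just a valid
# 40-char hex string, git's well-known empty tree):
#   info = parse_commit(b'tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\n'
#                       b'author A U Thor <a@example.com> 1234567890 +0000\n'
#                       b'committer A U Thor <a@example.com> 1234567890 +0000\n'
#                       b'\n'
#                       b'hello\n')
#   info.tree       == b'4b825dc642cb6eb9a060e54bf8d69288fbee4904'
#   info.parents    == []
#   info.author_sec == 1234567890
#   info.message    == b'hello\n'
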
def get_cat_data(cat_iterator, expected_type):
    _, kind, _ = next(cat_iterator)
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    return b''.join(cat_iterator)

def get_commit_items(id, cp):
    return parse_commit(get_cat_data(cp.get(id), b'commit'))

def _local_git_date_str(epoch_sec):
    return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))

def _git_date_str(epoch_sec, tz_offset_sec):
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        % (epoch_sec,
           b'+' if offs >= 0 else b'-',
           abs(offs) // 60,
           abs(offs) % 60)

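# For example, 27000 seconds west of UTC renders as b'-0730':
#   _git_date_str(1234567890, -27000) == b'1234567890 -0730'
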
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
        repo_dir = gd

    return os.path.join(repo_dir, sub)

_shorten_hash_rx = \
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')

def shorten_hash(s):
    return _shorten_hash_rx.sub(br'\1\2*\3', s)

def repo_rel(path):
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
        fullrepo += b'/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)

def auto_midx(objdir):
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        raise
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        raise
    if rv:
        add_error('%r: returned %d' % (args, rv))

def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    else:
        return name

(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of
    the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)

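# Round-trip examples (illustrative):
#   mangle_name(b'foo', 0o100644, 0o40000) == b'foo.bup'  # file stored as subtree
#   demangle_name(b'foo.bup', 0o40000)     == (b'foo', BUP_CHUNKED)
#   demangle_name(b'foo.bupl', 0o100644)   == (b'foo', BUP_NORMAL)
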
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = b'%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()

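# This matches git exactly; e.g. the empty blob hashes to git's well-known
# value:
#   hexlify(calc_hash(b'blob', b'')) \
#       == b'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
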
def shalist_item_sort_key(ent):
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
        return name + b'/'
    else:
        return name

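# Git sorts tree entries as if directory names ended with '/', so a file
# b'a.c' sorts before a directory b'a' (b'a.c' < b'a/', since '.' < '/').
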
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode,name,bin)
        # Compare a bytes slice, not s[0] (an int on python 3), so the
        # check isn't vacuously true there.
        assert s[0:1] != b'0'  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return b''.join(l)

def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        mode,name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)

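# tree_encode and tree_decode round-trip; e.g. (illustrative):
#   sha = unhexlify(b'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391')
#   buf = tree_encode([(0o100644, b'empty', sha)])
#   list(tree_decode(buf)) == [(0o100644, b'empty', sha)]
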
def _encode_packobj(type, content, compression_level=1):
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = b''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += bytes_from_uint(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()

def _decode_packobj(buf):
    assert(buf)
    c = byte_int(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = byte_int(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))

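# The header is git's pack object varint: the first byte carries size
# bits 0-3 in its low nibble and the type in bits 4-6; continuation bytes
# each carry 7 more size bits. E.g. a 300-byte blob (type 3, 300 == 0x12c):
#   first byte  0xbc  (0x80 | 3<<4 | (300 & 0xf))
#   second byte 0x12  (300 >> 4)
# so the encoded stream starts b'\xbc\x12' followed by the zlib data.
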
_total_searches = 0
_total_steps = 0

class PackIdx:
    def __init__(self):
        assert(0)

    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx != None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True
        return None

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end - start) // 2
            v = self._idx_to_hash(mid)
            if v < hash:
                start = mid + 1
            elif v > hash:
                end = mid
            else: # got it!
                return mid
        return None

class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'.
        # Use unpack_from so we only read the fanout, not the whole map.
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=0))
        self.fanout.append(0)  # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None

class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

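    # The v2 layout behind the offsets above: an 8-byte header, the 256
    # entry fanout, then nsha 20-byte shas, nsha 4-byte crc32s, nsha
    # 4-byte offsets, and finally the 64-bit offset table, used only for
    # entries whose 32-bit slot has the high bit set (packs over 2 GB).
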
    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
        return ofs

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None

_mpi_count = 0

class PackIdxList:
    def __init__(self, dir, ignore_midx=False):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None  # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # the midx file is gone; drop it from the list
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)

def open_idx(filename):
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')

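# Usage sketch (the path is hypothetical; sha is a 20-byte binary id):
#   ix = open_idx(b'/repo/objects/pack/pack-deadbeef.idx')
#   if ix.exists(sha):
#       ofs = ix.find_offset(sha)
#   ix.close()
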
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)

def create_commit_blob(tree, parent,
                       author, adate_sec, adate_tz,
                       committer, cdate_sec, cdate_tz,
                       msg):
    if adate_tz is not None:
        adate_str = _git_date_str(adate_sec, adate_tz)
    else:
        adate_str = _local_git_date_str(adate_sec)
    if cdate_tz is not None:
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
    else:
        cdate_str = _local_git_date_str(cdate_sec)
    l = []
    if tree: l.append(b'tree %s' % hexlify(tree))
    if parent: l.append(b'parent %s' % hexlify(parent))
    if author: l.append(b'author %s %s' % (author, adate_str))
    if committer: l.append(b'committer %s %s' % (committer, cdate_str))
    l.append(b'')
    l.append(msg)
    return b'\n'.join(l)

def _make_objcache():
    return PackIdxList(repo(b'objects/pack'))

# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.

class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.repo_dir = repo_dir or repo()
        self.file = None
        self.parentfd = None
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.idx = None
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level
        self.run_midx = run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir,
                                           opttype='int')
        if not max_pack_size:
            # larger packs slow down pruning
            max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def _open(self):
        if not self.file:
            objdir = os.path.join(self.repo_dir, b'objects')
            fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                os.close(fd)
                raise
            try:
                self.parentfd = os.open(objdir, os.O_RDONLY)
            except:
                f = self.file
                self.file = None
                f.close()
                raise
            assert name.endswith(b'.pack')
            self.filename = name[:-5]
            self.file.write(b'PACK\0\0\0\2\0\0\0\0')
            self.idx = PackIdxV2Writer()

    def _raw_write(self, datalist, sha):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            reraise(GitError(e))
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        assert(sha)
        if self.idx:
            self.idx.add(sha, crc, self.file.tell() - size)

    def _write(self, sha, type, content):
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def just_write(self, sha, type, content):
        """Write an object to the pack file without checking for duplication."""
        self._write(sha, type, content)
        # If nothing else, gc doesn't have/want an objcache
        if self.objcache is not None:
            self.objcache.add(sha)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write(b'blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write(b'tree', content)

    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
                   msg):
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        content = create_commit_blob(tree, parent,
                                     author, adate_sec, adate_tz,
                                     committer, cdate_sec, cdate_tz,
                                     msg)
        return self.maybe_write(b'commit', content)

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            pfd = self.parentfd
            self.file = None
            self.parentfd = None
            self.idx = None
            try:
                try:
                    os.unlink(self.filename + b'.pack')
                finally:
                    f.close()
            finally:
                if pfd is not None:
                    os.close(pfd)

    def _end(self, run_midx=True):
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None
            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' + hexlify(packbin))
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

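# Minimal usage sketch (assumes check_repo_or_die() or an explicit
# repo_dir has made repo() valid):
#   w = PackWriter()
#   oid = w.new_blob(b'some data')   # 20-byte binary sha
#   w.close()                        # renames the temp pack into
#                                    # objects/pack/pack-<packsha>.pack
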
class PackIdxV2Writer:
    def __init__(self):
        self.idx = list(list() for i in range(256))
        self.count = 0

    def add(self, sha, crc, offs):
        assert(sha)
        self.count += 1
        self.idx[byte_int(sha[0])].append((sha, crc, offs))

    def write(self, filename, packbin):
        ofs64_count = 0
        for section in self.idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas + crcs + 32-bit offsets
        # + 64-bit overflow offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, self.idx,
                                           self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            for b in chunkyreader(idx_f, 20 * self.count):
                idx_sum.update(b)

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
        finally:
            idx_f.close()

def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.

    """
    argv = [b'git', b'show-ref']
    if limit_to_heads:
        argv.append(b'--heads')
    if limit_to_tags:
        argv.append(b'--tags')
    argv.append(b'--')
    if patterns:
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
                         close_fds=True)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split(b'\n'):
            sha, name = d.split(b' ', 1)
            yield name, unhexlify(sha)

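# For example (illustrative):
#   for name, sha in list_refs(limit_to_heads=True):
#       ...   # name like b'refs/heads/main', sha is 20 raw bytes
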
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    return None

def rev_list_invocation(ref_or_refs, format=None):
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    argv = [b'git', b'rev-list']
    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    argv.append(b'--')
    return argv

def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.

    """
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                                             format=format),
                         env=_gitenv(repo_dir),
                         stdout = subprocess.PIPE,
                         close_fds=True)
    if not format:
        for line in p.stdout:
            yield line.strip()
    else:
        line = p.stdout.readline()
        while line:
            s = line.strip()
            if not s.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(s))
            s = s[7:]
            assert len(s) == 40
            yield s, parse(p.stdout)
            line = p.stdout.readline()

    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)

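# Without a format this yields plain hex hashes; with one, supply a parse
# function that consumes exactly what the format emits per commit, e.g.
# (illustrative):
#   for oidx, ct in rev_list(b'refs/heads/main', format=b'%at',
#                            parse=lambda f: int(f.readline().strip())):
#       ...   # ct is the commit timestamp
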
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            hash = unhexlify(committish)
        except TypeError:
            return None
        if pL.exists(hash):
            return hash

    return None

def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    if not oldval:
        oldval = b''
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir),
                         close_fds=True)
    _git_wait(b'git update-ref', p)

def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git update-ref', p)

def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = environ.get(b'BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser(b'~/.bup')

def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv(),
                         close_fds=True)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv(), close_fds=True)
    _git_wait('git config', p)

def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % top)
            sys.exit(13)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)

def is_suitable_git(ver_str):
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    sys.exit(13)

_git_great = None

def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.

    """
    global _git_great
    if _git_great is not None:
        return
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    if status == 'suitable':
        _git_great = True
        return
    assert False

class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.it)
        except StopIteration as e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    next = __next__

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()

class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self, repo_dir = None):
        require_suitable_git()
        self.repo_dir = repo_dir
        self.p = self.inprogress = None

    def close(self, wait=False):
        p, self.p = self.p, None
        if p:
            p.stdout.close()
            p.stdin.close()
        self.inprogress = None
        if wait and p:
            return p.wait()
        return None

    def restart(self):
        self.close()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        """
        if not self.p or self.p.poll() != None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result == None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if not hdr:
            raise GitError('unexpected cat-file EOF (last request: %r, exit: %s)'
                           % (ref, self.p.poll() or 'none'))
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self.restart)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception as e:
            it.abort()
            raise

    def _join(self, it):
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d

_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp, repodir
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp

def close_catpipes():
    # FIXME: chain exceptions
    while _cp:
        _, cp = _cp.popitem()
        cp.close(wait=True)

def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[len(b'refs/tags/'):]
        if not c in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags

class MissingObject(KeyError):
    def __init__(self, oid):
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))

WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
# chunked file will have a chunk_path of ['']. So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...

def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true. Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))
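
# Usage sketch (illustrative): enumerate everything reachable from a
# commit without fetching blob payloads:
#   for item in walk_object(cp().get, oidx):
#       ...   # item.oid, item.type, item.path, item.chunk_path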