1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import os, sys, zlib, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
13 from bup import _helpers, hashsplit, path, midx, bloom, xstat
14 from bup.compat import (buffer,
15 byte_int, bytes_from_byte, bytes_from_uint,
20 from bup.io import path_msg
21 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
27 mmap_read, mmap_readwrite,
28 progress, qprogress, stat_if_exists,
34 repodir = None # The default repository, once initialized
36 _typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
37 _typermap = {v: k for k, v in items(_typemap)}
44 class GitError(Exception):
48 def _gitenv(repo_dir=None):
51 return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
53 def _git_wait(cmd, p):
56 raise GitError('%r returned %d' % (cmd, rv))
58 def _git_exo(cmd, **kwargs):
59 kwargs['check'] = False
60 result = exo(cmd, **kwargs)
62 if proc.returncode != 0:
63 raise GitError('%r returned %d' % (cmd, proc.returncode))
66 def git_config_get(option, repo_dir=None, opttype=None, cfg_file=None):
67 assert not (repo_dir and cfg_file), "repo_dir and cfg_file cannot both be used"
68 cmd = [b'git', b'config', b'--null']
70 cmd.extend([b'--file', cfg_file])
72 cmd.extend([b'--int'])
73 elif opttype == 'bool':
74 cmd.extend([b'--bool'])
76 assert opttype is None
77 cmd.extend([b'--get', option])
80 env = _gitenv(repo_dir=repo_dir)
81 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env,
83 # with --null, git writes out a trailing \0 after the value
84 r = p.stdout.read()[:-1]
89 elif opttype == 'bool':
90 # git converts to 'true' or 'false'
94 raise GitError('%r returned %d' % (cmd, rc))
98 def parse_tz_offset(s):
99 """UTC offset in seconds."""
100 tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
101 if bytes_from_byte(s[0]) == b'-':
106 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
107 # Make sure that's authoritative.
108 _start_end_char = br'[^ .,:;<>"\'\0\n]'
109 _content_char = br'[^\0\n<>]'
110 _safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
112 _start_end_char, _content_char, _start_end_char)
113 _tz_rx = br'[-+]\d\d[0-5]\d'
114 _parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
115 # Assumes every following line starting with a space is part of the
116 # mergetag. Is there a formal commit blob spec?
117 _mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
118 _commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
119 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
120 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
122 (?P<message>(?:.|\n)*)''' % (_parent_rx,
123 _safe_str_rx, _safe_str_rx, _tz_rx,
124 _safe_str_rx, _safe_str_rx, _tz_rx,
126 _parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
128 # Note that the author_sec and committer_sec values are (UTC) epoch
129 # seconds, and for now the mergetag is not included.
130 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
131 'author_name', 'author_mail',
132 'author_sec', 'author_offset',
133 'committer_name', 'committer_mail',
134 'committer_sec', 'committer_offset',
137 def parse_commit(content):
138 commit_match = re.match(_commit_rx, content)
140 raise Exception('cannot parse commit %r' % content)
141 matches = commit_match.groupdict()
142 return CommitInfo(tree=matches['tree'],
143 parents=re.findall(_parent_hash_rx, matches['parents']),
144 author_name=matches['author_name'],
145 author_mail=matches['author_mail'],
146 author_sec=int(matches['asec']),
147 author_offset=parse_tz_offset(matches['atz']),
148 committer_name=matches['committer_name'],
149 committer_mail=matches['committer_mail'],
150 committer_sec=int(matches['csec']),
151 committer_offset=parse_tz_offset(matches['ctz']),
152 message=matches['message'])
155 def get_cat_data(cat_iterator, expected_type):
156 _, kind, _ = next(cat_iterator)
157 if kind != expected_type:
158 raise Exception('expected %r, saw %r' % (expected_type, kind))
159 return b''.join(cat_iterator)
161 def get_commit_items(id, cp):
162 return parse_commit(get_cat_data(cp.get(id), b'commit'))
164 def _local_git_date_str(epoch_sec):
165 return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
168 def _git_date_str(epoch_sec, tz_offset_sec):
169 offs = tz_offset_sec // 60
170 return b'%d %s%02d%02d' \
172 b'+' if offs >= 0 else b'-',
177 def repo(sub = b'', repo_dir=None):
178 """Get the path to the git repository or one of its subdirectories."""
179 repo_dir = repo_dir or repodir
181 raise GitError('You should call check_repo_or_die()')
183 # If there's a .git subdirectory, then the actual repo is in there.
184 gd = os.path.join(repo_dir, b'.git')
185 if os.path.exists(gd):
188 return os.path.join(repo_dir, sub)
192 re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
195 return _shorten_hash_rx.sub(br'\1\2*\3', s)
199 full = os.path.abspath(path)
200 fullrepo = os.path.abspath(repo(b''))
201 if not fullrepo.endswith(b'/'):
203 if full.startswith(fullrepo):
204 path = full[len(fullrepo):]
205 if path.startswith(b'index-cache/'):
206 path = path[len(b'index-cache/'):]
207 return shorten_hash(path)
210 def auto_midx(objdir):
211 args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
213 rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
215 # make sure 'args' gets printed to help with debugging
216 add_error('%r: exception: %s' % (args, e))
219 add_error('%r: returned %d' % (args, rv))
221 args = [path.exe(), b'bloom', b'--dir', objdir]
223 rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
225 # make sure 'args' gets printed to help with debugging
226 add_error('%r: exception: %s' % (args, e))
229 add_error('%r: returned %d' % (args, rv))
232 def mangle_name(name, mode, gitmode):
233 """Mangle a file name to present an abstract name for segmented files.
234 Mangled file names will have the ".bup" extension added to them. If a
235 file's name already ends with ".bup", a ".bupl" extension is added to
236 disambiguate normal files from segmented ones.
238 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
239 assert(stat.S_ISDIR(gitmode))
240 return name + b'.bup'
241 elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
242 return name + b'.bupl'
247 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
248 def demangle_name(name, mode):
249 """Remove name mangling from a file name, if necessary.
251 The return value is a tuple (demangled_filename,mode), where mode is one of
254 * BUP_NORMAL : files that should be read as-is from the repository
255 * BUP_CHUNKED : files that were chunked and need to be reassembled
257 For more information on the name mangling algorithm, see mangle_name()
259 if name.endswith(b'.bupl'):
260 return (name[:-5], BUP_NORMAL)
261 elif name.endswith(b'.bup'):
262 return (name[:-4], BUP_CHUNKED)
263 elif name.endswith(b'.bupm'):
265 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
267 return (name, BUP_NORMAL)
270 def calc_hash(type, content):
271 """Calculate some content's hash in the Git fashion."""
272 header = b'%s %d\0' % (type, len(content))
278 def shalist_item_sort_key(ent):
279 (mode, name, id) = ent
280 assert(mode+0 == mode)
281 if stat.S_ISDIR(mode):
287 def tree_encode(shalist):
288 """Generate a git tree object from (mode,name,hash) tuples."""
289 shalist = sorted(shalist, key = shalist_item_sort_key)
291 for (mode,name,bin) in shalist:
293 assert(mode+0 == mode)
295 assert(len(bin) == 20)
296 s = b'%o %s\0%s' % (mode,name,bin)
297 assert s[0] != b'0' # 0-padded octal is not acceptable in a git tree
302 def tree_decode(buf):
303 """Generate a list of (mode,name,hash) from the git tree object in buf."""
305 while ofs < len(buf):
306 z = buf.find(b'\0', ofs)
308 spl = buf[ofs:z].split(b' ', 1)
309 assert(len(spl) == 2)
311 sha = buf[z+1:z+1+20]
313 yield (int(mode, 8), name, sha)
316 def _encode_packobj(type, content, compression_level=1):
317 if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
318 raise ValueError('invalid compression level %s' % compression_level)
321 szbits = (sz & 0x0f) | (_typemap[type]<<4)
324 if sz: szbits |= 0x80
325 szout += bytes_from_uint(szbits)
330 z = zlib.compressobj(compression_level)
332 yield z.compress(content)
336 def _decode_packobj(buf):
339 type = _typermap[(c & 0x70) >> 4]
346 sz |= (c & 0x7f) << shift
350 return (type, zlib.decompress(buf[i+1:]))
357 def find_offset(self, hash):
358 """Get the offset of an object inside the index file."""
359 idx = self._idx_from_hash(hash)
361 return self._ofs_from_idx(idx)
364 def exists(self, hash, want_source=False):
365 """Return nonempty if the object exists in this index."""
366 if hash and (self._idx_from_hash(hash) != None):
367 return want_source and os.path.basename(self.name) or True
370 def _idx_from_hash(self, hash):
371 global _total_searches, _total_steps
373 assert(len(hash) == 20)
374 b1 = byte_int(hash[0])
375 start = self.fanout[b1-1] # range -1..254
376 end = self.fanout[b1] # range 0..255
378 _total_steps += 1 # lookup table is a step
381 mid = start + (end - start) // 2
382 v = self._idx_to_hash(mid)
392 class PackIdxV1(PackIdx):
393 """Object representation of a Git pack index (version 1) file."""
394 def __init__(self, filename, f):
396 self.idxnames = [self.name]
397 self.map = mmap_read(f)
398 # Min size for 'L' is 4, which is sufficient for struct's '!I'
399 self.fanout = array('L', struct.unpack('!256I', self.map))
400 self.fanout.append(0) # entry "-1"
401 self.nsha = self.fanout[255]
402 self.sha_ofs = 256 * 4
403 # Avoid slicing shatable for individual hashes (very high overhead)
404 self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
409 def __exit__(self, type, value, traceback):
413 return int(self.nsha) # int() from long for python 2
415 def _ofs_from_idx(self, idx):
416 if idx >= self.nsha or idx < 0:
417 raise IndexError('invalid pack index index %d' % idx)
418 ofs = self.sha_ofs + idx * 24
419 return struct.unpack_from('!I', self.map, offset=ofs)[0]
421 def _idx_to_hash(self, idx):
422 if idx >= self.nsha or idx < 0:
423 raise IndexError('invalid pack index index %d' % idx)
424 ofs = self.sha_ofs + idx * 24 + 4
425 return self.map[ofs : ofs + 20]
428 start = self.sha_ofs + 4
429 for ofs in range(start, start + 24 * self.nsha, 24):
430 yield self.map[ofs : ofs + 20]
433 if self.map is not None:
439 class PackIdxV2(PackIdx):
440 """Object representation of a Git pack index (version 2) file."""
441 def __init__(self, filename, f):
443 self.idxnames = [self.name]
444 self.map = mmap_read(f)
445 assert self.map[0:8] == b'\377tOc\0\0\0\2'
446 # Min size for 'L' is 4, which is sufficient for struct's '!I'
447 self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
448 self.fanout.append(0)
449 self.nsha = self.fanout[255]
450 self.sha_ofs = 8 + 256*4
451 self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
452 self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
453 # Avoid slicing this for individual hashes (very high overhead)
454 self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
459 def __exit__(self, type, value, traceback):
463 return int(self.nsha) # int() from long for python 2
465 def _ofs_from_idx(self, idx):
466 if idx >= self.nsha or idx < 0:
467 raise IndexError('invalid pack index index %d' % idx)
468 ofs_ofs = self.ofstable_ofs + idx * 4
469 ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
471 idx64 = ofs & 0x7fffffff
472 ofs64_ofs = self.ofs64table_ofs + idx64 * 8
473 ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
476 def _idx_to_hash(self, idx):
477 if idx >= self.nsha or idx < 0:
478 raise IndexError('invalid pack index index %d' % idx)
479 ofs = self.sha_ofs + idx * 20
480 return self.map[ofs : ofs + 20]
484 for ofs in range(start, start + 20 * self.nsha, 20):
485 yield self.map[ofs : ofs + 20]
488 if self.map is not None:
496 def __init__(self, dir, ignore_midx=False):
498 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
503 self.do_bloom = False
505 self.ignore_midx = ignore_midx
511 assert(_mpi_count == 0)
514 return iter(idxmerge(self.packs))
517 return sum(len(pack) for pack in self.packs)
519 def exists(self, hash, want_source=False):
520 """Return nonempty if the object exists in the index files."""
521 global _total_searches
523 if hash in self.also:
525 if self.do_bloom and self.bloom:
526 if self.bloom.exists(hash):
527 self.do_bloom = False
529 _total_searches -= 1 # was counted by bloom
531 for i in range(len(self.packs)):
533 _total_searches -= 1 # will be incremented by sub-pack
534 ix = p.exists(hash, want_source=want_source)
536 # reorder so most recently used packs are searched first
537 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
542 def refresh(self, skip_midx = False):
543 """Refresh the index list.
544 This method verifies if .midx files were superseded (e.g. all of its
545 contents are in another, bigger .midx file) and removes the superseded
548 If skip_midx is True, all work on .midx files will be skipped and .midx
549 files will be removed from the list.
551 The instance variable 'ignore_midx' can force this function to
552 always act as if skip_midx was True.
554 if self.bloom is not None:
556 self.bloom = None # Always reopen the bloom as it may have been relaced
557 self.do_bloom = False
558 skip_midx = skip_midx or self.ignore_midx
559 d = dict((p.name, p) for p in self.packs
560 if not skip_midx or not isinstance(p, midx.PackMidx))
561 if os.path.exists(self.dir):
564 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
565 # remove any *.midx files from our list that no longer exist
566 for ix in list(d.values()):
567 if not isinstance(ix, midx.PackMidx):
569 if ix.name in midxes:
574 self.packs.remove(ix)
575 for ix in self.packs:
576 if isinstance(ix, midx.PackMidx):
577 for name in ix.idxnames:
578 d[os.path.join(self.dir, name)] = ix
581 mx = midx.PackMidx(full)
582 (mxd, mxf) = os.path.split(mx.name)
584 for n in mx.idxnames:
585 if not os.path.exists(os.path.join(mxd, n)):
586 log(('warning: index %s missing\n'
588 % (path_msg(n), path_msg(mxf)))
596 midxl.sort(key=lambda ix:
597 (-len(ix), -xstat.stat(ix.name).st_mtime))
600 for sub in ix.idxnames:
601 found = d.get(os.path.join(self.dir, sub))
602 if not found or isinstance(found, PackIdx):
603 # doesn't exist, or exists but not in a midx
608 for name in ix.idxnames:
609 d[os.path.join(self.dir, name)] = ix
610 elif not ix.force_keep:
611 debug1('midx: removing redundant: %s\n'
612 % path_msg(os.path.basename(ix.name)))
615 for full in glob.glob(os.path.join(self.dir, b'*.idx')):
619 except GitError as e:
623 bfull = os.path.join(self.dir, b'bup.bloom')
624 if self.bloom is None and os.path.exists(bfull):
625 self.bloom = bloom.ShaBloom(bfull)
626 self.packs = list(set(d.values()))
627 self.packs.sort(reverse=True, key=lambda x: len(x))
628 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
632 debug1('PackIdxList: using %d index%s.\n'
633 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
636 """Insert an additional object in the list."""
640 def open_idx(filename):
641 if filename.endswith(b'.idx'):
642 f = open(filename, 'rb')
644 if header[0:4] == b'\377tOc':
645 version = struct.unpack('!I', header[4:8])[0]
647 return PackIdxV2(filename, f)
649 raise GitError('%s: expected idx file version 2, got %d'
650 % (path_msg(filename), version))
651 elif len(header) == 8 and header[0:4] < b'\377tOc':
652 return PackIdxV1(filename, f)
654 raise GitError('%s: unrecognized idx file header'
655 % path_msg(filename))
656 elif filename.endswith(b'.midx'):
657 return midx.PackMidx(filename)
659 raise GitError('idx filenames must end with .idx or .midx')
662 def idxmerge(idxlist, final_progress=True):
663 """Generate a list of all the objects reachable in a PackIdxList."""
664 def pfunc(count, total):
665 qprogress('Reading indexes: %.2f%% (%d/%d)\r'
666 % (count*100.0/total, count, total))
667 def pfinal(count, total):
669 progress('Reading indexes: %.2f%% (%d/%d), done.\n'
670 % (100, total, total))
671 return merge_iter(idxlist, 10024, pfunc, pfinal)
674 def create_commit_blob(tree, parent,
675 author, adate_sec, adate_tz,
676 committer, cdate_sec, cdate_tz,
678 if adate_tz is not None:
679 adate_str = _git_date_str(adate_sec, adate_tz)
681 adate_str = _local_git_date_str(adate_sec)
682 if cdate_tz is not None:
683 cdate_str = _git_date_str(cdate_sec, cdate_tz)
685 cdate_str = _local_git_date_str(cdate_sec)
687 if tree: l.append(b'tree %s' % hexlify(tree))
688 if parent: l.append(b'parent %s' % hexlify(parent))
689 if author: l.append(b'author %s %s' % (author, adate_str))
690 if committer: l.append(b'committer %s %s' % (committer, cdate_str))
696 def _make_objcache():
697 return PackIdxList(repo(b'objects/pack'))
699 # bup-gc assumes that it can disable all PackWriter activities
700 # (bloom/midx/cache) via the constructor and close() arguments.
703 """Writes Git objects inside a pack file."""
704 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
705 run_midx=True, on_pack_finish=None,
706 max_pack_size=None, max_pack_objects=None, repo_dir=None):
707 self.repo_dir = repo_dir or repo()
714 self.objcache_maker = objcache_maker
716 self.compression_level = compression_level
717 self.run_midx=run_midx
718 self.on_pack_finish = on_pack_finish
719 if not max_pack_size:
720 max_pack_size = git_config_get(b'pack.packSizeLimit',
721 repo_dir=self.repo_dir,
723 if not max_pack_size:
724 # larger packs slow down pruning
725 max_pack_size = 1000 * 1000 * 1000
726 self.max_pack_size = max_pack_size
727 # cache memory usage is about 83 bytes per object
728 self.max_pack_objects = max_pack_objects if max_pack_objects \
729 else max(1, self.max_pack_size // 5000)
737 def __exit__(self, type, value, traceback):
742 objdir = dir = os.path.join(self.repo_dir, b'objects')
743 fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
745 self.file = os.fdopen(fd, 'w+b')
750 self.parentfd = os.open(objdir, os.O_RDONLY)
756 assert name.endswith(b'.pack')
757 self.filename = name[:-5]
758 self.file.write(b'PACK\0\0\0\2\0\0\0\0')
759 self.idx = PackIdxV2Writer()
761 def _raw_write(self, datalist, sha):
764 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
765 # the file never has a *partial* blob. So let's make sure it's
766 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
767 # to our hashsplit algorithm.) f.write() does its own buffering,
768 # but that's okay because we'll flush it in _end().
769 oneblob = b''.join(datalist)
775 crc = zlib.crc32(oneblob) & 0xffffffff
776 self._update_idx(sha, crc, nw)
781 def _update_idx(self, sha, crc, size):
784 self.idx.add(sha, crc, self.file.tell() - size)
786 def _write(self, sha, type, content):
790 sha = calc_hash(type, content)
791 size, crc = self._raw_write(_encode_packobj(type, content,
792 self.compression_level),
794 if self.outbytes >= self.max_pack_size \
795 or self.count >= self.max_pack_objects:
799 def breakpoint(self):
800 """Clear byte and object counts and return the last processed id."""
801 id = self._end(self.run_midx)
802 self.outbytes = self.count = 0
805 def _require_objcache(self):
806 if self.objcache is None and self.objcache_maker:
807 self.objcache = self.objcache_maker()
808 if self.objcache is None:
810 "PackWriter not opened or can't check exists w/o objcache")
812 def exists(self, id, want_source=False):
813 """Return non-empty if an object is found in the object cache."""
814 self._require_objcache()
815 return self.objcache.exists(id, want_source=want_source)
817 def just_write(self, sha, type, content):
818 """Write an object to the pack file without checking for duplication."""
819 self._write(sha, type, content)
820 # If nothing else, gc doesn't have/want an objcache
821 if self.objcache is not None:
822 self.objcache.add(sha)
824 def maybe_write(self, type, content):
825 """Write an object to the pack file if not present and return its id."""
826 sha = calc_hash(type, content)
827 if not self.exists(sha):
828 self._require_objcache()
829 self.just_write(sha, type, content)
832 def new_blob(self, blob):
833 """Create a blob object in the pack with the supplied content."""
834 return self.maybe_write(b'blob', blob)
836 def new_tree(self, shalist):
837 """Create a tree object in the pack."""
838 content = tree_encode(shalist)
839 return self.maybe_write(b'tree', content)
841 def new_commit(self, tree, parent,
842 author, adate_sec, adate_tz,
843 committer, cdate_sec, cdate_tz,
845 """Create a commit object in the pack. The date_sec values must be
846 epoch-seconds, and if a tz is None, the local timezone is assumed."""
847 content = create_commit_blob(tree, parent,
848 author, adate_sec, adate_tz,
849 committer, cdate_sec, cdate_tz,
851 return self.maybe_write(b'commit', content)
854 """Remove the pack file from disk."""
863 os.unlink(self.filename + b'.pack')
870 def _end(self, run_midx=True):
872 if not f: return None
879 # update object count
881 cp = struct.pack('!i', self.count)
885 # calculate the pack sha1sum
888 for b in chunkyreader(f):
890 packbin = sum.digest()
892 fdatasync(f.fileno())
896 idx.write(self.filename + b'.idx', packbin)
897 nameprefix = os.path.join(self.repo_dir,
898 b'objects/pack/pack-' + hexlify(packbin))
899 if os.path.exists(self.filename + b'.map'):
900 os.unlink(self.filename + b'.map')
901 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
902 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
904 os.fsync(self.parentfd)
906 os.close(self.parentfd)
909 auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
911 if self.on_pack_finish:
912 self.on_pack_finish(nameprefix)
916 def close(self, run_midx=True):
917 """Close the pack file and move it to its definitive path."""
918 return self._end(run_midx=run_midx)
921 class PackIdxV2Writer:
923 self.idx = list(list() for i in range(256))
926 def add(self, sha, crc, offs):
929 self.idx[byte_int(sha[0])].append((sha, crc, offs))
931 def write(self, filename, packbin):
933 for section in self.idx:
934 for entry in section:
935 if entry[2] >= 2**31:
938 # Length: header + fan-out + shas-and-crcs + overflow-offsets
939 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
941 idx_f = open(filename, 'w+b')
943 idx_f.truncate(index_len)
944 fdatasync(idx_f.fileno())
945 idx_map = mmap_readwrite(idx_f, close=False)
947 count = _helpers.write_idx(filename, idx_map, self.idx,
949 assert(count == self.count)
956 idx_f = open(filename, 'a+b')
961 b = idx_f.read(8 + 4*256)
964 for b in chunkyreader(idx_f, 20 * self.count):
967 for b in chunkyreader(idx_f):
969 idx_f.write(idx_sum.digest())
970 fdatasync(idx_f.fileno())
975 def list_refs(patterns=None, repo_dir=None,
976 limit_to_heads=False, limit_to_tags=False):
977 """Yield (refname, hash) tuples for all repository refs unless
978 patterns are specified. In that case, only include tuples for
979 refs matching those patterns (cf. git-show-ref(1)). The limits
980 restrict the result items to refs/heads or refs/tags. If both
981 limits are specified, items from both sources will be included.
984 argv = [b'git', b'show-ref']
986 argv.append(b'--heads')
988 argv.append(b'--tags')
991 argv.extend(patterns)
992 p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE,
994 out = p.stdout.read().strip()
995 rv = p.wait() # not fatal
999 for d in out.split(b'\n'):
1000 sha, name = d.split(b' ', 1)
1001 yield name, unhexlify(sha)
1004 def read_ref(refname, repo_dir = None):
1005 """Get the commit id of the most recent commit made on a given ref."""
1006 refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
1007 l = tuple(islice(refs, 2))
1015 def rev_list_invocation(ref_or_refs, format=None):
1016 if isinstance(ref_or_refs, bytes):
1017 refs = (ref_or_refs,)
1020 argv = [b'git', b'rev-list']
1023 argv.append(b'--pretty=format:' + format)
1025 assert not ref.startswith(b'-')
1031 def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
1032 """Yield information about commits as per "git rev-list". If a format
1033 is not provided, yield one hex hash at a time. If a format is
1034 provided, pass it to rev-list and call parse(git_stdout) for each
1035 commit with the stream positioned just after the rev-list "commit
1036 HASH" header line. When a format is provided yield (oidx,
1037 parse(git_stdout)) for each commit.
1040 assert bool(parse) == bool(format)
1041 p = subprocess.Popen(rev_list_invocation(ref_or_refs,
1043 env=_gitenv(repo_dir),
1044 stdout = subprocess.PIPE,
1047 for line in p.stdout:
1050 line = p.stdout.readline()
1053 if not s.startswith(b'commit '):
1054 raise Exception('unexpected line ' + repr(s))
1057 yield s, parse(p.stdout)
1058 line = p.stdout.readline()
1060 rv = p.wait() # not fatal
1062 raise GitError('git rev-list returned error %d' % rv)
1065 def rev_parse(committish, repo_dir=None):
1066 """Resolve the full hash for 'committish', if it exists.
1068 Should be roughly equivalent to 'git rev-parse'.
1070 Returns the hex value of the hash if it is found, None if 'committish' does
1071 not correspond to anything.
1073 head = read_ref(committish, repo_dir=repo_dir)
1075 debug2("resolved from ref: commit = %s\n" % hexlify(head))
1078 pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))
1080 if len(committish) == 40:
1082 hash = unhexlify(committish)
1092 def update_ref(refname, newval, oldval, repo_dir=None):
1093 """Update a repository reference."""
1096 assert refname.startswith(b'refs/heads/') \
1097 or refname.startswith(b'refs/tags/')
1098 p = subprocess.Popen([b'git', b'update-ref', refname,
1099 hexlify(newval), hexlify(oldval)],
1100 env=_gitenv(repo_dir),
1102 _git_wait(b'git update-ref', p)
1105 def delete_ref(refname, oldvalue=None):
1106 """Delete a repository reference (see git update-ref(1))."""
1107 assert refname.startswith(b'refs/')
1108 oldvalue = [] if not oldvalue else [oldvalue]
1109 p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
1112 _git_wait('git update-ref', p)
1115 def guess_repo(path=None):
1116 """Set the path value in the global variable "repodir".
1117 This makes bup look for an existing bup repository, but not fail if a
1118 repository doesn't exist. Usually, if you are interacting with a bup
1119 repository, you would not be calling this function but using
1120 check_repo_or_die().
1126 repodir = environ.get(b'BUP_DIR')
1128 repodir = os.path.expanduser(b'~/.bup')
1131 def init_repo(path=None):
1132 """Create the Git bare repository for bup in a given path."""
1134 d = repo() # appends a / to the path
1135 parent = os.path.dirname(os.path.dirname(d))
1136 if parent and not os.path.exists(parent):
1137 raise GitError('parent directory "%s" does not exist\n'
1139 if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
1140 raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
1141 p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
1144 _git_wait('git init', p)
1145 # Force the index version configuration in order to ensure bup works
1146 # regardless of the version of the installed Git binary.
1147 p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
1148 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1149 _git_wait('git config', p)
1151 p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
1152 stdout=sys.stderr, env=_gitenv(), close_fds=True)
1153 _git_wait('git config', p)
1156 def check_repo_or_die(path=None):
1157 """Check to see if a bup repository probably exists, and abort if not."""
1160 pst = stat_if_exists(top + b'/objects/pack')
1161 if pst and stat.S_ISDIR(pst.st_mode):
1164 top_st = stat_if_exists(top)
1166 log('error: repository %r does not exist (see "bup help init")\n'
1169 log('error: %s is not a repository\n' % path_msg(top))
1173 def is_suitable_git(ver_str):
1174 if not ver_str.startswith(b'git version '):
1175 return 'unrecognized'
1176 ver_str = ver_str[len(b'git version '):]
1177 if ver_str.startswith(b'0.'):
1178 return 'insufficient'
1179 if ver_str.startswith(b'1.'):
1180 if re.match(br'1\.[012345]rc', ver_str):
1181 return 'insufficient'
1182 if re.match(br'1\.[01234]\.', ver_str):
1183 return 'insufficient'
1184 if re.match(br'1\.5\.[012345]($|\.)', ver_str):
1185 return 'insufficient'
1186 if re.match(br'1\.5\.6-rc', ver_str):
1187 return 'insufficient'
1189 if re.match(br'[0-9]+(\.|$)?', ver_str):
1195 def require_suitable_git(ver_str=None):
1196 """Raise GitError if the version of git isn't suitable.
1198 Rely on ver_str when provided, rather than invoking the git in the
1203 if _git_great is not None:
1205 if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
1206 in (b'yes', b'true', b'1'):
1210 ver_str, _, _ = _git_exo([b'git', b'--version'])
1211 status = is_suitable_git(ver_str)
1212 if status == 'unrecognized':
1213 raise GitError('Unexpected git --version output: %r' % ver_str)
1214 if status == 'insufficient':
1215 log('error: git version must be at least 1.5.6\n')
1217 if status == 'suitable':
1223 class _AbortableIter:
1224 def __init__(self, it, onabort = None):
1226 self.onabort = onabort
1234 return next(self.it)
1235 except StopIteration as e:
1245 """Abort iteration and call the abortion callback, if needed."""
1256 """Link to 'git cat-file' that is used to retrieve blob data."""
1257 def __init__(self, repo_dir = None):
1258 require_suitable_git()
1259 self.repo_dir = repo_dir
1260 self.p = self.inprogress = None
1262 def close(self, wait=False):
1268 self.inprogress = None
1275 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1276 stdin=subprocess.PIPE,
1277 stdout=subprocess.PIPE,
1280 env=_gitenv(self.repo_dir))
1283 """Yield (oidx, type, size), followed by the data referred to by ref.
1284 If ref does not exist, only yield (None, None, None).
1287 if not self.p or self.p.poll() != None:
1290 poll_result = self.p.poll()
1291 assert(poll_result == None)
1293 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1294 assert(not self.inprogress)
1295 assert ref.find(b'\n') < 0
1296 assert ref.find(b'\r') < 0
1297 assert not ref.startswith(b'-')
1298 self.inprogress = ref
1299 self.p.stdin.write(ref + b'\n')
1300 self.p.stdin.flush()
1301 hdr = self.p.stdout.readline()
1302 if hdr.endswith(b' missing\n'):
1303 self.inprogress = None
1304 yield None, None, None
1306 info = hdr.split(b' ')
1307 if len(info) != 3 or len(info[0]) != 40:
1308 raise GitError('expected object (id, type, size), got %r' % info)
1309 oidx, typ, size = info
1311 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1314 yield oidx, typ, size
1317 readline_result = self.p.stdout.readline()
1318 assert readline_result == b'\n'
1319 self.inprogress = None
1320 except Exception as e:
1324 def _join(self, it):
1325 _, typ, _ = next(it)
1329 elif typ == b'tree':
1330 treefile = b''.join(it)
1331 for (mode, name, sha) in tree_decode(treefile):
1332 for blob in self.join(hexlify(sha)):
1334 elif typ == b'commit':
1335 treeline = b''.join(it).split(b'\n')[0]
1336 assert treeline.startswith(b'tree ')
1337 for blob in self.join(treeline[5:]):
1340 raise GitError('invalid object type %r: expected blob/tree/commit'
1344 """Generate a list of the content of all blobs that can be reached
1345 from an object. The hash given in 'id' must point to a blob, a tree
1346 or a commit. The content of all blobs that can be seen from trees or
1347 commits will be added to the list.
1349 for d in self._join(self.get(id)):
1355 def cp(repo_dir=None):
1356 """Create a CatPipe object or reuse the already existing one."""
1359 repo_dir = repodir or repo()
1360 repo_dir = os.path.abspath(repo_dir)
1361 cp = _cp.get(repo_dir)
1363 cp = CatPipe(repo_dir)
1368 def close_catpipes():
1369 # FIXME: chain exceptions
1371 _, cp = _cp.popitem()
1375 def tags(repo_dir = None):
1376 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1378 for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
1379 assert n.startswith(b'refs/tags/')
1383 tags[c].append(name) # more than one tag can point at 'c'
1387 class MissingObject(KeyError):
1388 def __init__(self, oid):
1390 KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
1393 WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
1394 'path', 'chunk_path', 'data'])
1395 # The path is the mangled path, and if an item represents a fragment
1396 # of a chunked file, the chunk_path will be the chunked subtree path
1397 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1398 # chunked file will have a chunk_path of ['']. So some chunk subtree
1399 # of the file '/foo/bar/baz' might look like this:
1401 # item.path = ['foo', 'bar', 'baz.bup']
1402 # item.chunk_path = ['', '2d3115e', '016b097']
1403 # item.type = 'tree'
1407 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1408 """Yield everything reachable from oidx via get_ref (which must behave
1409 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1410 returns true. Throw MissingObject if a hash encountered is
1411 missing from the repository, and don't read or return blob content
1412 in the data field unless include_data is set.
1415 # Maintain the pending stack on the heap to avoid stack overflow
1416 pending = [(oidx, [], [], None)]
1418 oidx, parent_path, chunk_path, mode = pending.pop()
1419 oid = unhexlify(oidx)
1420 if stop_at and stop_at(oidx):
1423 if (not include_data) and mode and stat.S_ISREG(mode):
1424 # If the object is a "regular file", then it's a leaf in
1425 # the graph, so we can skip reading the data if the caller
1426 # hasn't requested it.
1427 yield WalkItem(oid=oid, type=b'blob',
1428 chunk_path=chunk_path, path=parent_path,
1433 item_it = get_ref(oidx)
1434 get_oidx, typ, _ = next(item_it)
1436 raise MissingObject(unhexlify(oidx))
1437 if typ not in (b'blob', b'commit', b'tree'):
1438 raise Exception('unexpected repository object type %r' % typ)
1440 # FIXME: set the mode based on the type when the mode is None
1441 if typ == b'blob' and not include_data:
1442 # Dump data until we can ask cat_pipe not to fetch it
1443 for ignored in item_it:
1447 data = b''.join(item_it)
1449 yield WalkItem(oid=oid, type=typ,
1450 chunk_path=chunk_path, path=parent_path,
1452 data=(data if include_data else None))
1454 if typ == b'commit':
1455 commit_items = parse_commit(data)
1456 for pid in commit_items.parents:
1457 pending.append((pid, parent_path, chunk_path, mode))
1458 pending.append((commit_items.tree, parent_path, chunk_path,
1459 hashsplit.GIT_MODE_TREE))
1460 elif typ == b'tree':
1461 for mode, name, ent_id in tree_decode(data):
1462 demangled, bup_type = demangle_name(name, mode)
1464 sub_path = parent_path
1465 sub_chunk_path = chunk_path + [name]
1467 sub_path = parent_path + [name]
1468 if bup_type == BUP_CHUNKED:
1469 sub_chunk_path = [b'']
1471 sub_chunk_path = chunk_path
1472 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,