1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
12 from numbers import Integral
14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
15 from bup.compat import (buffer,
16 byte_int, bytes_from_byte, bytes_from_uint,
21 from bup.io import path_msg
22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
25 hostname, localtime, log,
28 mmap_read, mmap_readwrite,
30 progress, qprogress, stat_if_exists,
33 from bup.pwdgrp import username, userfullname
37 repodir = None # The default repository, once initialized
39 _typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
40 _typermap = {v: k for k, v in items(_typemap)}
47 class GitError(Exception):
def _gitenv(repo_dir=None):
    if not repo_dir:
        repo_dir = repo()
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})


def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%r returned %d' % (cmd, rv))


def _git_exo(cmd, **kwargs):
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    _, _, proc = result
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
    return result


def git_config_get(option, repo_dir=None):
    cmd = (b'git', b'config', b'--get', option)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         env=_gitenv(repo_dir=repo_dir))
    r = p.stdout.read()
    rc = p.wait()
    if rc == 0:
        return r
    elif rc != 1:
        raise GitError('%r returned %d' % (cmd, rc))
    return None

def parse_tz_offset(s):
    """UTC offset in seconds."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        return - tz_off
    return tz_off

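# For example (illustrative doctest, not part of the original source):
#
#   >>> parse_tz_offset(b'-0500')
#   -18000
#   >>> parse_tz_offset(b'+0130')
#   5400
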
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])

def parse_commit(content):
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])

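# An illustrative sketch (not from the original source) of the commit
# layout parse_commit() expects; the names and values here are made up:
#
#   >>> info = parse_commit(
#   ...     b'tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\n'
#   ...     b'author A U Thor <author@example.com> 1234567890 +0000\n'
#   ...     b'committer C O Mitter <committer@example.com> 1234567890 -0500\n'
#   ...     b'\n'
#   ...     b'Example message\n')
#   >>> info.author_sec, info.committer_offset
#   (1234567890, -18000)
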
def get_cat_data(cat_iterator, expected_type):
    _, kind, _ = next(cat_iterator)
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    return b''.join(cat_iterator)


def get_commit_items(id, cp):
    return parse_commit(get_cat_data(cp.get(id), b'commit'))


def _local_git_date_str(epoch_sec):
    return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))

def _git_date_str(epoch_sec, tz_offset_sec):
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        % (epoch_sec,
           b'+' if offs >= 0 else b'-',
           abs(offs) // 60,
           abs(offs) % 60)

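# For example (illustrative, not part of the original source):
#
#   >>> _git_date_str(1234567890, -5 * 60 * 60)
#   b'1234567890 -0500'
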
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
        repo_dir = gd

    return os.path.join(repo_dir, sub)


_shorten_hash_rx = \
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')

def shorten_hash(s):
    return _shorten_hash_rx.sub(br'\1\2*\3', s)

def repo_rel(path):
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
        fullrepo += b'/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)


def all_packdirs():
    paths = [repo(b'objects/pack')]
    paths += glob.glob(repo(b'index-cache/*/.'))
    return paths

def auto_midx(objdir):
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        raise
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        raise
    if rv:
        add_error('%r: returned %d' % (args, rv))

def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    else:
        return name

(BUP_NORMAL, BUP_CHUNKED) = (0, 1)

def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is
    one of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)

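# Round-trip sketch (illustrative, not from the original source): a chunked
# regular file (regular file mode, tree gitmode) gains a '.bup' suffix on
# the way in and loses it again on the way out:
#
#   >>> mangle_name(b'foo', 0o100644, 0o040000)
#   b'foo.bup'
#   >>> demangle_name(b'foo.bup', 0o040000)  # 1 is BUP_CHUNKED
#   (b'foo', 1)
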
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = b'%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()

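# Sanity-check sketch (illustrative): this matches `git hash-object`, so the
# well-known id of the empty blob falls out:
#
#   >>> hexlify(calc_hash(b'blob', b''))
#   b'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
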
def shalist_item_sort_key(ent):
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
        return name + b'/'
    else:
        return name

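# Git sorts tree entries as if directory names ended in '/', so the file
# 'foo.c' sorts before the directory 'foo' ('.' < '/').  Illustrative:
#
#   >>> sorted([(0o040000, b'foo', None), (0o100644, b'foo.c', None)],
#   ...        key=shalist_item_sort_key)[0][1]
#   b'foo.c'
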
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode,name,bin)
        # Compare a one-byte slice, not s[0] (an int on python 3).
        assert s[0:1] != b'0'  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return b''.join(l)


def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        mode, name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)

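# Round-trip sketch (illustrative, not from the original source):
#
#   >>> ent = (0o100644, b'hello.txt', b'\xaa' * 20)
#   >>> list(tree_decode(tree_encode([ent]))) == [ent]
#   True
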
def _encode_packobj(type, content, compression_level=1):
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = b''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += bytes_from_uint(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()


def _encode_looseobj(type, content, compression_level=1):
    z = zlib.compressobj(compression_level)
    yield z.compress(b'%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()


def _decode_looseobj(buf):
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find(b'\0')
    assert(i > 0)
    l = s[:i].split(b' ')
    assert(len(l) == 2)
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)


def _decode_packobj(buf):
    assert(buf)
    c = byte_int(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = byte_int(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))

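# Round-trip sketch (illustrative): the first header byte packs the type in
# bits 4-6 and the low four size bits, with the high bit flagging
# continuation bytes (little-endian base-128 for the rest of the size):
#
#   >>> buf = b''.join(_encode_packobj(b'blob', b'x' * 100))
#   >>> hexlify(buf[:2])  # (100 & 0xf) | (3 << 4) | 0x80, then 100 >> 4
#   b'b406'
#   >>> _decode_packobj(buf) == (b'blob', b'x' * 100)
#   True
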
class PackIdx:
    def __init__(self):
        assert(0)  # don't instantiate PackIdx directly

    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx != None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True
        return None

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        want = hash
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end - start) // 2
            v = self._idx_to_hash(mid)
            if v < want:
                start = mid+1
            elif v > want:
                end = mid
            else: # got it!
                return mid
        return None

class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I';
        # unpack_from reads just the fanout, not the whole (longer) map.
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=0))
        self.fanout.append(0)  # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.map.close()
            self.map = None

class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
        return ofs

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.map.close()
            self.map = None

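# For reference, the v2 .idx layout the reader above walks (and that
# PackIdxV2Writer below produces) is, in order, all big-endian:
#
#   8 bytes         magic b'\377tOc' plus version 2
#   256 * 4 bytes   fanout: fanout[b] = count of object ids whose first
#                   byte is <= b (so sha_ofs = 8 + 256*4)
#   nsha * 20       sorted object ids
#   nsha * 4        crc32 of each object's packed representation
#   nsha * 4        31-bit pack offsets; if the high bit is set, the low
#                   31 bits index the 64-bit table instead
#   n64 * 8         64-bit offsets, for packs larger than 2 GiB
#   20 + 20 bytes   pack sha1 trailer, then idx sha1 trailer
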
_mpi_count = 0

class PackIdxList:
    def __init__(self, dir, ignore_midx=False):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method checks whether any .midx files have been superseded
        (e.g. all of their contents are in another, bigger .midx file) and
        removes the superseded files.

        If skip_midx is True, all work on .midx files will be skipped and
        .midx files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs) != 1 and 'es' or ''))

647 """Insert an additional object in the list."""
651 def open_idx(filename):
652 if filename.endswith(b'.idx'):
653 f = open(filename, 'rb')
655 if header[0:4] == b'\377tOc':
656 version = struct.unpack('!I', header[4:8])[0]
658 return PackIdxV2(filename, f)
660 raise GitError('%s: expected idx file version 2, got %d'
661 % (path_msg(filename), version))
662 elif len(header) == 8 and header[0:4] < b'\377tOc':
663 return PackIdxV1(filename, f)
665 raise GitError('%s: unrecognized idx file header'
666 % path_msg(filename))
667 elif filename.endswith(b'.midx'):
668 return midx.PackMidx(filename)
670 raise GitError('idx filenames must end with .idx or .midx')
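# Illustrative use (hypothetical path): open one index and test membership
# with a 20-byte binary object id:
#
#   ix = open_idx(b'/path/to/repo/objects/pack/pack-1234.idx')
#   if ix.exists(unhexlify(b'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391')):
#       print('empty blob is in this pack')
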
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)

def _make_objcache():
    return PackIdxList(repo(b'objects/pack'))

# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.
class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.repo_dir = repo_dir or repo()
        self.file = None
        self.parentfd = None
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.idx = None
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level
        self.run_midx = run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir)
            if max_pack_size is not None:
                max_pack_size = parse_num(max_pack_size)
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def _open(self):
        if not self.file:
            objdir = dir = os.path.join(self.repo_dir, b'objects')
            fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                os.close(fd)
                raise
            try:
                self.parentfd = os.open(objdir, os.O_RDONLY)
            except:
                f = self.file
                self.file = None
                f.close()
                raise
            assert name.endswith(b'.pack')
            self.filename = name[:-5]
            self.file.write(b'PACK\0\0\0\2\0\0\0\0')
            self.idx = PackIdxV2Writer()

    def _raw_write(self, datalist, sha):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            reraise(GitError(e))
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        assert(sha)
        if self.idx:
            self.idx.add(sha, crc, self.file.tell() - size)

    def _write(self, sha, type, content):
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def just_write(self, sha, type, content):
        """Write an object to the pack file without checking for duplication."""
        self._write(sha, type, content)
        # If nothing else, gc doesn't have/want an objcache
        if self.objcache is not None:
            self.objcache.add(sha)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write(b'blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write(b'tree', content)

    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
                   msg):
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        if adate_tz is not None:
            adate_str = _git_date_str(adate_sec, adate_tz)
        else:
            adate_str = _local_git_date_str(adate_sec)
        if cdate_tz is not None:
            cdate_str = _git_date_str(cdate_sec, cdate_tz)
        else:
            cdate_str = _local_git_date_str(cdate_sec)
        l = []
        if tree: l.append(b'tree %s' % hexlify(tree))
        if parent: l.append(b'parent %s' % hexlify(parent))
        if author: l.append(b'author %s %s' % (author, adate_str))
        if committer: l.append(b'committer %s %s' % (committer, cdate_str))
        l.append(b'')
        l.append(msg)
        return self.maybe_write(b'commit', b'\n'.join(l))

855 """Remove the pack file from disk."""
864 os.unlink(self.filename + b'.pack')
871 def _end(self, run_midx=True):
873 if not f: return None
880 # update object count
882 cp = struct.pack('!i', self.count)
886 # calculate the pack sha1sum
889 for b in chunkyreader(f):
891 packbin = sum.digest()
893 fdatasync(f.fileno())
897 obj_list_sha = idx.write(self.filename + b'.idx', packbin)
898 nameprefix = os.path.join(self.repo_dir,
899 b'objects/pack/pack-' + obj_list_sha)
900 if os.path.exists(self.filename + b'.map'):
901 os.unlink(self.filename + b'.map')
902 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
903 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
905 os.fsync(self.parentfd)
907 os.close(self.parentfd)
910 auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
912 if self.on_pack_finish:
913 self.on_pack_finish(nameprefix)
917 def close(self, run_midx=True):
918 """Close the pack file and move it to its definitive path."""
919 return self._end(run_midx=run_midx)
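# Illustrative (hypothetical) use, assuming an initialized repository:
#
#   with PackWriter() as w:
#       blob_id = w.new_blob(b'hello world\n')
#       tree_id = w.new_tree([(0o100644, b'hello.txt', blob_id)])
#   # __exit__ calls close(), which renames the temporary pack into
#   # objects/pack/pack-<sha>.pack and writes the matching .idx.
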
class PackIdxV2Writer:
    def __init__(self):
        self.idx = list(list() for i in range(256))
        self.count = 0

    def add(self, sha, crc, offs):
        assert(sha)
        self.count += 1
        self.idx[byte_int(sha[0])].append((sha, crc, offs))

    def write(self, filename, packbin):
        ofs64_count = 0
        for section in self.idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, self.idx,
                                           self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20 * self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = hexlify(obj_list_sum.digest())

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()

def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.

    """
    argv = [b'git', b'show-ref']
    if limit_to_heads:
        argv.append(b'--heads')
    if limit_to_tags:
        argv.append(b'--tags')
    argv.append(b'--')
    if patterns:
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split(b'\n'):
            sha, name = d.split(b' ', 1)
            yield name, unhexlify(sha)

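# Illustrative use: list every branch head and its commit id:
#
#   for name, sha in list_refs(limit_to_heads=True):
#       print(path_msg(name), hexlify(sha))  # e.g. refs/heads/main ...
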
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    return None


def rev_list_invocation(ref_or_refs, format=None):
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    argv = [b'git', b'rev-list']
    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    argv.append(b'--')
    return argv

def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.

    """
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                                             format=format),
                         env=_gitenv(repo_dir),
                         stdout = subprocess.PIPE)
    if not format:
        for line in p.stdout:
            yield line.strip()
    else:
        line = p.stdout.readline()
        while line:
            s = line.strip()
            if not s.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(s))
            s = s[7:]
            assert len(s) == 40
            yield s, parse(p.stdout)
            line = p.stdout.readline()

    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)

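# Illustrative use (hypothetical ref): fetch each commit's hex id along with
# its parsed author epoch, via the %at pretty-format placeholder:
#
#   def parse_author_sec(stream):
#       return int(stream.readline().strip())
#
#   for oidx, author_sec in rev_list(b'refs/heads/main', format=b'%at',
#                                    parse=parse_author_sec):
#       print(oidx, author_sec)
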
def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs. For now, every unique
    string in refs must resolve to a different commit or this
    function will fail."""
    result = []
    for ref in refs:
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
    return result


def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            hash = unhexlify(committish)
        except TypeError:
            return None

        if pL.exists(hash):
            return hash

    return None

def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    if not oldval:
        oldval = b''
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir))
    _git_wait(b'git update-ref', p)


def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
                         env=_gitenv())
    _git_wait('git update-ref', p)

def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but does not fail if
    one doesn't exist. Usually, if you are interacting with a bup repository,
    you would not be calling this function but using check_repo_or_die()."""
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = environ.get(b'BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser(b'~/.bup')


def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv())
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)

def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % top)
            sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)


def is_suitable_git(ver_str):
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    sys.exit(13)

_git_great = None

def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.

    """
    global _git_great
    if _git_great is not None:
        return
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    if status == 'suitable':
        _git_great = True
        return
    assert False

class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.it)
        except StopIteration as e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    next = __next__

    def abort(self):
        """Abort iteration and call the abort callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()

1267 """Link to 'git cat-file' that is used to retrieve blob data."""
1268 def __init__(self, repo_dir = None):
1269 require_suitable_git()
1270 self.repo_dir = repo_dir
1271 self.p = self.inprogress = None
1275 self.p.stdout.close()
1276 self.p.stdin.close()
1278 self.inprogress = None
1282 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1283 stdin=subprocess.PIPE,
1284 stdout=subprocess.PIPE,
1287 env=_gitenv(self.repo_dir))
1290 """Yield (oidx, type, size), followed by the data referred to by ref.
1291 If ref does not exist, only yield (None, None, None).
1294 if not self.p or self.p.poll() != None:
1297 poll_result = self.p.poll()
1298 assert(poll_result == None)
1300 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1301 assert(not self.inprogress)
1302 assert ref.find(b'\n') < 0
1303 assert ref.find(b'\r') < 0
1304 assert not ref.startswith(b'-')
1305 self.inprogress = ref
1306 self.p.stdin.write(ref + b'\n')
1307 self.p.stdin.flush()
1308 hdr = self.p.stdout.readline()
1309 if hdr.endswith(b' missing\n'):
1310 self.inprogress = None
1311 yield None, None, None
1313 info = hdr.split(b' ')
1314 if len(info) != 3 or len(info[0]) != 40:
1315 raise GitError('expected object (id, type, size), got %r' % info)
1316 oidx, typ, size = info
1318 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1319 onabort=self._abort)
1321 yield oidx, typ, size
1324 readline_result = self.p.stdout.readline()
1325 assert readline_result == b'\n'
1326 self.inprogress = None
1327 except Exception as e:
1331 def _join(self, it):
1332 _, typ, _ = next(it)
1336 elif typ == b'tree':
1337 treefile = b''.join(it)
1338 for (mode, name, sha) in tree_decode(treefile):
1339 for blob in self.join(hexlify(sha)):
1341 elif typ == b'commit':
1342 treeline = b''.join(it).split(b'\n')[0]
1343 assert treeline.startswith(b'tree ')
1344 for blob in self.join(treeline[5:]):
1347 raise GitError('invalid object type %r: expected blob/tree/commit'
1351 """Generate a list of the content of all blobs that can be reached
1352 from an object. The hash given in 'id' must point to a blob, a tree
1353 or a commit. The content of all blobs that can be seen from trees or
1354 commits will be added to the list.
1356 for d in self._join(self.get(id)):
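# Illustrative use: the header-then-data protocol of get() is the same one
# get_cat_data() above relies on:
#
#   catpipe = CatPipe()
#   it = catpipe.get(b'HEAD')
#   oidx, typ, size = next(it)  # header first...
#   data = b''.join(it)         # ...then the object's bytes
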
_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp, repodir
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp


def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[len(b'refs/tags/'):]
        if not c in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags

class MissingObject(KeyError):
    def __init__(self, oid):
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))


WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
# chunked file will have a chunk_path of ['']. So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...

def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true. Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))

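# Illustrative traversal (hypothetical starting ref): print every reachable
# object's id and type without loading blob contents:
#
#   catpipe = cp()
#   start = rev_parse(b'refs/heads/main')  # a 20-byte binary oid
#   for item in walk_object(catpipe.get, hexlify(start)):
#       print(hexlify(item.oid).decode(), item.type.decode())
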