1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
from __future__ import absolute_import, print_function
import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
from array import array
from binascii import hexlify, unhexlify
from collections import namedtuple
from itertools import islice
from numbers import Integral

from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
from bup.compat import (buffer,
                        byte_int, bytes_from_byte, bytes_from_uint,
                        environ,
                        items,
                        range,
                        reraise)
from bup.io import path_msg
from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
                         fdatasync,
                         hostname, localtime, log,
                         merge_dict,
                         merge_iter,
                         mmap_read, mmap_readwrite,
                         parse_num,
                         progress, qprogress, stat_if_exists,
                         unlink,
                         utc_offset_str)
from bup.pwdgrp import username, userfullname

repodir = None  # The default repository, once initialized

_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}

_total_searches = 0
_total_steps = 0


class GitError(Exception):
    pass

def _gitenv(repo_dir=None):
    if not repo_dir:
        repo_dir = repo()
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})

def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%r returned %d' % (cmd, rv))

def _git_capture(argv):
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, env=_gitenv())
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r

def git_config_get(option, repo_dir=None):
    cmd = (b'git', b'config', b'--get', option)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         env=_gitenv(repo_dir=repo_dir))
    r = p.stdout.read()
    rc = p.wait()
    if rc == 0:
        return r
    elif rc != 1:
        raise GitError('%r returned %d' % (cmd, rc))
    return None

def parse_tz_offset(s):
    """UTC offset in seconds."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        return - tz_off
    return tz_off
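
# For example, given the format above, parse_tz_offset(b'-0500')
# returns -18000 (five hours west of UTC), and parse_tz_offset(b'+0130')
# returns 5400.
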
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])

def parse_commit(content):
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
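
# A minimal sketch of the input parse_commit() handles -- header fields,
# a blank line, then the message (the tree id here is the well-known
# empty-tree hash; names and times are made up):
#
#   tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904
#   author A U Thor <author@example.com> 1234567890 -0500
#   committer C O Mitter <committer@example.com> 1234567890 -0500
#
#   initial commit
#
# parses to author_sec=1234567890, author_offset=-18000, parents=[], and
# message=b'initial commit\n'.
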
def get_cat_data(cat_iterator, expected_type):
    _, kind, _ = next(cat_iterator)
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    return b''.join(cat_iterator)

def get_commit_items(id, cp):
    return parse_commit(get_cat_data(cp.get(id), b'commit'))


def _local_git_date_str(epoch_sec):
    return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))

def _git_date_str(epoch_sec, tz_offset_sec):
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        % (epoch_sec,
           b'+' if offs >= 0 else b'-',
           abs(offs) // 60,
           abs(offs) % 60)
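
# For example, _git_date_str(1234567890, -18000) produces
# b'1234567890 -0500', the inverse of what parse_tz_offset() accepts.
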
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
        repo_dir = gd

    return os.path.join(repo_dir, sub)

_shorten_hash_rx = \
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')

def shorten_hash(s):
    return _shorten_hash_rx.sub(br'\1\2*\3', s)

def repo_rel(path):
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
        fullrepo += b'/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)

def all_packdirs():
    paths = [repo(b'objects/pack')]
    paths += glob.glob(repo(b'index-cache/*/.'))
    return paths

def auto_midx(objdir):
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 99
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = 99
    if rv:
        add_error('%r: returned %d' % (args, rv))

def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    else:
        return name

(BUP_NORMAL, BUP_CHUNKED) = (0, 1)

def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is
    one of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name().
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    else:
        return (name, BUP_NORMAL)
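
# Round-trip sketch: mangle_name(b'foo', 0o100644, 0o040000) yields
# b'foo.bup' (a regular file stored as a chunked tree), and
# demangle_name(b'foo.bup', 0o040000) yields (b'foo', BUP_CHUNKED);
# a file literally named b'foo.bup' is stored as b'foo.bup.bupl'.
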
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = b'%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
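
# For example, calc_hash(b'blob', b'') hashes b'blob 0\0' and returns the
# well-known empty-blob id,
# unhexlify(b'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391').
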
def shalist_item_sort_key(ent):
    (mode, name, id) = ent
    assert(mode+0 == mode)
    # Git sorts tree entries as if directory names ended with a slash.
    if stat.S_ISDIR(mode):
        return name + b'/'
    else:
        return name

def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode,name,bin)
        # Compare a one-byte slice: under Python 3 s[0] is an int, so
        # "s[0] != b'0'" would always be true and never catch anything.
        assert s[0:1] != b'0'  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return b''.join(l)

def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        mode, name = spl
        sha = buf[z+1:z+1+20]
        ofs = z + 1 + 20
        yield (int(mode, 8), name, sha)
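
# Round-trip sketch: a tree with one blob entry encodes as
# b'100644 foo\0' + <20-byte sha>, and tree_decode() yields it back as
# (0o100644, b'foo', <20-byte sha>).
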
def _encode_packobj(type, content, compression_level=1):
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = b''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += bytes_from_uint(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()

def _encode_looseobj(type, content, compression_level=1):
    z = zlib.compressobj(compression_level)
    yield z.compress(b'%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()

def _decode_looseobj(buf):
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find(b'\0')
    assert(i > 0)
    l = s[:i].split(b' ')
    assert(len(l) == 2)
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)

def _decode_packobj(buf):
    assert(buf)
    c = byte_int(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = byte_int(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
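
# Worked example of the header encoding above: a blob (_typemap[b'blob']
# == 3) of 100 bytes has low nibble 100 & 0x0f == 4, so the first byte is
# 0x80 | (3 << 4) | 4 == 0xb4 (high bit set: more size bytes follow), and
# the second byte is 100 >> 4 == 6, giving the header b'\xb4\x06'.
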
class PackIdx:
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx is not None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) is not None):
            return want_source and os.path.basename(self.name) or True
        return None

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1]  # range -1..254
        end = self.fanout[b1]      # range 0..255
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end - start) // 2
            v = self._idx_to_hash(mid)
            if v < hash:
                start = mid + 1
            elif v > hash:
                end = mid
            else:  # got it!
                return mid
        return None

class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=0))
        self.fanout.append(0)  # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)  # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        if ofs & 0x80000000:
            # High bit set: the real offset is in the 64-bit overflow table.
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
        return ofs

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]
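
# A minimal usage sketch (the pack path is hypothetical): version 1 and
# version 2 indexes expose the same interface, so callers can do
#
#   ix = open_idx(b'/repo/objects/pack/pack-deadbeef.idx')
#   ofs = ix.find_offset(some_20_byte_sha)   # None if absent
#
# and iterate every sha with "for sha in ix: ...".
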
_mpi_count = 0

class PackIdxList:
    def __init__(self, dir, ignore_midx=False):
        global _mpi_count
        assert(_mpi_count == 0)  # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of
        their contents are in another, bigger .midx file) and removes the
        superseded files.

        If skip_midx is True, all work on .midx files will be skipped and
        .midx files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None  # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in glob.glob(os.path.join(self.dir, b'*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        unlink(ix.name)
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs) != 1 and 'es' or ''))

607 """Insert an additional object in the list."""
def open_idx(filename):
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')

def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)

def _make_objcache():
    return PackIdxList(repo(b'objects/pack'))

# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.

class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.repo_dir = repo_dir or repo()
        self.file = None
        self.parentfd = None
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level
        self.run_midx = run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir)
            if max_pack_size is not None:
                max_pack_size = parse_num(max_pack_size)
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)
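        # With the defaults above, a 1 GB max_pack_size caps a pack at
        # 1000**3 // 5000 == 200000 objects, i.e. about 17 MB of cache at
        # the ~83 bytes per object noted above.
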
    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def _open(self):
        if not self.file:
            objdir = dir = os.path.join(self.repo_dir, b'objects')
            fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                os.close(fd)
                raise
            try:
                self.parentfd = os.open(objdir, os.O_RDONLY)
            except:
                f = self.file
                self.file = None
                f.close()
                raise
            assert name.endswith(b'.pack')
            self.filename = name[:-5]
            self.file.write(b'PACK\0\0\0\2\0\0\0\0')
            self.idx = list(list() for i in range(256))

    def _raw_write(self, datalist, sha):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            reraise(GitError(e))
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        assert(sha)
        self.idx[byte_int(sha[0])].append((sha, crc,
                                           self.file.tell() - size))

    def _write(self, sha, type, content):
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def just_write(self, sha, type, content):
        """Write an object to the pack file without checking for duplication."""
        self._write(sha, type, content)
        # If nothing else, gc doesn't have/want an objcache
        if self.objcache is not None:
            self.objcache.add(sha)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write(b'blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write(b'tree', content)

    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
                   msg):
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        if adate_tz is not None:
            adate_str = _git_date_str(adate_sec, adate_tz)
        else:
            adate_str = _local_git_date_str(adate_sec)
        if cdate_tz is not None:
            cdate_str = _git_date_str(cdate_sec, cdate_tz)
        else:
            cdate_str = _local_git_date_str(cdate_sec)
        l = []
        if tree: l.append(b'tree %s' % hexlify(tree))
        if parent: l.append(b'parent %s' % hexlify(parent))
        if author: l.append(b'author %s %s' % (author, adate_str))
        if committer: l.append(b'committer %s %s' % (committer, cdate_str))
        l.append(b'')
        l.append(msg)
        return self.maybe_write(b'commit', b'\n'.join(l))
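
    # A minimal usage sketch (assumes an initialized repository; the data
    # is made up):
    #
    #   with PackWriter() as w:
    #       blob_id = w.new_blob(b'file contents')
    #       tree_id = w.new_tree([(0o100644, b'file', blob_id)])
    #
    # close() (invoked by __exit__) renames the finished pack into
    # objects/pack/ and returns the new name prefix.
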
816 """Remove the pack file from disk."""
825 os.unlink(self.filename + b'.pack')
    def _end(self, run_midx=True):
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + b'.idx', idx,
                                               packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' + obj_list_sha)
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

    def _write_pack_idx_v2(self, filename, idx, packbin):
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
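        # (28 bytes per object: a 20-byte sha, a 4-byte CRC, and a 4-byte
        # offset; any object at an offset of 2**31 or more also gets an
        # 8-byte entry in the overflow table counted by ofs64_count.)
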
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, idx, self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = hexlify(obj_list_sum.digest())

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()

def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.

    """
    argv = [b'git', b'show-ref']
    if limit_to_heads:
        argv.append(b'--heads')
    if limit_to_tags:
        argv.append(b'--tags')
    argv.append(b'--')
    if patterns:
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split(b'\n'):
            sha, name = d.split(b' ', 1)
            yield name, unhexlify(sha)
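
# For example, in a repository with a single branch this might yield
# (b'refs/heads/master', <20-byte binary sha>); note that the hash is
# binary, not hex.
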
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    return None

def rev_list_invocation(ref_or_refs, count=None, format=None):
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    argv = [b'git', b'rev-list']
    if isinstance(count, Integral):
        argv.extend([b'-n', b'%d' % count])
    elif count:
        raise ValueError('unexpected count argument %r' % count)
    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    argv.append(b'--')
    return argv

def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.

    """
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
                                             format=format),
                         env=_gitenv(repo_dir),
                         stdout = subprocess.PIPE)
    if not format:
        for line in p.stdout:
            yield line.strip()
    else:
        line = p.stdout.readline()
        while line:
            s = line.strip()
            if not s.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(s))
            s = s[7:]
            assert len(s) == 40
            yield s, parse(p.stdout)
            line = p.stdout.readline()

    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)

def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs. For now, every unique
    string in refs must resolve to a different commit or this
    function will fail."""
    result = []
    for ref in refs:
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
    return result

def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            hash = unhexlify(committish)
        except TypeError:
            return None
        if pL.exists(hash):
            return hash

    return None

def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    if not oldval:
        oldval = b''
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir))
    _git_wait(b'git update-ref', p)

def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
                         env=_gitenv())
    _git_wait('git update-ref', p)

def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = environ.get(b'BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser(b'~/.bup')

def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv())
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)

def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % top)
            sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)

1140 """Get Git's version and ensure a usable version is installed.
1142 The returned version is formatted as an ordered tuple with each position
1143 representing a digit in the version tag. For example, the following tuple
1144 would represent version 1.6.6.9:
1150 p = subprocess.Popen([b'git', b'--version'], stdout=subprocess.PIPE)
1151 gvs = p.stdout.read()
1152 _git_wait('git --version', p)
1153 m = re.match(br'git version (\S+.\S+)', gvs)
1155 raise GitError('git --version weird output: %r' % gvs)
1156 _ver = tuple(int(x) for x in m.group(1).split(b'.'))
1157 needed = (1, 5, 3, 1)
1159 raise GitError('git version %s or higher is required; you have %s'
1160 % ('.'.join(str(x) for x in needed),
1161 '.'.join(str(x) for x in _ver)))
class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.it)
        except StopIteration as e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    next = __next__

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()

1198 """Link to 'git cat-file' that is used to retrieve blob data."""
1199 def __init__(self, repo_dir = None):
1200 self.repo_dir = repo_dir
1203 log('error: git version must be at least 1.5.6\n')
1205 self.p = self.inprogress = None
1209 self.p.stdout.close()
1210 self.p.stdin.close()
1212 self.inprogress = None
1216 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1217 stdin=subprocess.PIPE,
1218 stdout=subprocess.PIPE,
1221 env=_gitenv(self.repo_dir))
1224 """Yield (oidx, type, size), followed by the data referred to by ref.
1225 If ref does not exist, only yield (None, None, None).
1228 if not self.p or self.p.poll() != None:
1231 poll_result = self.p.poll()
1232 assert(poll_result == None)
1234 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1235 assert(not self.inprogress)
1236 assert ref.find(b'\n') < 0
1237 assert ref.find(b'\r') < 0
1238 assert not ref.startswith(b'-')
1239 self.inprogress = ref
1240 self.p.stdin.write(ref + b'\n')
1241 self.p.stdin.flush()
1242 hdr = self.p.stdout.readline()
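        # cat-file --batch answers either b'<ref> missing\n' or a header
        # of the form b'<40-hex-oid> <type> <size>\n', followed by <size>
        # bytes of object data and a trailing newline.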
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self._abort)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception as e:
            it.abort()
            raise

    def _join(self, it):
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

1285 """Generate a list of the content of all blobs that can be reached
1286 from an object. The hash given in 'id' must point to a blob, a tree
1287 or a commit. The content of all blobs that can be seen from trees or
1288 commits will be added to the list.
1291 for d in self._join(self.get(id)):
1293 except StopIteration:
_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp, repodir
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp

def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[len(b'refs/tags/'):]
        if not c in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags

class MissingObject(KeyError):
    def __init__(self, oid):
        self.oid = oid
        # hexlify() works on both Python 2 and 3; oid.encode('hex') was
        # Python 2 only.
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))

WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
# chunked file will have a chunk_path of ['']. So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...

def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true. Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(oidx, [], [], None)]
    while len(pending):
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))
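
# A minimal usage sketch (the branch name is hypothetical): walk everything
# reachable from a branch tip without reading blob payloads.  walk_object()
# takes a hex oid, while read_ref() returns a binary one, hence hexlify():
#
#   for item in walk_object(cp().get, hexlify(read_ref(b'refs/heads/main')),
#                           include_data=None):
#       print(item.type, b'/'.join(item.path))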