1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
12 from numbers import Integral
14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
15 from bup.compat import (buffer,
16 byte_int, bytes_from_byte, bytes_from_uint,
21 from bup.io import path_msg
22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
24 hostname, localtime, log,
27 mmap_read, mmap_readwrite,
29 progress, qprogress, stat_if_exists,
32 from bup.pwdgrp import username, userfullname
36 repodir = None # The default repository, once initialized
38 _typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
39 _typermap = {v: k for k, v in items(_typemap)}
46 class GitError(Exception):
50 def _gitenv(repo_dir=None):
53 return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
55 def _git_wait(cmd, p):
58 raise GitError('%r returned %d' % (cmd, rv))
60 def git_config_get(option, repo_dir=None):
61 cmd = (b'git', b'config', b'--get', option)
62 p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
63 env=_gitenv(repo_dir=repo_dir))
69 raise GitError('%r returned %d' % (cmd, rc))
73 def parse_tz_offset(s):
74 """UTC offset in seconds."""
75 tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
76 if bytes_from_byte(s[0]) == b'-':
81 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
82 # Make sure that's authoritative.
83 _start_end_char = br'[^ .,:;<>"\'\0\n]'
84 _content_char = br'[^\0\n<>]'
85 _safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
87 _start_end_char, _content_char, _start_end_char)
88 _tz_rx = br'[-+]\d\d[0-5]\d'
89 _parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
90 # Assumes every following line starting with a space is part of the
91 # mergetag. Is there a formal commit blob spec?
92 _mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
93 _commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
94 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
95 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
97 (?P<message>(?:.|\n)*)''' % (_parent_rx,
98 _safe_str_rx, _safe_str_rx, _tz_rx,
99 _safe_str_rx, _safe_str_rx, _tz_rx,
101 _parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
103 # Note that the author_sec and committer_sec values are (UTC) epoch
104 # seconds, and for now the mergetag is not included.
105 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
106 'author_name', 'author_mail',
107 'author_sec', 'author_offset',
108 'committer_name', 'committer_mail',
109 'committer_sec', 'committer_offset',
112 def parse_commit(content):
113 commit_match = re.match(_commit_rx, content)
115 raise Exception('cannot parse commit %r' % content)
116 matches = commit_match.groupdict()
117 return CommitInfo(tree=matches['tree'],
118 parents=re.findall(_parent_hash_rx, matches['parents']),
119 author_name=matches['author_name'],
120 author_mail=matches['author_mail'],
121 author_sec=int(matches['asec']),
122 author_offset=parse_tz_offset(matches['atz']),
123 committer_name=matches['committer_name'],
124 committer_mail=matches['committer_mail'],
125 committer_sec=int(matches['csec']),
126 committer_offset=parse_tz_offset(matches['ctz']),
127 message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume the (oidx, kind, size) header from cat_iterator, verify
    that kind matches expected_type, and return the remaining chunks
    joined into a single bytes object."""
    _, kind, _ = next(cat_iterator)
    if kind == expected_type:
        return b''.join(cat_iterator)
    raise Exception('expected %r, saw %r' % (expected_type, kind))
def get_commit_items(id, cp):
    """Fetch the commit named by id through cat pipe cp and return it
    parsed as a CommitInfo tuple."""
    data = get_cat_data(cp.get(id), b'commit')
    return parse_commit(data)
def _local_git_date_str(epoch_sec):
    """Format epoch_sec as b'<seconds> <offset>' (git date format),
    using the local timezone's UTC offset for that moment."""
    offset = utc_offset_str(epoch_sec)
    return b'%d %s' % (epoch_sec, offset)
143 def _git_date_str(epoch_sec, tz_offset_sec):
144 offs = tz_offset_sec // 60
145 return b'%d %s%02d%02d' \
147 b'+' if offs >= 0 else b'-',
152 def repo(sub = b'', repo_dir=None):
153 """Get the path to the git repository or one of its subdirectories."""
154 repo_dir = repo_dir or repodir
156 raise GitError('You should call check_repo_or_die()')
158 # If there's a .git subdirectory, then the actual repo is in there.
159 gd = os.path.join(repo_dir, b'.git')
160 if os.path.exists(gd):
163 return os.path.join(repo_dir, sub)
167 re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
170 return _shorten_hash_rx.sub(br'\1\2*\3', s)
174 full = os.path.abspath(path)
175 fullrepo = os.path.abspath(repo(b''))
176 if not fullrepo.endswith(b'/'):
178 if full.startswith(fullrepo):
179 path = full[len(fullrepo):]
180 if path.startswith(b'index-cache/'):
181 path = path[len(b'index-cache/'):]
182 return shorten_hash(path)
186 paths = [repo(b'objects/pack')]
187 paths += glob.glob(repo(b'index-cache/*/.'))
191 def auto_midx(objdir):
192 args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
194 rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
196 # make sure 'args' gets printed to help with debugging
197 add_error('%r: exception: %s' % (args, e))
200 add_error('%r: returned %d' % (args, rv))
202 args = [path.exe(), b'bloom', b'--dir', objdir]
204 rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
206 # make sure 'args' gets printed to help with debugging
207 add_error('%r: exception: %s' % (args, e))
210 add_error('%r: returned %d' % (args, rv))
213 def mangle_name(name, mode, gitmode):
214 """Mangle a file name to present an abstract name for segmented files.
215 Mangled file names will have the ".bup" extension added to them. If a
216 file's name already ends with ".bup", a ".bupl" extension is added to
217 disambiguate normal files from segmented ones.
219 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
220 assert(stat.S_ISDIR(gitmode))
221 return name + b'.bup'
222 elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
223 return name + b'.bupl'
228 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
229 def demangle_name(name, mode):
230 """Remove name mangling from a file name, if necessary.
232 The return value is a tuple (demangled_filename,mode), where mode is one of
235 * BUP_NORMAL : files that should be read as-is from the repository
236 * BUP_CHUNKED : files that were chunked and need to be reassembled
238 For more information on the name mangling algorithm, see mangle_name()
240 if name.endswith(b'.bupl'):
241 return (name[:-5], BUP_NORMAL)
242 elif name.endswith(b'.bup'):
243 return (name[:-4], BUP_CHUNKED)
244 elif name.endswith(b'.bupm'):
246 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
248 return (name, BUP_NORMAL)
251 def calc_hash(type, content):
252 """Calculate some content's hash in the Git fashion."""
253 header = b'%s %d\0' % (type, len(content))
259 def shalist_item_sort_key(ent):
260 (mode, name, id) = ent
261 assert(mode+0 == mode)
262 if stat.S_ISDIR(mode):
268 def tree_encode(shalist):
269 """Generate a git tree object from (mode,name,hash) tuples."""
270 shalist = sorted(shalist, key = shalist_item_sort_key)
272 for (mode,name,bin) in shalist:
274 assert(mode+0 == mode)
276 assert(len(bin) == 20)
277 s = b'%o %s\0%s' % (mode,name,bin)
278 assert s[0] != b'0' # 0-padded octal is not acceptable in a git tree
283 def tree_decode(buf):
284 """Generate a list of (mode,name,hash) from the git tree object in buf."""
286 while ofs < len(buf):
287 z = buf.find(b'\0', ofs)
289 spl = buf[ofs:z].split(b' ', 1)
290 assert(len(spl) == 2)
292 sha = buf[z+1:z+1+20]
294 yield (int(mode, 8), name, sha)
297 def _encode_packobj(type, content, compression_level=1):
298 if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
299 raise ValueError('invalid compression level %s' % compression_level)
302 szbits = (sz & 0x0f) | (_typemap[type]<<4)
305 if sz: szbits |= 0x80
306 szout += bytes_from_uint(szbits)
311 z = zlib.compressobj(compression_level)
313 yield z.compress(content)
317 def _encode_looseobj(type, content, compression_level=1):
318 z = zlib.compressobj(compression_level)
319 yield z.compress(b'%s %d\0' % (type, len(content)))
320 yield z.compress(content)
324 def _decode_looseobj(buf):
326 s = zlib.decompress(buf)
329 l = s[:i].split(b' ')
333 assert(type in _typemap)
334 assert(sz == len(content))
335 return (type, content)
338 def _decode_packobj(buf):
341 type = _typermap[(c & 0x70) >> 4]
348 sz |= (c & 0x7f) << shift
352 return (type, zlib.decompress(buf[i+1:]))
359 def find_offset(self, hash):
360 """Get the offset of an object inside the index file."""
361 idx = self._idx_from_hash(hash)
363 return self._ofs_from_idx(idx)
366 def exists(self, hash, want_source=False):
367 """Return nonempty if the object exists in this index."""
368 if hash and (self._idx_from_hash(hash) != None):
369 return want_source and os.path.basename(self.name) or True
372 def _idx_from_hash(self, hash):
373 global _total_searches, _total_steps
375 assert(len(hash) == 20)
376 b1 = byte_int(hash[0])
377 start = self.fanout[b1-1] # range -1..254
378 end = self.fanout[b1] # range 0..255
380 _total_steps += 1 # lookup table is a step
383 mid = start + (end - start) // 2
384 v = self._idx_to_hash(mid)
394 class PackIdxV1(PackIdx):
395 """Object representation of a Git pack index (version 1) file."""
396 def __init__(self, filename, f):
398 self.idxnames = [self.name]
399 self.map = mmap_read(f)
400 # Min size for 'L' is 4, which is sufficient for struct's '!I'
401 self.fanout = array('L', struct.unpack('!256I', self.map))
402 self.fanout.append(0) # entry "-1"
403 self.nsha = self.fanout[255]
404 self.sha_ofs = 256 * 4
405 # Avoid slicing shatable for individual hashes (very high overhead)
406 self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)
409 return int(self.nsha) # int() from long for python 2
def _ofs_from_idx(self, idx):
    """Return the pack file offset recorded for index table entry idx.

    Raises IndexError if idx is out of range.
    """
    if not 0 <= idx < self.nsha:
        raise IndexError('invalid pack index index %d' % idx)
    # A v1 index entry is 24 bytes: a 4-byte big-endian offset
    # followed by the 20-byte sha.
    entry_ofs = self.sha_ofs + idx * 24
    return struct.unpack_from('!I', self.map, offset=entry_ofs)[0]
def _idx_to_hash(self, idx):
    """Return the 20-byte binary sha stored at index table entry idx.

    Raises IndexError if idx is out of range.
    """
    if not 0 <= idx < self.nsha:
        raise IndexError('invalid pack index index %d' % idx)
    # Skip the 4-byte offset that precedes each sha in a v1 entry.
    sha_start = self.sha_ofs + idx * 24 + 4
    return self.map[sha_start : sha_start + 20]
424 start = self.sha_ofs + 4
425 for ofs in range(start, start + 24 * self.nsha, 24):
426 yield self.map[ofs : ofs + 20]
429 class PackIdxV2(PackIdx):
430 """Object representation of a Git pack index (version 2) file."""
431 def __init__(self, filename, f):
433 self.idxnames = [self.name]
434 self.map = mmap_read(f)
435 assert self.map[0:8] == b'\377tOc\0\0\0\2'
436 # Min size for 'L' is 4, which is sufficient for struct's '!I'
437 self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
438 self.fanout.append(0)
439 self.nsha = self.fanout[255]
440 self.sha_ofs = 8 + 256*4
441 self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
442 self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
443 # Avoid slicing this for individual hashes (very high overhead)
444 self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)
447 return int(self.nsha) # int() from long for python 2
449 def _ofs_from_idx(self, idx):
450 if idx >= self.nsha or idx < 0:
451 raise IndexError('invalid pack index index %d' % idx)
452 ofs_ofs = self.ofstable_ofs + idx * 4
453 ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
455 idx64 = ofs & 0x7fffffff
456 ofs64_ofs = self.ofs64table_ofs + idx64 * 8
457 ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
def _idx_to_hash(self, idx):
    """Return the 20-byte binary sha stored at index table entry idx.

    Raises IndexError if idx is out of range.
    """
    if not 0 <= idx < self.nsha:
        raise IndexError('invalid pack index index %d' % idx)
    # The v2 sha table is a dense array of 20-byte entries.
    sha_start = self.sha_ofs + idx * 20
    return self.map[sha_start : sha_start + 20]
468 for ofs in range(start, start + 20 * self.nsha, 20):
469 yield self.map[ofs : ofs + 20]
474 def __init__(self, dir, ignore_midx=False):
476 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
481 self.do_bloom = False
483 self.ignore_midx = ignore_midx
489 assert(_mpi_count == 0)
492 return iter(idxmerge(self.packs))
495 return sum(len(pack) for pack in self.packs)
497 def exists(self, hash, want_source=False):
498 """Return nonempty if the object exists in the index files."""
499 global _total_searches
501 if hash in self.also:
503 if self.do_bloom and self.bloom:
504 if self.bloom.exists(hash):
505 self.do_bloom = False
507 _total_searches -= 1 # was counted by bloom
509 for i in range(len(self.packs)):
511 _total_searches -= 1 # will be incremented by sub-pack
512 ix = p.exists(hash, want_source=want_source)
514 # reorder so most recently used packs are searched first
515 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
520 def refresh(self, skip_midx = False):
521 """Refresh the index list.
522 This method verifies if .midx files were superseded (e.g. all of its
523 contents are in another, bigger .midx file) and removes the superseded
526 If skip_midx is True, all work on .midx files will be skipped and .midx
527 files will be removed from the list.
529 The instance variable 'ignore_midx' can force this function to
530 always act as if skip_midx was True.
532 if self.bloom is not None:
self.bloom = None # Always reopen the bloom as it may have been replaced
535 self.do_bloom = False
536 skip_midx = skip_midx or self.ignore_midx
537 d = dict((p.name, p) for p in self.packs
538 if not skip_midx or not isinstance(p, midx.PackMidx))
539 if os.path.exists(self.dir):
542 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
543 # remove any *.midx files from our list that no longer exist
544 for ix in list(d.values()):
545 if not isinstance(ix, midx.PackMidx):
547 if ix.name in midxes:
552 self.packs.remove(ix)
553 for ix in self.packs:
554 if isinstance(ix, midx.PackMidx):
555 for name in ix.idxnames:
556 d[os.path.join(self.dir, name)] = ix
559 mx = midx.PackMidx(full)
560 (mxd, mxf) = os.path.split(mx.name)
562 for n in mx.idxnames:
563 if not os.path.exists(os.path.join(mxd, n)):
564 log(('warning: index %s missing\n'
566 % (path_msg(n), path_msg(mxf)))
574 midxl.sort(key=lambda ix:
575 (-len(ix), -xstat.stat(ix.name).st_mtime))
578 for sub in ix.idxnames:
579 found = d.get(os.path.join(self.dir, sub))
580 if not found or isinstance(found, PackIdx):
581 # doesn't exist, or exists but not in a midx
586 for name in ix.idxnames:
587 d[os.path.join(self.dir, name)] = ix
588 elif not ix.force_keep:
589 debug1('midx: removing redundant: %s\n'
590 % path_msg(os.path.basename(ix.name)))
593 for full in glob.glob(os.path.join(self.dir, b'*.idx')):
597 except GitError as e:
601 bfull = os.path.join(self.dir, b'bup.bloom')
602 if self.bloom is None and os.path.exists(bfull):
603 self.bloom = bloom.ShaBloom(bfull)
604 self.packs = list(set(d.values()))
605 self.packs.sort(reverse=True, key=lambda x: len(x))
606 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
610 debug1('PackIdxList: using %d index%s.\n'
611 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
614 """Insert an additional object in the list."""
618 def open_idx(filename):
619 if filename.endswith(b'.idx'):
620 f = open(filename, 'rb')
622 if header[0:4] == b'\377tOc':
623 version = struct.unpack('!I', header[4:8])[0]
625 return PackIdxV2(filename, f)
627 raise GitError('%s: expected idx file version 2, got %d'
628 % (path_msg(filename), version))
629 elif len(header) == 8 and header[0:4] < b'\377tOc':
630 return PackIdxV1(filename, f)
632 raise GitError('%s: unrecognized idx file header'
633 % path_msg(filename))
634 elif filename.endswith(b'.midx'):
635 return midx.PackMidx(filename)
637 raise GitError('idx filenames must end with .idx or .midx')
640 def idxmerge(idxlist, final_progress=True):
641 """Generate a list of all the objects reachable in a PackIdxList."""
642 def pfunc(count, total):
643 qprogress('Reading indexes: %.2f%% (%d/%d)\r'
644 % (count*100.0/total, count, total))
645 def pfinal(count, total):
647 progress('Reading indexes: %.2f%% (%d/%d), done.\n'
648 % (100, total, total))
649 return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default objcache factory: a PackIdxList covering the current
    repository's pack directory."""
    pack_dir = repo(b'objects/pack')
    return PackIdxList(pack_dir)
655 # bup-gc assumes that it can disable all PackWriter activities
656 # (bloom/midx/cache) via the constructor and close() arguments.
659 """Writes Git objects inside a pack file."""
660 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
661 run_midx=True, on_pack_finish=None,
662 max_pack_size=None, max_pack_objects=None, repo_dir=None):
663 self.repo_dir = repo_dir or repo()
670 self.objcache_maker = objcache_maker
672 self.compression_level = compression_level
673 self.run_midx=run_midx
674 self.on_pack_finish = on_pack_finish
675 if not max_pack_size:
676 max_pack_size = git_config_get(b'pack.packSizeLimit',
677 repo_dir=self.repo_dir)
678 if max_pack_size is not None:
679 max_pack_size = parse_num(max_pack_size)
680 if not max_pack_size:
681 # larger packs slow down pruning
682 max_pack_size = 1000 * 1000 * 1000
683 self.max_pack_size = max_pack_size
684 # cache memory usage is about 83 bytes per object
685 self.max_pack_objects = max_pack_objects if max_pack_objects \
686 else max(1, self.max_pack_size // 5000)
694 def __exit__(self, type, value, traceback):
699 objdir = dir = os.path.join(self.repo_dir, b'objects')
700 fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
702 self.file = os.fdopen(fd, 'w+b')
707 self.parentfd = os.open(objdir, os.O_RDONLY)
713 assert name.endswith(b'.pack')
714 self.filename = name[:-5]
715 self.file.write(b'PACK\0\0\0\2\0\0\0\0')
716 self.idx = list(list() for i in range(256))
718 def _raw_write(self, datalist, sha):
721 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
722 # the file never has a *partial* blob. So let's make sure it's
723 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
724 # to our hashsplit algorithm.) f.write() does its own buffering,
725 # but that's okay because we'll flush it in _end().
726 oneblob = b''.join(datalist)
732 crc = zlib.crc32(oneblob) & 0xffffffff
733 self._update_idx(sha, crc, nw)
738 def _update_idx(self, sha, crc, size):
741 self.idx[byte_int(sha[0])].append((sha, crc,
742 self.file.tell() - size))
744 def _write(self, sha, type, content):
748 sha = calc_hash(type, content)
749 size, crc = self._raw_write(_encode_packobj(type, content,
750 self.compression_level),
752 if self.outbytes >= self.max_pack_size \
753 or self.count >= self.max_pack_objects:
757 def breakpoint(self):
758 """Clear byte and object counts and return the last processed id."""
759 id = self._end(self.run_midx)
760 self.outbytes = self.count = 0
763 def _require_objcache(self):
764 if self.objcache is None and self.objcache_maker:
765 self.objcache = self.objcache_maker()
766 if self.objcache is None:
768 "PackWriter not opened or can't check exists w/o objcache")
def exists(self, id, want_source=False):
    """Return non-empty if an object is found in the object cache.

    When want_source is true, the truthy result also identifies which
    index contained the object.
    """
    self._require_objcache()
    cache = self.objcache
    return cache.exists(id, want_source=want_source)
def just_write(self, sha, type, content):
    """Write an object to the pack file without checking for duplication."""
    self._write(sha, type, content)
    # bup-gc runs a PackWriter without an objcache, so only record the
    # sha when a cache is actually present.
    cache = self.objcache
    if cache is not None:
        cache.add(sha)
782 def maybe_write(self, type, content):
783 """Write an object to the pack file if not present and return its id."""
784 sha = calc_hash(type, content)
785 if not self.exists(sha):
786 self._require_objcache()
787 self.just_write(sha, type, content)
def new_blob(self, blob):
    """Store the bytes in blob as a git blob object in the pack and
    return the resulting id."""
    return self.maybe_write(b'blob', blob)
def new_tree(self, shalist):
    """Encode the (mode, name, sha) entries in shalist as a git tree
    object, store it in the pack, and return the resulting id."""
    tree_data = tree_encode(shalist)
    return self.maybe_write(b'tree', tree_data)
799 def new_commit(self, tree, parent,
800 author, adate_sec, adate_tz,
801 committer, cdate_sec, cdate_tz,
803 """Create a commit object in the pack. The date_sec values must be
804 epoch-seconds, and if a tz is None, the local timezone is assumed."""
806 adate_str = _git_date_str(adate_sec, adate_tz)
808 adate_str = _local_git_date_str(adate_sec)
810 cdate_str = _git_date_str(cdate_sec, cdate_tz)
812 cdate_str = _local_git_date_str(cdate_sec)
814 if tree: l.append(b'tree %s' % hexlify(tree))
815 if parent: l.append(b'parent %s' % hexlify(parent))
816 if author: l.append(b'author %s %s' % (author, adate_str))
817 if committer: l.append(b'committer %s %s' % (committer, cdate_str))
820 return self.maybe_write(b'commit', b'\n'.join(l))
823 """Remove the pack file from disk."""
832 os.unlink(self.filename + b'.pack')
839 def _end(self, run_midx=True):
841 if not f: return None
848 # update object count
850 cp = struct.pack('!i', self.count)
854 # calculate the pack sha1sum
857 for b in chunkyreader(f):
859 packbin = sum.digest()
861 fdatasync(f.fileno())
865 obj_list_sha = self._write_pack_idx_v2(self.filename + b'.idx', idx,
867 nameprefix = os.path.join(self.repo_dir,
868 b'objects/pack/pack-' + obj_list_sha)
869 if os.path.exists(self.filename + b'.map'):
870 os.unlink(self.filename + b'.map')
871 os.rename(self.filename + b'.pack', nameprefix + b'.pack')
872 os.rename(self.filename + b'.idx', nameprefix + b'.idx')
874 os.fsync(self.parentfd)
876 os.close(self.parentfd)
879 auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
881 if self.on_pack_finish:
882 self.on_pack_finish(nameprefix)
def close(self, run_midx=True):
    """Close the pack file and move it to its definitive path.

    Delegates to _end(run_midx=run_midx) and returns its result.
    """
    return self._end(run_midx=run_midx)
890 def _write_pack_idx_v2(self, filename, idx, packbin):
893 for entry in section:
894 if entry[2] >= 2**31:
897 # Length: header + fan-out + shas-and-crcs + overflow-offsets
898 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
900 idx_f = open(filename, 'w+b')
902 idx_f.truncate(index_len)
903 fdatasync(idx_f.fileno())
904 idx_map = mmap_readwrite(idx_f, close=False)
906 count = _helpers.write_idx(filename, idx_map, idx, self.count)
907 assert(count == self.count)
914 idx_f = open(filename, 'a+b')
919 b = idx_f.read(8 + 4*256)
922 obj_list_sum = Sha1()
923 for b in chunkyreader(idx_f, 20*self.count):
925 obj_list_sum.update(b)
926 namebase = hexlify(obj_list_sum.digest())
928 for b in chunkyreader(idx_f):
930 idx_f.write(idx_sum.digest())
931 fdatasync(idx_f.fileno())
937 def list_refs(patterns=None, repo_dir=None,
938 limit_to_heads=False, limit_to_tags=False):
939 """Yield (refname, hash) tuples for all repository refs unless
940 patterns are specified. In that case, only include tuples for
941 refs matching those patterns (cf. git-show-ref(1)). The limits
942 restrict the result items to refs/heads or refs/tags. If both
943 limits are specified, items from both sources will be included.
946 argv = [b'git', b'show-ref']
948 argv.append(b'--heads')
950 argv.append(b'--tags')
953 argv.extend(patterns)
954 p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
955 out = p.stdout.read().strip()
956 rv = p.wait() # not fatal
960 for d in out.split(b'\n'):
961 sha, name = d.split(b' ', 1)
962 yield name, unhexlify(sha)
965 def read_ref(refname, repo_dir = None):
966 """Get the commit id of the most recent commit made on a given ref."""
967 refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
968 l = tuple(islice(refs, 2))
976 def rev_list_invocation(ref_or_refs, count=None, format=None):
977 if isinstance(ref_or_refs, bytes):
978 refs = (ref_or_refs,)
981 argv = [b'git', b'rev-list']
982 if isinstance(count, Integral):
983 argv.extend([b'-n', b'%d' % count])
985 raise ValueError('unexpected count argument %r' % count)
988 argv.append(b'--pretty=format:' + format)
990 assert not ref.startswith(b'-')
996 def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
997 """Yield information about commits as per "git rev-list". If a format
998 is not provided, yield one hex hash at a time. If a format is
999 provided, pass it to rev-list and call parse(git_stdout) for each
1000 commit with the stream positioned just after the rev-list "commit
1001 HASH" header line. When a format is provided yield (oidx,
1002 parse(git_stdout)) for each commit.
1005 assert bool(parse) == bool(format)
1006 p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
1008 env=_gitenv(repo_dir),
1009 stdout = subprocess.PIPE)
1011 for line in p.stdout:
1014 line = p.stdout.readline()
1017 if not s.startswith(b'commit '):
1018 raise Exception('unexpected line ' + repr(s))
1021 yield s, parse(p.stdout)
1022 line = p.stdout.readline()
1024 rv = p.wait() # not fatal
1026 raise GitError('git rev-list returned error %d' % rv)
1029 def get_commit_dates(refs, repo_dir=None):
1030 """Get the dates for the specified commit refs. For now, every unique
1031 string in refs must resolve to a different commit or this
1032 function will fail."""
1035 commit = get_commit_items(ref, cp(repo_dir))
1036 result.append(commit.author_sec)
1040 def rev_parse(committish, repo_dir=None):
1041 """Resolve the full hash for 'committish', if it exists.
1043 Should be roughly equivalent to 'git rev-parse'.
1045 Returns the hex value of the hash if it is found, None if 'committish' does
1046 not correspond to anything.
1048 head = read_ref(committish, repo_dir=repo_dir)
1050 debug2("resolved from ref: commit = %s\n" % hexlify(head))
1053 pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))
1055 if len(committish) == 40:
1057 hash = unhexlify(committish)
1067 def update_ref(refname, newval, oldval, repo_dir=None):
1068 """Update a repository reference."""
1071 assert refname.startswith(b'refs/heads/') \
1072 or refname.startswith(b'refs/tags/')
1073 p = subprocess.Popen([b'git', b'update-ref', refname,
1074 hexlify(newval), hexlify(oldval)],
1075 env=_gitenv(repo_dir))
1076 _git_wait(b'git update-ref', p)
1079 def delete_ref(refname, oldvalue=None):
1080 """Delete a repository reference (see git update-ref(1))."""
1081 assert refname.startswith(b'refs/')
1082 oldvalue = [] if not oldvalue else [oldvalue]
1083 p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
1085 _git_wait('git update-ref', p)
1088 def guess_repo(path=None):
1089 """Set the path value in the global variable "repodir".
1090 This makes bup look for an existing bup repository, but not fail if a
1091 repository doesn't exist. Usually, if you are interacting with a bup
1092 repository, you would not be calling this function but using
1093 check_repo_or_die().
1099 repodir = environ.get(b'BUP_DIR')
1101 repodir = os.path.expanduser(b'~/.bup')
1104 def init_repo(path=None):
1105 """Create the Git bare repository for bup in a given path."""
1107 d = repo() # appends a / to the path
1108 parent = os.path.dirname(os.path.dirname(d))
1109 if parent and not os.path.exists(parent):
1110 raise GitError('parent directory "%s" does not exist\n'
1112 if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
1113 raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
1114 p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
1116 _git_wait('git init', p)
1117 # Force the index version configuration in order to ensure bup works
1118 # regardless of the version of the installed Git binary.
1119 p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
1120 stdout=sys.stderr, env=_gitenv())
1121 _git_wait('git config', p)
1123 p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
1124 stdout=sys.stderr, env=_gitenv())
1125 _git_wait('git config', p)
1128 def check_repo_or_die(path=None):
1129 """Check to see if a bup repository probably exists, and abort if not."""
1132 pst = stat_if_exists(top + b'/objects/pack')
1133 if pst and stat.S_ISDIR(pst.st_mode):
1136 top_st = stat_if_exists(top)
1138 log('error: repository %r does not exist (see "bup help init")\n'
1141 log('error: %s is not a repository\n' % path_msg(top))
1147 """Get Git's version and ensure a usable version is installed.
1149 The returned version is formatted as an ordered tuple with each position
1150 representing a digit in the version tag. For example, the following tuple
1151 would represent version 1.6.6.9:
1157 p = subprocess.Popen([b'git', b'--version'], stdout=subprocess.PIPE)
1158 gvs = p.stdout.read()
1159 _git_wait('git --version', p)
1160 m = re.match(br'git version (\S+.\S+)', gvs)
1162 raise GitError('git --version weird output: %r' % gvs)
1163 _ver = tuple(int(x) for x in m.group(1).split(b'.'))
1164 needed = (1, 5, 3, 1)
1166 raise GitError('git version %s or higher is required; you have %s'
1167 % ('.'.join(str(x) for x in needed),
1168 '.'.join(str(x) for x in _ver)))
1172 class _AbortableIter:
1173 def __init__(self, it, onabort = None):
1175 self.onabort = onabort
1183 return next(self.it)
1184 except StopIteration as e:
1194 """Abort iteration and call the abortion callback, if needed."""
1205 """Link to 'git cat-file' that is used to retrieve blob data."""
1206 def __init__(self, repo_dir = None):
1207 self.repo_dir = repo_dir
1210 log('error: git version must be at least 1.5.6\n')
1212 self.p = self.inprogress = None
1216 self.p.stdout.close()
1217 self.p.stdin.close()
1219 self.inprogress = None
1223 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
1224 stdin=subprocess.PIPE,
1225 stdout=subprocess.PIPE,
1228 env=_gitenv(self.repo_dir))
1231 """Yield (oidx, type, size), followed by the data referred to by ref.
1232 If ref does not exist, only yield (None, None, None).
1235 if not self.p or self.p.poll() != None:
1238 poll_result = self.p.poll()
1239 assert(poll_result == None)
1241 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1242 assert(not self.inprogress)
1243 assert ref.find(b'\n') < 0
1244 assert ref.find(b'\r') < 0
1245 assert not ref.startswith(b'-')
1246 self.inprogress = ref
1247 self.p.stdin.write(ref + b'\n')
1248 self.p.stdin.flush()
1249 hdr = self.p.stdout.readline()
1250 if hdr.endswith(b' missing\n'):
1251 self.inprogress = None
1252 yield None, None, None
1254 info = hdr.split(b' ')
1255 if len(info) != 3 or len(info[0]) != 40:
1256 raise GitError('expected object (id, type, size), got %r' % info)
1257 oidx, typ, size = info
1259 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1260 onabort=self._abort)
1262 yield oidx, typ, size
1265 readline_result = self.p.stdout.readline()
1266 assert readline_result == b'\n'
1267 self.inprogress = None
1268 except Exception as e:
1272 def _join(self, it):
1273 _, typ, _ = next(it)
1277 elif typ == b'tree':
1278 treefile = b''.join(it)
1279 for (mode, name, sha) in tree_decode(treefile):
1280 for blob in self.join(hexlify(sha)):
1282 elif typ == b'commit':
1283 treeline = b''.join(it).split(b'\n')[0]
1284 assert treeline.startswith(b'tree ')
1285 for blob in self.join(treeline[5:]):
1288 raise GitError('invalid object type %r: expected blob/tree/commit'
1292 """Generate a list of the content of all blobs that can be reached
1293 from an object. The hash given in 'id' must point to a blob, a tree
1294 or a commit. The content of all blobs that can be seen from trees or
1295 commits will be added to the list.
1298 for d in self._join(self.get(id)):
1300 except StopIteration:
1306 def cp(repo_dir=None):
1307 """Create a CatPipe object or reuse the already existing one."""
1310 repo_dir = repodir or repo()
1311 repo_dir = os.path.abspath(repo_dir)
1312 cp = _cp.get(repo_dir)
1314 cp = CatPipe(repo_dir)
1319 def tags(repo_dir = None):
1320 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1322 for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
1323 assert n.startswith(b'refs/tags/')
1327 tags[c].append(name) # more than one tag can point at 'c'
1331 class MissingObject(KeyError):
1332 def __init__(self, oid):
1334 KeyError.__init__(self, 'object %r is missing' % oid.encode('hex'))
1337 WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
1338 'path', 'chunk_path', 'data'])
1339 # The path is the mangled path, and if an item represents a fragment
1340 # of a chunked file, the chunk_path will be the chunked subtree path
1341 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1342 # chunked file will have a chunk_path of ['']. So some chunk subtree
1343 # of the file '/foo/bar/baz' might look like this:
1345 # item.path = ['foo', 'bar', 'baz.bup']
1346 # item.chunk_path = ['', '2d3115e', '016b097']
1347 # item.type = 'tree'
1351 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1352 """Yield everything reachable from oidx via get_ref (which must behave
1353 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1354 returns true. Throw MissingObject if a hash encountered is
1355 missing from the repository, and don't read or return blob content
1356 in the data field unless include_data is set.
1359 # Maintain the pending stack on the heap to avoid stack overflow
1360 pending = [(oidx, [], [], None)]
1362 oidx, parent_path, chunk_path, mode = pending.pop()
1363 oid = unhexlify(oidx)
1364 if stop_at and stop_at(oidx):
1367 if (not include_data) and mode and stat.S_ISREG(mode):
1368 # If the object is a "regular file", then it's a leaf in
1369 # the graph, so we can skip reading the data if the caller
1370 # hasn't requested it.
1371 yield WalkItem(oid=oid, type=b'blob',
1372 chunk_path=chunk_path, path=parent_path,
1377 item_it = get_ref(oidx)
1378 get_oidx, typ, _ = next(item_it)
1380 raise MissingObject(unhexlify(oidx))
1381 if typ not in (b'blob', b'commit', b'tree'):
1382 raise Exception('unexpected repository object type %r' % typ)
1384 # FIXME: set the mode based on the type when the mode is None
1385 if typ == b'blob' and not include_data:
1386 # Dump data until we can ask cat_pipe not to fetch it
1387 for ignored in item_it:
1391 data = b''.join(item_it)
1393 yield WalkItem(oid=oid, type=typ,
1394 chunk_path=chunk_path, path=parent_path,
1396 data=(data if include_data else None))
1398 if typ == b'commit':
1399 commit_items = parse_commit(data)
1400 for pid in commit_items.parents:
1401 pending.append((pid, parent_path, chunk_path, mode))
1402 pending.append((commit_items.tree, parent_path, chunk_path,
1403 hashsplit.GIT_MODE_TREE))
1404 elif typ == b'tree':
1405 for mode, name, ent_id in tree_decode(data):
1406 demangled, bup_type = demangle_name(name, mode)
1408 sub_path = parent_path
1409 sub_chunk_path = chunk_path + [name]
1411 sub_path = parent_path + [name]
1412 if bup_type == BUP_CHUNKED:
1413 sub_chunk_path = [b'']
1415 sub_chunk_path = chunk_path
1416 pending.append((hexlify(ent_id), sub_path, sub_chunk_path,