1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
12 from numbers import Integral
14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
15 from bup.compat import (buffer,
16 byte_int, bytes_from_byte, bytes_from_uint,
21 from bup.io import path_msg
22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
24 hostname, localtime, log,
27 mmap_read, mmap_readwrite,
29 progress, qprogress, stat_if_exists,
32 from bup.pwdgrp import username, userfullname
repodir = None    # The default repository, once initialized

# Git object-type name <-> packed-object type number (see gitformat-pack).
_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
# Reverse mapping: type number -> type name.
_typermap = {v: k for k, v in items(_typemap)}
46 class GitError(Exception):
def _gitenv(repo_dir=None):
    """Return a copy of the environment with GIT_DIR set to repo_dir."""
    # NOTE(review): the fallback to the global `repodir` appears to be on
    # a line elided from this view — verify.
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
def _git_wait(cmd, p):
    """Wait for subprocess p; raise GitError if it exits nonzero."""
    # `rv` is assigned from p.wait() on a line elided from this view.
    raise GitError('%r returned %d' % (cmd, rv))
def git_config_get(option, repo_dir=None):
    """Run `git config --get <option>` in the repo and return its output."""
    cmd = (b'git', b'config', b'--get', option)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         env=_gitenv(repo_dir=repo_dir))
    # `rc` is the exit status collected on lines elided from this view;
    # this raise is presumably guarded by an rc check there.
    raise GitError('%r returned %d' % (cmd, rc))
def parse_tz_offset(s):
    """UTC offset in seconds."""
    # s looks like b'+HHMM' or b'-HHMM'.
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    # Negation and the final return are on lines elided from this view.
    if bytes_from_byte(s[0]) == b'-':
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Character classes used to build the author/committer "safe string" regex.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    (_start_end_char, _content_char, _start_end_char)
# Timezone offset, e.g. +0100 / -0530.
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Parses a whole commit object: tree, parents, author/committer lines,
# optional mergetag, then the free-form message.
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
# Extracts each parent hash from the concatenated "parents" group.
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       # the trailing 'message' field and the
                                       # closing paren are elided from view
def parse_commit(content):
    """Parse a commit blob's bytes into a CommitInfo namedtuple."""
    commit_match = re.match(_commit_rx, content)
    # The `if not commit_match:` guard is elided from this view; the raise
    # presumably fires only when the regex does not match.
    raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume a cat-pipe style iterator — one (oidx, type, size) header
    item followed by data chunks — and return the concatenated data,
    after checking that the object type matches expected_type."""
    header = next(cat_iterator)
    kind = header[1]
    if kind == expected_type:
        return b''.join(chunk for chunk in cat_iterator)
    raise Exception('expected %r, saw %r' % (expected_type, kind))
def get_commit_items(id, cp):
    """Fetch commit `id` through cat-pipe `cp` and return its CommitInfo."""
    data = get_cat_data(cp.get(id), b'commit')
    return parse_commit(data)
def _local_git_date_str(epoch_sec):
    """Return b'<epoch> <local-utc-offset>' in git's date format."""
    offset = utc_offset_str(epoch_sec)
    return b'%d ' % epoch_sec + offset
def _git_date_str(epoch_sec, tz_offset_sec):
    """Format epoch seconds with an explicit +HHMM/-HHMM UTC offset."""
    offs = tz_offset_sec // 60
    # The `% (...)` argument list is partly elided from this view.
    return b'%d %s%02d%02d' \
        b'+' if offs >= 0 else b'-',
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    # The `if not repo_dir:` guard is elided; this raise presumably fires
    # only when no repository has been located yet.
    raise GitError('You should call check_repo_or_die()')
    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
    return os.path.join(repo_dir, sub)
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
    # Body of the elided shorten_hash(s): abbreviate each 40-hex-digit
    # hash in s to its 7-digit prefix plus '*'.
    return _shorten_hash_rx.sub(br'\1\2*\3', s)
    # Body of the elided repo_rel(path): express path relative to the
    # repository (and index-cache) root, then shorten any hashes in it.
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)
    # Body of an elided function: collect the repo's pack directory plus
    # every index-cache pack directory.
    paths = [repo(b'objects/pack')]
    paths += glob.glob(repo(b'index-cache/*/.'))
def auto_midx(objdir):
    """Best-effort: run `bup midx --auto` then `bup bloom` on objdir,
    recording (not raising) any failures via add_error."""
    # NOTE(review): the try/except wrappers around both calls are elided
    # from this view.
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
    args = [path.exe(), b'bloom', b'--dir', objdir]
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    # Regular file stored as a git tree => it was chunked: mark with .bup.
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    # The fall-through case is elided from this view.
# Demangling result kinds (see demangle_name below).
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        # The `return (name[:-5],` half of this statement is elided.
        BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes "<type> <len>\0" + content; the Sha1 update/digest steps
    # are elided from this view.
    header = b'%s %d\0' % (type, len(content))
def shalist_item_sort_key(ent):
    """Sort key for a (mode, name, id) tree entry; directories get
    special treatment (branch body elided from this view)."""
    (mode, name, id) = ent
    assert(mode+0 == mode)  # mode must be an integer
    if stat.S_ISDIR(mode):
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    for (mode,name,bin) in shalist:
        assert(mode+0 == mode)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode,name,bin)
        # NOTE(review): under Python 3 s[0] is an int while b'0' is bytes,
        # so this assert can never fire; comparing with b'0'[0] would make
        # it meaningful — verify against upstream.
        assert s[0] != b'0' # 0-padded octal is not acceptable in a git tree
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    # Initialization of ofs, the mode/name unpack, and the ofs advance are
    # elided from this view.
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        sha = buf[z+1:z+1+20]
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    # Yield the packed-object header (size varint merged with the type
    # nibble) and then the deflated content; several header-loop lines are
    # elided from this view.
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    if sz: szbits |= 0x80  # continuation bit: more size bytes follow
    szout += bytes_from_uint(szbits)
    z = zlib.compressobj(compression_level)
    yield z.compress(content)
def _encode_looseobj(type, content, compression_level=1):
    # Loose objects are "<type> <len>\0" + content, deflated as one
    # stream (the final z.flush() is elided from this view).
    z = zlib.compressobj(compression_level)
    yield z.compress(b'%s %d\0' % (type, len(content)))
    yield z.compress(content)
def _decode_looseobj(buf):
    # Inverse of _encode_looseobj: inflate, then split the header from the
    # content (header parsing lines are elided from this view).
    s = zlib.decompress(buf)
    l = s[:i].split(b' ')
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    # Decode the leading varint size/type header (loop lines elided), then
    # inflate the remainder of the buffer.
    type = _typermap[(c & 0x70) >> 4]
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        # The None check (hash absent) is elided from this view.
        return self._ofs_from_idx(idx)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            # With want_source, report this index file's basename.
            return want_source and os.path.basename(self.name) or True
    def _idx_from_hash(self, hash):
        """Binary-search the fanout-delimited range for hash; the search
        loop body is partly elided from this view."""
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
        mid = start + (end - start) // 2
        v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack('!256I', self.map))
        self.fanout.append(0) # entry "-1"
        self.nsha = self.fanout[255]   # total object count
        self.sha_ofs = 256 * 4         # v1 entries start after the fanout
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

        # Body of the elided __len__:
        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        # Pack-file offset for entry idx; in v1, the 4-byte offset
        # precedes the 20-byte sha in each 24-byte entry.
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        # 20-byte binary sha for entry idx.
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

        # Body of the elided sha iterator method:
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # v2 magic: b'\377tOc' followed by version number 2.
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # v2 layout: shas, then crcs (4 bytes each), then 4-byte offsets,
        # then the 8-byte large-offset table.
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

        # Body of the elided __len__:
        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        # Pack-file offset for entry idx; offsets with the high bit set
        # index into the 64-bit overflow table.
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        idx64 = ofs & 0x7fffffff
        ofs64_ofs = self.ofs64table_ofs + idx64 * 8
        ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]

    def _idx_to_hash(self, idx):
        # 20-byte binary sha for entry idx.
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

        # Body of the elided sha iterator method:
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]
    def __init__(self, dir, ignore_midx=False):
        # Aggregates all .idx/.midx files found in `dir`; several
        # attribute initializations are elided from this view.
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        self.do_bloom = False
        self.ignore_midx = ignore_midx
        # Bodies of the elided __del__/__iter__/__len__ methods:
        assert(_mpi_count == 0)
        return iter(idxmerge(self.packs))
        return sum(len(pack) for pack in self.packs)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        # Fast paths: the `also` set, then the (probabilistic) bloom filter.
        if hash in self.also:
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            _total_searches -= 1 # was counted by bloom
        for i in range(len(self.packs)):
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            # reorder so most recently used packs are searched first
            self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom = None # Always reopen the bloom as it may have been replaced
            self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
            # remove any *.midx files from our list that no longer exist
            for ix in list(d.values()):
                if not isinstance(ix, midx.PackMidx):
                if ix.name in midxes:
                self.packs.remove(ix)
            for ix in self.packs:
                if isinstance(ix, midx.PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            # Open each on-disk midx and warn when a member idx is missing.
            mx = midx.PackMidx(full)
            (mxd, mxf) = os.path.split(mx.name)
            for n in mx.idxnames:
                if not os.path.exists(os.path.join(mxd, n)):
                    log(('warning: index %s missing\n'
                        % (path_msg(n), path_msg(mxf)))
            # Prefer the largest (then newest) midx covering each idx.
            midxl.sort(key=lambda ix:
                       (-len(ix), -xstat.stat(ix.name).st_mtime))
            for sub in ix.idxnames:
                found = d.get(os.path.join(self.dir, sub))
                if not found or isinstance(found, PackIdx):
                    # doesn't exist, or exists but not in a midx
            for name in ix.idxnames:
                d[os.path.join(self.dir, name)] = ix
            elif not ix.force_keep:
                debug1('midx: removing redundant: %s\n'
                       % path_msg(os.path.basename(ix.name)))
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
            except GitError as e:
        bfull = os.path.join(self.dir, b'bup.bloom')
        if self.bloom is None and os.path.exists(bfull):
            self.bloom = bloom.ShaBloom(bfull)
        self.packs = list(set(d.values()))
        self.packs.sort(reverse=True, key=lambda x: len(x))
        # Only consult the bloom filter when it covers every pack we know.
        if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
614 """Insert an additional object in the list."""
def open_idx(filename):
    """Open a .idx or .midx file, returning the matching reader object.
    (The header read and some branch keywords are elided from this view.)"""
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            return PackIdxV2(filename, f)
            raise GitError('%s: expected idx file version 2, got %d'
                           % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
        raise GitError('%s: unrecognized idx file header'
                       % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        # Incremental progress line, updated in place.
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # Final progress line (guarded by final_progress on an elided line).
        progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                 % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default PackWriter objcache factory: an index list covering the
    repository's pack directory."""
    pack_dir = repo(b'objects/pack')
    return PackIdxList(pack_dir)
655 # bup-gc assumes that it can disable all PackWriter activities
656 # (bloom/midx/cache) via the constructor and close() arguments.
659 """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.repo_dir = repo_dir or repo()
        self.objcache_maker = objcache_maker
        self.compression_level = compression_level
        self.run_midx=run_midx
        self.on_pack_finish = on_pack_finish
        # Resolve the pack size limit: explicit arg, then git config,
        # then a fixed default.
        if not max_pack_size:
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir)
            if max_pack_size is not None:
                max_pack_size = parse_num(max_pack_size)
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)
694 def __exit__(self, type, value, traceback):
        # Body of the elided _open(): create the pack tempfile, write the
        # pack header, and prepare the per-first-byte index buckets.
        objdir = dir = os.path.join(self.repo_dir, b'objects')
        fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
        self.file = os.fdopen(fd, 'w+b')
        # Keep the dir fd so _end() can fsync the directory after rename.
        self.parentfd = os.open(objdir, os.O_RDONLY)
        assert name.endswith(b'.pack')
        self.filename = name[:-5]
        # 'PACK', version 2, object count 0 (patched in _end()).
        self.file.write(b'PACK\0\0\0\2\0\0\0\0')
        self.idx = list(list() for i in range(256))
    def _raw_write(self, datalist, sha):
        """Write datalist to the pack as one chunk and record it in the
        in-memory index (several lines elided from this view)."""
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
    def _update_idx(self, sha, crc, size):
        # Record (sha, crc, offset) in the bucket for sha's first byte;
        # offset is where the object started (current pos minus its size).
        self.idx[byte_int(sha[0])].append((sha, crc,
                                           self.file.tell() - size))
    def _write(self, sha, type, content):
        # Low-level object write; rolls the pack over (breakpoint) when
        # the size or object-count limit is reached. Several lines elided.
        sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        # The re-open and return are elided from this view.
    def _require_objcache(self):
        # Lazily build the objcache; fail when none can be made (the
        # `raise GitError(` line is elided from this view).
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
                "PackWriter not opened or can't check exists w/o objcache")
770 def exists(self, id, want_source=False):
771 """Return non-empty if an object is found in the object cache."""
772 self._require_objcache()
773 return self.objcache.exists(id, want_source=want_source)
775 def just_write(self, sha, type, content):
776 """Write an object to the pack file without checking for duplication."""
777 self._write(sha, type, content)
778 # If nothing else, gc doesn't have/want an objcache
779 if self.objcache is not None:
780 self.objcache.add(sha)
    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
        # The `return sha` is elided from this view.
790 def new_blob(self, blob):
791 """Create a blob object in the pack with the supplied content."""
792 return self.maybe_write(b'blob', blob)
794 def new_tree(self, shalist):
795 """Create a tree object in the pack."""
796 content = tree_encode(shalist)
797 return self.maybe_write(b'tree', content)
    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        # Pick explicit-offset vs local-time formatting for each date; the
        # if/else and list initialization lines are elided from this view.
        adate_str = _git_date_str(adate_sec, adate_tz)
        adate_str = _local_git_date_str(adate_sec)
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
        cdate_str = _local_git_date_str(cdate_sec)
        if tree: l.append(b'tree %s' % hexlify(tree))
        if parent: l.append(b'parent %s' % hexlify(parent))
        if author: l.append(b'author %s %s' % (author, adate_str))
        if committer: l.append(b'committer %s %s' % (committer, cdate_str))
        return self.maybe_write(b'commit', b'\n'.join(l))
        """Remove the pack file from disk."""
        # Body of the elided abort(): discard the partial pack.
        os.unlink(self.filename + b'.pack')
    def _end(self, run_midx=True):
        # Finalize the pack: patch the object count into the header,
        # checksum the pack, write the idx, and rename both into place.
        # Many lines are elided from this view.
        if not f: return None
        # update object count
        cp = struct.pack('!i', self.count)
        # calculate the pack sha1sum
        for b in chunkyreader(f):
        packbin = sum.digest()
        fdatasync(f.fileno())
        obj_list_sha = self._write_pack_idx_v2(self.filename + b'.idx', idx,
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' + obj_list_sha)
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        # Durability: sync the containing directory after the renames.
        os.fsync(self.parentfd)
        os.close(self.parentfd)
        auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)
886 def close(self, run_midx=True):
887 """Close the pack file and move it to its definitive path."""
888 return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        # Write a v2 pack index for the accumulated (sha, crc, ofs)
        # buckets; returns the hex sha of the object list. Elided lines
        # include the ofs64 counting loop and trailer writes.
        for entry in section:
            # Offsets >= 2**31 go into the 64-bit overflow table.
            if entry[2] >= 2**31:
        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        idx_f.truncate(index_len)
        fdatasync(idx_f.fileno())
        idx_map = mmap_readwrite(idx_f, close=False)
        # The C helper fills the mapped index in one pass.
        count = _helpers.write_idx(filename, idx_map, idx, self.count)
        assert(count == self.count)
        idx_f = open(filename, 'a+b')
        b = idx_f.read(8 + 4*256)
        # Name the pack after the sha1 of the sorted object list.
        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            obj_list_sum.update(b)
        namebase = hexlify(obj_list_sum.digest())
        for b in chunkyreader(idx_f):
        idx_f.write(idx_sum.digest())
        fdatasync(idx_f.fileno())
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.
    """
    argv = [b'git', b'show-ref']
    argv.append(b'--heads')
    argv.append(b'--tags')
    argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    # Each output line is "<sha> <refname>".
    for d in out.split(b'\n'):
        sha, name = d.split(b' ', 1)
        yield name, unhexlify(sha)
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Take up to two matches so the (elided) code below can detect
    # ambiguous refs as well as misses.
    l = tuple(islice(refs, 2))
def rev_list_invocation(ref_or_refs, format=None):
    # Build the `git rev-list` argv for one ref or an iterable of refs.
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    argv = [b'git', b'rev-list']
    argv.append(b'--pretty=format:' + format)
    # Refuse refs that could be misparsed as options.
    assert not ref.startswith(b'-')
def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.
    """
    # parse and format must be supplied together.
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                         env=_gitenv(repo_dir),
                         stdout = subprocess.PIPE)
    for line in p.stdout:
        line = p.stdout.readline()
        if not s.startswith(b'commit '):
            raise Exception('unexpected line ' + repr(s))
        yield s, parse(p.stdout)
        line = p.stdout.readline()
    rv = p.wait() # not fatal
    raise GitError('git rev-list returned error %d' % rv)
def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs. For now, every unique
    string in refs must resolve to a different commit or this
    function will fail."""
    # Collect author epoch seconds for each ref (loop header elided).
    commit = get_commit_items(ref, cp(repo_dir))
    result.append(commit.author_sec)
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # First try it as a ref name, then as a raw 40-hex hash looked up in
    # the pack indexes.
    head = read_ref(committish, repo_dir=repo_dir)
    debug2("resolved from ref: commit = %s\n" % hexlify(head))
    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))
    if len(committish) == 40:
        hash = unhexlify(committish)
def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    # Only branch heads and tags may be updated through this helper.
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir))
    _git_wait(b'git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    # Passing oldvalue makes the deletion conditional on the current value.
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # Fallback order (guards elided): explicit path, $BUP_DIR, ~/.bup.
    repodir = environ.get(b'BUP_DIR')
    repodir = os.path.expanduser(b'~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    # NOTE(review): '2' here is a str while the sibling args are bytes;
    # POSIX subprocess accepts per-element str/bytes, but b'2' would be
    # consistent — verify against upstream.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    # A repo is presumed valid when <top>/objects/pack is a directory.
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
    top_st = stat_if_exists(top)
    log('error: repository %r does not exist (see "bup help init")\n'
    log('error: %s is not a repository\n' % path_msg(top))
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:
    """
    p = subprocess.Popen([b'git', b'--version'], stdout=subprocess.PIPE)
    gvs = p.stdout.read()
    _git_wait('git --version', p)
    m = re.match(br'git version (\S+.\S+)', gvs)
    raise GitError('git --version weird output: %r' % gvs)
    _ver = tuple(int(x) for x in m.group(1).split(b'.'))
    # Minimum supported git version.
    needed = (1, 5, 3, 1)
    raise GitError('git version %s or higher is required; you have %s'
                   % ('.'.join(str(x) for x in needed),
                      '.'.join(str(x) for x in _ver)))
class _AbortableIter:
    """Iterator wrapper that invokes an abort callback if iteration is
    interrupted before completion (most method bodies elided)."""
    def __init__(self, it, onabort = None):
        self.onabort = onabort

        # Inside the elided __next__: delegate to the wrapped iterator.
        return next(self.it)
        except StopIteration as e:

        """Abort iteration and call the abortion callback, if needed."""
1201 """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self, repo_dir = None):
        self.repo_dir = repo_dir
        # The git version guard around this error message is elided.
        log('error: git version must be at least 1.5.6\n')
        self.p = self.inprogress = None

        # Body of the elided close/_abort: tear down the cat-file process.
        self.p.stdout.close()
        self.p.stdin.close()
        self.inprogress = None

        # Body of the elided restart: spawn `git cat-file --batch`.
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  env=_gitenv(self.repo_dir))
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).
        """
        # (Re)start the subprocess if it is missing or has died.
        if not self.p or self.p.poll() != None:
        poll_result = self.p.poll()
        assert(poll_result == None)
        # Only one request may be in flight on the batch pipe at a time.
        log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
        # Header format: "<40-hex-sha> <type> <size>\n".
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self._abort)
        yield oidx, typ, size
        # cat-file terminates each object's data with a newline.
        readline_result = self.p.stdout.readline()
        assert readline_result == b'\n'
        self.inprogress = None
        except Exception as e:
    def _join(self, it):
        # Recursively yield the blob contents reachable from a cat-pipe
        # item stream (blob branch elided from this view).
        _, typ, _ = next(it)
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
        elif typ == b'commit':
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
        raise GitError('invalid object type %r: expected blob/tree/commit'
        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        # Body of the elided join(self, id): delegate to _join.
        for d in self._join(self.get(id)):
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    # One cached CatPipe per absolute repo path (cache guards elided).
    repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    cp = CatPipe(repo_dir)
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        tags[c].append(name) # more than one tag can point at 'c'
class MissingObject(KeyError):
    """Raised when a requested object id is not present in the repository.

    The binary oid is kept on the exception as `self.oid`.
    """
    def __init__(self, oid):
        self.oid = oid
        # bytes.encode('hex') was Python-2-only; hexlify works for bytes
        # on both Python 2 and 3.
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
# One reachable object as yielded by walk_object (see comments below).
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
1332 # The path is the mangled path, and if an item represents a fragment
1333 # of a chunked file, the chunk_path will be the chunked subtree path
1334 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1335 # chunked file will have a chunk_path of ['']. So some chunk subtree
1336 # of the file '/foo/bar/baz' might look like this:
1338 # item.path = ['foo', 'bar', 'baz.bup']
1339 # item.chunk_path = ['', '2d3115e', '016b097']
1340 # item.type = 'tree'
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true. Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.
    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(oidx, [], [], None)]
    oidx, parent_path, chunk_path, mode = pending.pop()
    oid = unhexlify(oidx)
    if stop_at and stop_at(oidx):
    if (not include_data) and mode and stat.S_ISREG(mode):
        # If the object is a "regular file", then it's a leaf in
        # the graph, so we can skip reading the data if the caller
        # hasn't requested it.
        yield WalkItem(oid=oid, type=b'blob',
                       chunk_path=chunk_path, path=parent_path,
    item_it = get_ref(oidx)
    get_oidx, typ, _ = next(item_it)
    raise MissingObject(unhexlify(oidx))
    if typ not in (b'blob', b'commit', b'tree'):
        raise Exception('unexpected repository object type %r' % typ)
    # FIXME: set the mode based on the type when the mode is None
    if typ == b'blob' and not include_data:
        # Dump data until we can ask cat_pipe not to fetch it
        for ignored in item_it:
    data = b''.join(item_it)
    yield WalkItem(oid=oid, type=typ,
                   chunk_path=chunk_path, path=parent_path,
                   data=(data if include_data else None))
    if typ == b'commit':
        commit_items = parse_commit(data)
        # Parents first, then the tree, all pushed for later processing.
        for pid in commit_items.parents:
            pending.append((pid, parent_path, chunk_path, mode))
        pending.append((commit_items.tree, parent_path, chunk_path,
                        hashsplit.GIT_MODE_TREE))
    elif typ == b'tree':
        for mode, name, ent_id in tree_decode(data):
            demangled, bup_type = demangle_name(name, mode)
            # Chunked-file subtrees extend chunk_path instead of path.
            sub_path = parent_path
            sub_chunk_path = chunk_path + [name]
            sub_path = parent_path + [name]
            if bup_type == BUP_CHUNKED:
                sub_chunk_path = [b'']
            sub_chunk_path = chunk_path
            pending.append((hexlify(ent_id), sub_path, sub_chunk_path,