1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import, print_function
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from binascii import hexlify, unhexlify
10 from collections import namedtuple
11 from itertools import islice
12 from numbers import Integral
14 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
15 from bup.compat import (buffer,
16 byte_int, bytes_from_byte, bytes_from_uint,
21 from bup.io import path_msg
22 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
24 hostname, localtime, log,
27 mmap_read, mmap_readwrite,
29 progress, qprogress, stat_if_exists,
32 from bup.pwdgrp import username, userfullname
repodir = None  # The default repository, once initialized (set by guess_repo)

# Git packfile object-type codes: commit=1, tree=2, blob=3, tag=4.
# _typermap is the inverse mapping (code -> type name).
_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
_typermap = {v: k for k, v in items(_typemap)}
# Base class for all git/repository errors raised by this module.
class GitError(Exception):

def _gitenv(repo_dir=None):
    # Environment for child git processes, with GIT_DIR pinned to repo_dir.
    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})

def _git_wait(cmd, p):
    # Reap subprocess p; on a nonzero exit status fail loudly, naming cmd.
    raise GitError('%r returned %d' % (cmd, rv))

def git_config_get(option, repo_dir=None):
    # Run 'git config --get <option>' against the repository; raises
    # GitError when git exits with an unexpected status.
    cmd = (b'git', b'config', b'--get', option)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         env=_gitenv(repo_dir=repo_dir))
    raise GitError('%r returned %d' % (cmd, rc))

def parse_tz_offset(s):
    """UTC offset in seconds."""
    # s looks like b'-0500': a sign byte, two hour digits, two minute digits.
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
81 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
82 # Make sure that's authoritative.
83 _start_end_char = br'[^ .,:;<>"\'\0\n]'
84 _content_char = br'[^\0\n<>]'
85 _safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
87 _start_end_char, _content_char, _start_end_char)
# Timezone offset, e.g. b'+0500' or b'-0130'.
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
93 _commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
94 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
95 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
97 (?P<message>(?:.|\n)*)''' % (_parent_rx,
98 _safe_str_rx, _safe_str_rx, _tz_rx,
99 _safe_str_rx, _safe_str_rx, _tz_rx,
101 _parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
103 # Note that the author_sec and committer_sec values are (UTC) epoch
104 # seconds, and for now the mergetag is not included.
105 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
106 'author_name', 'author_mail',
107 'author_sec', 'author_offset',
108 'committer_name', 'committer_mail',
109 'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse a git commit blob into a CommitInfo namedtuple.

    Raises Exception when content does not match _commit_rx.
    """
    commit_match = re.match(_commit_rx, content)
    raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    # asec/csec are epoch seconds; atz/ctz are offsets like b'-0500'.
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume a CatPipe.get-style iterator and return the object's data.

    The iterator's first item must be an (oidx, kind, size) header; the
    remaining items are the data chunks.  Raises if kind does not match
    expected_type.
    """
    header = next(cat_iterator)
    kind = header[1]
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    chunks = list(cat_iterator)
    return b''.join(chunks)
def get_commit_items(id, cp):
    """Return the parsed CommitInfo for commit 'id', fetched via cat pipe cp."""
    commit_data = get_cat_data(cp.get(id), b'commit')
    return parse_commit(commit_data)
def _local_git_date_str(epoch_sec):
    """Return b'<epoch_sec> <offset>' in git date format, local timezone."""
    offset = utc_offset_str(epoch_sec)
    return b'%d %s' % (epoch_sec, offset)
def _git_date_str(epoch_sec, tz_offset_sec):
    # Format epoch seconds plus an explicit UTC offset as git's
    # "<sec> +HHMM" date string.
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        b'+' if offs >= 0 else b'-',
def repo(sub = b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
    return os.path.join(repo_dir, sub)

# shorten_hash: abbreviate 40-hex-digit ids to 7 digits plus '*'.
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')
    return _shorten_hash_rx.sub(br'\1\2*\3', s)

    # repo_rel(path) body: express path relative to the repo, then
    # shorten any hashes in it for display.
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)

    # all_packdirs() body: the local pack dir plus index-cache pack dirs.
    paths = [repo(b'objects/pack')]
    paths += glob.glob(repo(b'index-cache/*/.'))

def auto_midx(objdir):
    # Run 'bup midx --auto' and then 'bup bloom' over objdir; failures
    # are recorded via add_error() rather than raised.
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # A regular file stored as a git tree: a chunked/segmented file.
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        # Avoid colliding with the mangled names produced above.
        return name + b'.bupl'
# Demangling result kinds (see demangle_name below).
(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        # .bupm is metadata; treat as chunked iff its entry is a directory.
        BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes the header "<type> <len>\0" followed by the content.
    header = b'%s %d\0' % (type, len(content))
def shalist_item_sort_key(ent):
    """Return the git tree sort key for a (mode, name, hash) entry.

    Git sorts tree entries by name, with directories sorting as though
    their name ended in '/'.
    """
    (mode, name, id) = ent
    assert(mode+0 == mode)  # mode must be an integer
    if stat.S_ISDIR(mode):
        return name + b'/'
    return name


def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key=shalist_item_sort_key)
    result = []
    for (mode, name, bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode, name, bin)
        # 0-padded octal is not acceptable in a git tree.  Compare a
        # one-byte slice, not s[0]: under Python 3 s[0] is an int, so
        # the previous "s[0] != b'0'" check could never fail.
        assert s[0:1] != b'0'
        result.append(s)
    return b''.join(result)
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    # Entries are "<octal-mode> <name>\0<20-byte-binary-sha>".
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        sha = buf[z+1:z+1+20]
        yield (int(mode, 8), name, sha)

def _encode_packobj(type, content, compression_level=1):
    # Yield the packed-object size/type header, then the deflated content.
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    # Variable-length size header: low nibble of the size plus the type
    # code, then 7 bits per byte with 0x80 as the continuation flag.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    if sz: szbits |= 0x80
    szout += bytes_from_uint(szbits)
    z = zlib.compressobj(compression_level)
    yield z.compress(content)

def _encode_looseobj(type, content, compression_level=1):
    # Loose objects are zlib("<type> <size>\0" + content).
    z = zlib.compressobj(compression_level)
    yield z.compress(b'%s %d\0' % (type, len(content)))
    yield z.compress(content)

def _decode_looseobj(buf):
    # Inverse of _encode_looseobj: return (type, content).
    s = zlib.decompress(buf)
    l = s[:i].split(b' ')
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)

def _decode_packobj(buf):
    # Inverse of _encode_packobj: return (type, content).
    type = _typermap[(c & 0x70) >> 4]
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        return self._ofs_from_idx(idx)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            # Either the source idx basename or plain truth, per want_source.
            return want_source and os.path.basename(self.name) or True

    def _idx_from_hash(self, hash):
        # Binary search for hash, narrowed by the 256-entry fanout table
        # (fanout[b] counts shas whose first byte is <= b).
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
        mid = start + (end - start) // 2
        v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack('!256I', self.map))
        self.fanout.append(0) # entry "-1"
        self.nsha = self.fanout[255]
        # V1 layout: 256*4-byte fanout, then nsha records of
        # (4-byte offset, 20-byte sha) = 24 bytes each.
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        # The offset is the first 4 bytes of the 24-byte record.
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        # The sha follows the 4-byte offset within each record.
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

        # (__iter__ body) Walk the sha of every 24-byte record in order.
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Magic plus version check for the V2 header.
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        # V2 layout: header, fanout, shas, crcs, 32-bit offsets, then
        # the 64-bit offset overflow table.
        self.sha_ofs = 8 + 256*4
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        # A set high bit means the real offset lives in the 64-bit table.
        idx64 = ofs & 0x7fffffff
        ofs64_ofs = self.ofs64table_ofs + idx64 * 8
        ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

        # (__iter__ body) Yield each 20-byte sha in index order.
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]
    def __init__(self, dir, ignore_midx=False):
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        self.do_bloom = False
        self.ignore_midx = ignore_midx

        assert(_mpi_count == 0)

        # (__iter__) Merge-iterate all objects across every pack index.
        return iter(idxmerge(self.packs))

        # (__len__) Total object count across all pack indexes.
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        if hash in self.also:
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            _total_searches -= 1 # was counted by bloom
        for i in range(len(self.packs)):
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            # reorder so most recently used packs are searched first
            self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been relaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        # Keep existing packs (minus midxes when skipping), keyed by name.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            for ix in self.packs:
                if isinstance(ix, midx.PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            for full in glob.glob(os.path.join(self.dir,b'*.midx')):
                mx = midx.PackMidx(full)
                (mxd, mxf) = os.path.split(mx.name)
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n'
                            % (path_msg(n), path_msg(mxf)))
            # Prefer larger, then newer, midxes.
            midxl.sort(key=lambda ix:
                       (-len(ix), -xstat.stat(ix.name).st_mtime))
            for sub in ix.idxnames:
                found = d.get(os.path.join(self.dir, sub))
                if not found or isinstance(found, PackIdx):
                    # doesn't exist, or exists but not in a midx
            for name in ix.idxnames:
                d[os.path.join(self.dir, name)] = ix
            elif not ix.force_keep:
                debug1('midx: removing redundant: %s\n'
                       % path_msg(os.path.basename(ix.name)))
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
            except GitError as e:
        bfull = os.path.join(self.dir, b'bup.bloom')
        if self.bloom is None and os.path.exists(bfull):
            self.bloom = bloom.ShaBloom(bfull)
        self.packs = list(set(d.values()))
        self.packs.sort(reverse=True, key=lambda x: len(x))
        if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

        """Insert an additional object in the list."""
def open_idx(filename):
    """Open a .idx or .midx file and return the matching reader object."""
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        # A V2+ idx starts with the magic b'\377tOc'; V1 has no header.
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            return PackIdxV2(filename, f)
            raise GitError('%s: expected idx file version 2, got %d'
                           % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
        raise GitError('%s: unrecognized idx file header'
                       % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                 % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)


def _make_objcache():
    # Default objcache factory for PackWriter: every local pack index.
    return PackIdxList(repo(b'objects/pack'))
# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.

    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.repo_dir = repo_dir or repo()
        self.objcache_maker = objcache_maker
        self.compression_level = compression_level
        self.run_midx=run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            # Fall back to git's own pack.packSizeLimit setting.
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir)
            if max_pack_size is not None:
                max_pack_size = parse_num(max_pack_size)
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
            else max(1, self.max_pack_size // 5000)
    def __exit__(self, type, value, traceback):

        # (_open body) Create the temporary pack file in objects/.
        objdir = dir = os.path.join(self.repo_dir, b'objects')
        fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
        self.file = os.fdopen(fd, 'w+b')
        # Keep the directory open so its metadata can be fsynced later.
        self.parentfd = os.open(objdir, os.O_RDONLY)
        assert name.endswith(b'.pack')
        self.filename = name[:-5]
        # Pack header: magic, version 2, object count (patched in _end()).
        self.file.write(b'PACK\0\0\0\2\0\0\0\0')
        # idx: 256 buckets (by first sha byte) of (sha, crc, offset) rows.
        self.idx = list(list() for i in range(256))

    def _raw_write(self, datalist, sha):
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)

    def _update_idx(self, sha, crc, size):
        # Bucket by first sha byte; the offset is where the object began.
        self.idx[byte_int(sha[0])].append((sha, crc,
                                           self.file.tell() - size))

    def _write(self, sha, type, content):
        sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
        # Rotate to a fresh pack once the size/object limits are exceeded.
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0

    def _require_objcache(self):
        # Lazily build the objcache; fail when none can be produced.
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            "PackWriter not opened or can't check exists w/o objcache")
757 def exists(self, id, want_source=False):
758 """Return non-empty if an object is found in the object cache."""
759 self._require_objcache()
760 return self.objcache.exists(id, want_source=want_source)
762 def just_write(self, sha, type, content):
763 """Write an object to the pack file without checking for duplication."""
764 self._write(sha, type, content)
765 # If nothing else, gc doesn't have/want an objcache
766 if self.objcache is not None:
767 self.objcache.add(sha)
    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
777 def new_blob(self, blob):
778 """Create a blob object in the pack with the supplied content."""
779 return self.maybe_write(b'blob', blob)
781 def new_tree(self, shalist):
782 """Create a tree object in the pack."""
783 content = tree_encode(shalist)
784 return self.maybe_write(b'tree', content)
    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        adate_str = _git_date_str(adate_sec, adate_tz)
        adate_str = _local_git_date_str(adate_sec)
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
        cdate_str = _local_git_date_str(cdate_sec)
        # Assemble the commit blob line by line; empty fields are omitted.
        if tree: l.append(b'tree %s' % hexlify(tree))
        if parent: l.append(b'parent %s' % hexlify(parent))
        if author: l.append(b'author %s %s' % (author, adate_str))
        if committer: l.append(b'committer %s %s' % (committer, cdate_str))
        return self.maybe_write(b'commit', b'\n'.join(l))

        # (abort body) Drop the partially written pack.
        """Remove the pack file from disk."""
        os.unlink(self.filename + b'.pack')

    def _end(self, run_midx=True):
        # Finalize the pack: patch the object count into the header,
        # append the pack checksum, write the .idx, then rename both
        # into objects/pack/ under their content-derived name.
        if not f: return None
        # update object count
        cp = struct.pack('!i', self.count)
        # calculate the pack sha1sum
        for b in chunkyreader(f):
        packbin = sum.digest()
        fdatasync(f.fileno())
        obj_list_sha = self._write_pack_idx_v2(self.filename + b'.idx', idx,
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' + obj_list_sha)
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        # Also flush the directory metadata (the renames) to disk.
        os.fsync(self.parentfd)
        os.close(self.parentfd)
        auto_midx(os.path.join(self.repo_dir, b'objects/pack'))
        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)
873 def close(self, run_midx=True):
874 """Close the pack file and move it to its definitive path."""
875 return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        # Count entries whose offset needs the 64-bit overflow table.
        for entry in section:
            if entry[2] >= 2**31:
        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        idx_f.truncate(index_len)
        fdatasync(idx_f.fileno())
        # Let the C helper fill the mmap'd index in place.
        idx_map = mmap_readwrite(idx_f, close=False)
        count = _helpers.write_idx(filename, idx_map, idx, self.count)
        assert(count == self.count)
        # Reopen to append the trailing checksums (object list and idx).
        idx_f = open(filename, 'a+b')
        b = idx_f.read(8 + 4*256)
        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            obj_list_sum.update(b)
        namebase = hexlify(obj_list_sum.digest())
        for b in chunkyreader(idx_f):
        idx_f.write(idx_sum.digest())
        fdatasync(idx_f.fileno())
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.
    """
    argv = [b'git', b'show-ref']
    argv.append(b'--heads')
    argv.append(b'--tags')
    argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    # Each show-ref output line is "<hex-sha> <refname>".
    for d in out.split(b'\n'):
        sha, name = d.split(b' ', 1)
        yield name, unhexlify(sha)


def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Take at most two matches so ambiguous refnames can be detected.
    l = tuple(islice(refs, 2))


def rev_list_invocation(ref_or_refs, count=None, format=None):
    # Build the 'git rev-list' argv used by rev_list() below.
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    argv = [b'git', b'rev-list']
    if isinstance(count, Integral):
        argv.extend([b'-n', b'%d' % count])
        raise ValueError('unexpected count argument %r' % count)
    argv.append(b'--pretty=format:' + format)
    # Refuse refs that would be parsed as rev-list options.
    assert not ref.startswith(b'-')
def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.
    """
    # parse and format must be supplied together.
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
                         env=_gitenv(repo_dir),
                         stdout = subprocess.PIPE)
    for line in p.stdout:
        line = p.stdout.readline()
        if not s.startswith(b'commit '):
            raise Exception('unexpected line ' + repr(s))
        yield s, parse(p.stdout)
        line = p.stdout.readline()
    rv = p.wait() # not fatal
    raise GitError('git rev-list returned error %d' % rv)
def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs. For now, every unique
    string in refs must resolve to a different commit or this
    function will fail."""
    # Dates are the author_sec (epoch seconds) of each resolved commit.
    commit = get_commit_items(ref, cp(repo_dir))
    result.append(commit.author_sec)


def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # Try a ref lookup first, then treat committish as a 40-hex-digit
    # object id checked against the local pack indexes.
    head = read_ref(committish, repo_dir=repo_dir)
    debug2("resolved from ref: commit = %s\n" % hexlify(head))

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))
    if len(committish) == 40:
        hash = unhexlify(committish)
def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    # Only branch heads and tags may be updated.
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir))
    _git_wait(b'git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1)).

    When oldvalue is provided, git verifies the ref currently holds that
    value before deleting it.
    """
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
                         env=_gitenv())
    # Pass the command name as bytes, matching update_ref's _git_wait call.
    _git_wait(b'git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # Fall back to $BUP_DIR, then ~/.bup.
    repodir = environ.get(b'BUP_DIR')
    repodir = os.path.expanduser(b'~/.bup')


def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    # NOTE(review): '2' below is str while the sibling argv elements are
    # bytes; subprocess fsencodes each element, but b'2' would be consistent.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)


def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    # The repo is assumed present when objects/pack exists as a directory.
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
    top_st = stat_if_exists(top)
    log('error: repository %r does not exist (see "bup help init")\n'
    log('error: %s is not a repository\n' % path_msg(top))
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:
    """
    p = subprocess.Popen([b'git', b'--version'], stdout=subprocess.PIPE)
    gvs = p.stdout.read()
    _git_wait('git --version', p)
    m = re.match(br'git version (\S+.\S+)', gvs)
    raise GitError('git --version weird output: %r' % gvs)
    _ver = tuple(int(x) for x in m.group(1).split(b'.'))
    # Refuse to run with a git older than 1.5.3.1.
    needed = (1, 5, 3, 1)
    raise GitError('git version %s or higher is required; you have %s'
                   % ('.'.join(str(x) for x in needed),
                      '.'.join(str(x) for x in _ver)))
class _AbortableIter:
    # Wrap an iterator so iteration can be cleanly aborted, invoking an
    # optional onabort callback (CatPipe passes its _abort method here).
    def __init__(self, it, onabort = None):
        self.onabort = onabort

        # (__next__) Delegate to the wrapped iterator.
        return next(self.it)
        except StopIteration as e:

        """Abort iteration and call the abortion callback, if needed."""
    """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self, repo_dir = None):
        self.repo_dir = repo_dir
        log('error: git version must be at least 1.5.6\n')
        self.p = self.inprogress = None

        # (close/abort path) Drop the current cat-file subprocess pipes.
        self.p.stdout.close()
        self.p.stdin.close()
        self.inprogress = None

        # (_restart) Spawn the long-lived 'git cat-file --batch' child.
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  env=_gitenv(self.repo_dir))
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).
        """
        if not self.p or self.p.poll() != None:
        poll_result = self.p.poll()
        assert(poll_result == None)
        log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        # Refuse refs that could confuse the cat-file batch protocol.
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
        # Header line is "<40-hex-id> <type> <size>".
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self._abort)
        yield oidx, typ, size
        # cat-file terminates each object's data with a newline.
        readline_result = self.p.stdout.readline()
        assert readline_result == b'\n'
        self.inprogress = None
        except Exception as e:
    def _join(self, it):
        # Recursively yield the blob contents reachable from 'it'
        # (a CatPipe.get-style iterator).
        _, typ, _ = next(it)
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
        elif typ == b'commit':
            # A commit's first line is "tree <hex-id>"; recurse into it.
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
        raise GitError('invalid object type %r: expected blob/tree/commit'

        # (join) Public wrapper around _join.
        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
        except StopIteration:
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    # One cached CatPipe per absolute repository path (stored in _cp).
    repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    cp = CatPipe(repo_dir)


def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        tags[c].append(name) # more than one tag can point at 'c'
class MissingObject(KeyError):
    """Raised when a requested object id is absent from the repository."""
    def __init__(self, oid):
        self.oid = oid
        # Use hexlify(): the previous oid.encode('hex') was a Python 2
        # idiom and raises on Python 3, where bytes has no encode().
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
# chunked file will have a chunk_path of ['']. So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true. Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.
    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(oidx, [], [], None)]
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)
        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
        data = b''.join(item_it)
        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       data=(data if include_data else None))
        if typ == b'commit':
            commit_items = parse_commit(data)
            # Queue parents, then the commit's own tree.
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                # Chunked files extend chunk_path; plain entries extend path.
                sub_path = parent_path
                sub_chunk_path = chunk_path + [name]
                sub_path = parent_path + [name]
                if bup_type == BUP_CHUNKED:
                    sub_chunk_path = [b'']
                sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,