1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from collections import namedtuple
9 from itertools import islice
10 from numbers import Integral
11 from os import environ
13 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
14 from bup.compat import range
15 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
18 hostname, localtime, log, merge_iter,
19 mmap_read, mmap_readwrite,
21 progress, qprogress, shstr, stat_if_exists,
22 unlink, username, userfullname,
repodir = None  # The default repository, once initialized

# Mapping from git object type name to the numeric type id used in pack
# files, and the reverse mapping (see _encode_packobj/_decode_packobj).
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
# Base exception for all git/repository failures raised by this module.
# NOTE(review): the class body is elided in this excerpt.
class GitError(Exception):
def _git_wait(cmd, p):
    # Wait for subprocess p and raise GitError on a nonzero exit status.
    # NOTE(review): the line computing rv (presumably rv = p.wait()) is
    # elided in this excerpt -- confirm against the full source.
    raise GitError('%s returned %d' % (shstr(cmd), rv))
def _git_capture(argv):
    # Run argv in the repository environment and capture its stdout.
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
    # NOTE(review): the stdout read and return are elided in this excerpt.
    _git_wait(repr(argv), p)
def _git_exo(cmd, **kwargs):
    # Run cmd via helpers.exo without check, then raise GitError ourselves
    # on failure so the error message names the command.
    kwargs['check'] = False
    result = exo(cmd, **kwargs)
    # NOTE(review): the unpacking of result that defines 'proc' (and the
    # final return) is elided in this excerpt.
    if proc.returncode != 0:
        raise GitError('%r returned %d' % (cmd, proc.returncode))
def git_config_get(option, repo_dir=None):
    # Look up a git config value via `git config --get` in repo_dir.
    cmd = ('git', 'config', '--get', option)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         preexec_fn=_gitenv(repo_dir=repo_dir))
    # NOTE(review): the stdout read, the rc computation, and the success
    # return are elided in this excerpt; only the failure path is visible.
    raise GitError('%s returned %d' % (cmd, rc))
def parse_tz_offset(s):
    """Return a git timezone string like '+0130' or '-0500' as a UTC
    offset in seconds (positive east of UTC, negative west)."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    # The leading sign character determines the direction of the offset;
    # as previously written the computed value was never returned.
    if s[0] == '-':
        return - tz_off
    return tz_off
80 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
81 # Make sure that's authoritative.
82 _start_end_char = r'[^ .,:;<>"\'\0\n]'
83 _content_char = r'[^\0\n<>]'
84 _safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
86 _start_end_char, _content_char, _start_end_char)
87 _tz_rx = r'[-+]\d\d[0-5]\d'
88 _parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
89 # Assumes every following line starting with a space is part of the
90 # mergetag. Is there a formal commit blob spec?
91 _mergetag_rx = r'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
92 _commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
93 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
94 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
96 (?P<message>(?:.|\n)*)''' % (_parent_rx,
97 _safe_str_rx, _safe_str_rx, _tz_rx,
98 _safe_str_rx, _safe_str_rx, _tz_rx,
100 _parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
102 # Note that the author_sec and committer_sec values are (UTC) epoch
103 # seconds, and for now the mergetag is not included.
104 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
105 'author_name', 'author_mail',
106 'author_sec', 'author_offset',
107 'committer_name', 'committer_mail',
108 'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse a git commit blob and return a CommitInfo namedtuple.

    The tz offsets are converted to seconds via parse_tz_offset().
    Raises Exception if content does not match _commit_rx.
    """
    commit_match = re.match(_commit_rx, content)
    # Only raise when the match actually failed; as previously written the
    # guard was missing and every call raised unconditionally.
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume a cat-pipe style iterator whose first item is an
    (oidx, type, size) tuple; verify the type and return the remaining
    chunks joined into one string."""
    oidx_unused, kind, size_unused = next(cat_iterator)
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    chunks = list(cat_iterator)
    return ''.join(chunks)
def get_commit_items(id, cp):
    """Fetch commit 'id' through cat-pipe 'cp' and return its parsed
    CommitInfo."""
    raw = get_cat_data(cp.get(id), 'commit')
    return parse_commit(raw)
def _local_git_date_str(epoch_sec):
    """Render epoch_sec as a git date string ('<sec> <tzoffset>') using the
    local timezone offset for that moment."""
    offset = utc_offset_str(epoch_sec)
    return '%d %s' % (epoch_sec, offset)
142 def _git_date_str(epoch_sec, tz_offset_sec):
143 offs = tz_offset_sec // 60
144 return '%d %s%02d%02d' \
146 '+' if offs >= 0 else '-',
def repo(sub = '', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    # NOTE(review): the `if not repo_dir:` guard above this raise is elided
    # in this excerpt.
    raise GitError('You should call check_repo_or_die()')
    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, '.git')
    if os.path.exists(gd):
        # NOTE(review): body elided (presumably redirects into gd).
    return os.path.join(repo_dir, sub)
166 return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
171 full = os.path.abspath(path)
172 fullrepo = os.path.abspath(repo(''))
173 if not fullrepo.endswith('/'):
175 if full.startswith(fullrepo):
176 path = full[len(fullrepo):]
177 if path.startswith('index-cache/'):
178 path = path[len('index-cache/'):]
179 return shorten_hash(path)
183 paths = [repo('objects/pack')]
184 paths += glob.glob(repo('index-cache/*/.'))
def auto_midx(objdir):
    # Best-effort maintenance: regenerate the .midx and bloom filter for
    # objdir by invoking the bup CLI; failures are recorded via add_error
    # rather than raised.
    # NOTE(review): the try/except scaffolding around both calls is elided
    # in this excerpt ('e' below comes from the elided except clauses).
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
210 def mangle_name(name, mode, gitmode):
211 """Mangle a file name to present an abstract name for segmented files.
212 Mangled file names will have the ".bup" extension added to them. If a
213 file's name already ends with ".bup", a ".bupl" extension is added to
214 disambiguate normal files from segmented ones.
216 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
217 assert(stat.S_ISDIR(gitmode))
219 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
220 return name + '.bupl'
(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith('.bupm'):
        # Metadata entries: directories carry chunked-file metadata.
        # (The truncated return statement for this branch is restored.)
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    else:
        return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion: sha1 over the
    '<type> <size>\\0' header followed by the content, returned as a
    binary digest."""
    header = '%s %d\0' % (type, len(content))
    # The digest computation/return was missing; as written the function
    # built the header and returned None.
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
def shalist_item_sort_key(ent):
    """Sort key for (mode, name, id) tree entries: git sorts directories
    as if their name had a trailing '/'."""
    (mode, name, id) = ent
    assert(mode+0 == mode)  # mode must be an integer
    if stat.S_ISDIR(mode):
        # The return statements were missing; directories sort with a
        # trailing slash appended, per git tree-sorting rules.
        return name + '/'
    else:
        return name
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    # The accumulator and final join were missing from this definition.
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = '%o %s\0%s' % (mode,name,bin)
        assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return ''.join(l)
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf.

    Each entry is '<octal mode> <name>\\0' followed by a 20-byte binary sha.
    """
    # The offset initialization and advance were missing from this
    # definition, which made the loop reference an undefined name.
    ofs = 0
    while ofs < len(buf):
        z = buf.find('\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(' ', 1)
        assert(len(spl) == 2)
        mode,name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    # Yield the pack-object encoding of content: a variable-length size
    # header (low 4 bits of size plus the numeric type id, then 7 bits per
    # continuation byte) followed by the zlib-compressed content.
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    # NOTE(review): the szout accumulator, sz initialization, and the
    # header-emitting loop body are elided in this excerpt.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    if sz: szbits |= 0x80  # continuation bit: more size bytes follow
    z = zlib.compressobj(compression_level)
    yield z.compress(content)
def _encode_looseobj(type, content, compression_level=1):
    """Yield the zlib-compressed git loose-object encoding of content:
    the '<type> <size>\\0' header followed by the raw content, compressed
    as a single stream."""
    z = zlib.compressobj(compression_level)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    # Without flush() the tail of the deflate stream is never emitted and
    # the output cannot be decompressed.
    yield z.flush()
def _decode_looseobj(buf):
    # Inverse of _encode_looseobj: decompress buf and split the
    # '<type> <size>\0' header from the content.
    s = zlib.decompress(buf)
    # NOTE(review): the header parsing that defines type, sz, and content
    # is elided in this excerpt.
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    # Inverse of _encode_packobj: parse the variable-length size header,
    # then decompress the remaining bytes.
    # NOTE(review): the initial byte read and the loop decoding the size
    # continuation bytes are elided in this excerpt.
    type = _typermap[(c & 0x70) >> 4]
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
356 def find_offset(self, hash):
357 """Get the offset of an object inside the index file."""
358 idx = self._idx_from_hash(hash)
360 return self._ofs_from_idx(idx)
363 def exists(self, hash, want_source=False):
364 """Return nonempty if the object exists in this index."""
365 if hash and (self._idx_from_hash(hash) != None):
366 return want_source and os.path.basename(self.name) or True
370 return int(self.fanout[255])
    def _idx_from_hash(self, hash):
        # Binary-search this index for a 20-byte binary sha; the fanout
        # table narrows the search to shas sharing the first byte.
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        # NOTE(review): the computation of b1 (the first hash byte) and the
        # search-loop scaffolding are elided in this excerpt.
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1  # lookup table is a step
        # Python 2 integer division ('/'); '//' would be needed under py3.
        mid = start + (end-start)/2
        v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    # V1 layout: 256-entry fanout table of 4-byte counts, then 24-byte
    # records of (4-byte offset, 20-byte sha).  Uses the Python 2 only
    # buffer() type for zero-copy views of the mmap.
    def __init__(self, filename, f):
        # NOTE(review): self.name/self.sha_ofs assignments are elided in
        # this excerpt.
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        # First 4 bytes of each 24-byte record are the pack offset.
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        # Remaining 20 bytes of each record are the binary sha.
        return str(self.shatable[idx*24+4 : idx*24+24])

        # NOTE(review): the enclosing `def __iter__(self):` line is elided
        # in this excerpt; this loop yields each stored sha in order.
        for i in range(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    # V2 layout: 8-byte magic/version, 256-entry fanout, nsha 20-byte shas,
    # nsha 4-byte CRCs, nsha 4-byte offsets, then 8-byte offsets for packs
    # larger than 2 GiB (entries with the high bit set).
    def __init__(self, filename, f):
        # NOTE(review): self.name assignment is elided in this excerpt.
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        # NOTE(review): the closing length argument of this buffer() call
        # is elided in this excerpt.
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        # NOTE(review): the high-bit test selecting the 64-bit table and
        # the final return are elided in this excerpt.
        idx64 = ofs & 0x7fffffff
        ofs = struct.unpack('!Q',
                            str(buffer(self.ofs64table, idx64*8, 8)))[0]

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

        # NOTE(review): the enclosing `def __iter__(self):` line is elided
        # in this excerpt; this loop yields each stored sha in order.
        for i in range(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
    def __init__(self, dir):
        # PackIdxList aggregates all .idx/.midx files in 'dir'.
        # NOTE(review): several attribute initializations (dir, also,
        # packs, bloom) are elided in this excerpt.
        assert(_mpi_count == 0)  # these things suck tons of VM; don't waste it
        self.do_bloom = False
469 assert(_mpi_count == 0)
472 return iter(idxmerge(self.packs))
475 return sum(len(pack) for pack in self.packs)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        # NOTE(review): this method is heavily elided in this excerpt; the
        # bloom-filter fast path and the per-pack loop bodies (including
        # the binding of 'p') are incomplete below.
        if hash in self.also:
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            _total_searches -= 1  # was counted by bloom
        for i in xrange(len(self.packs)):
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            # reorder so most recently used packs are searched first
            self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        # NOTE(review): several lines of this method are elided in this
        # excerpt (loop scaffolding, midxl accumulation, unlink calls).
        self.bloom = None # Always reopen the bloom as it may have been relaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # Map of idx filename -> index object currently loaded.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            for ix in self.packs:
                if isinstance(ix, midx.PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            for full in glob.glob(os.path.join(self.dir,'*.midx')):
                mx = midx.PackMidx(full)
                (mxd, mxf) = os.path.split(mx.name)
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n' +
                             '  used by %s\n') % (n, mxf))
            # Prefer larger, newer midx files.
            midxl.sort(key=lambda ix:
                       (-len(ix), -xstat.stat(ix.name).st_mtime))
            for sub in ix.idxnames:
                found = d.get(os.path.join(self.dir, sub))
                if not found or isinstance(found, PackIdx):
                    # doesn't exist, or exists but not in a midx
                for name in ix.idxnames:
                    d[os.path.join(self.dir, name)] = ix
                elif not ix.force_keep:
                    debug1('midx: removing redundant: %s\n'
                           % os.path.basename(ix.name))
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
            except GitError as e:
        bfull = os.path.join(self.dir, 'bup.bloom')
        if self.bloom is None and os.path.exists(bfull):
            self.bloom = bloom.ShaBloom(bfull)
        self.packs = list(set(d.values()))
        self.packs.sort(reverse=True, key=lambda x: len(x))
        if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
580 """Insert an additional object in the list."""
def open_idx(filename):
    """Open a pack index by filename and return the matching object:
    PackIdxV1/PackIdxV2 for .idx files, midx.PackMidx for .midx files.

    Raises GitError for unsupported versions or unrecognized names.
    """
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        # The header read and version branches were truncated in this
        # definition; a v2 index starts with the '\377tOc' magic.
        header = f.read(8)
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            # No magic: a v1 index begins directly with the fanout table.
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList.

    Progress is reported while reading; the final "done" line is only
    printed when final_progress is true.
    """
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # The guard on final_progress was missing, which made the
        # parameter ineffective.
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default PackWriter objcache factory: an index over everything in
    the repository's objects/pack directory."""
    pack_dir = repo('objects/pack')
    return PackIdxList(pack_dir)
620 # bup-gc assumes that it can disable all PackWriter activities
621 # (bloom/midx/cache) via the constructor and close() arguments.
624 """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        # NOTE(review): several attribute initializations (count, outbytes,
        # file, idx, objcache, ...) are elided in this excerpt.
        self.repo_dir = repo_dir or repo()
        self.objcache_maker = objcache_maker
        self.compression_level = compression_level
        self.run_midx=run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            # Fall back to the git config limit, then a fixed default.
            max_pack_size = git_config_get('pack.packSizeLimit',
                                           repo_dir=self.repo_dir)
            if max_pack_size is not None:
                max_pack_size = parse_num(max_pack_size)
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)
659 def __exit__(self, type, value, traceback):
664 objdir = dir = os.path.join(self.repo_dir, 'objects')
665 fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
667 self.file = os.fdopen(fd, 'w+b')
672 self.parentfd = os.open(objdir, os.O_RDONLY)
678 assert(name.endswith('.pack'))
679 self.filename = name[:-5]
680 self.file.write('PACK\0\0\0\2\0\0\0\0')
681 self.idx = list(list() for i in xrange(256))
    def _raw_write(self, datalist, sha):
        # Write one already-encoded object to the pack file and record it
        # in the in-memory idx.
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        # NOTE(review): the try/except around the write (which defines
        # 'e' and 'nw') is elided; this is Python 2 re-raise syntax.
        raise GitError, e, sys.exc_info()[2]
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
    def _update_idx(self, sha, crc, size):
        # Record (sha, crc, offset) in the bucket selected by the sha's
        # first byte; the offset is where the object started in the pack.
        # NOTE(review): two leading lines of this method are elided in
        # this excerpt.
        self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
    def _write(self, sha, type, content):
        # Encode and write one object, then roll over to a new pack when
        # size/object limits are exceeded.
        # NOTE(review): leading lines, the closing argument of the
        # _raw_write call, the breakpoint() call, and the return are
        # elided in this excerpt.
        sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
721 def breakpoint(self):
722 """Clear byte and object counts and return the last processed id."""
723 id = self._end(self.run_midx)
724 self.outbytes = self.count = 0
727 def _require_objcache(self):
728 if self.objcache is None and self.objcache_maker:
729 self.objcache = self.objcache_maker()
730 if self.objcache is None:
732 "PackWriter not opened or can't check exists w/o objcache")
734 def exists(self, id, want_source=False):
735 """Return non-empty if an object is found in the object cache."""
736 self._require_objcache()
737 return self.objcache.exists(id, want_source=want_source)
739 def just_write(self, sha, type, content):
740 """Write an object to the pack file without checking for duplication."""
741 self._write(sha, type, content)
742 # If nothing else, gc doesn't have/want an objcache
743 if self.objcache is not None:
744 self.objcache.add(sha)
746 def maybe_write(self, type, content):
747 """Write an object to the pack file if not present and return its id."""
748 sha = calc_hash(type, content)
749 if not self.exists(sha):
750 self._require_objcache()
751 self.just_write(sha, type, content)
754 def new_blob(self, blob):
755 """Create a blob object in the pack with the supplied content."""
756 return self.maybe_write('blob', blob)
758 def new_tree(self, shalist):
759 """Create a tree object in the pack."""
760 content = tree_encode(shalist)
761 return self.maybe_write('tree', content)
    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
        # NOTE(review): the trailing parameter(s) of the signature (the
        # commit message), the tz-None branches, and the 'l = []'
        # initialization are elided in this excerpt.
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        adate_str = _git_date_str(adate_sec, adate_tz)
        adate_str = _local_git_date_str(adate_sec)
        cdate_str = _git_date_str(cdate_sec, cdate_tz)
        cdate_str = _local_git_date_str(cdate_sec)
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, adate_str))
        if committer: l.append('committer %s %s' % (committer, cdate_str))
        return self.maybe_write('commit', '\n'.join(l))
787 """Remove the pack file from disk."""
796 os.unlink(self.filename + '.pack')
    def _end(self, run_midx=True):
        # Finalize the current pack: patch the object count into the
        # header, append the pack sha1, write the .idx, and rename both
        # into their content-addressed names.
        # NOTE(review): many lines are elided in this excerpt (the binding
        # of f/idx, seeks, the Sha1 accumulator, writes, and the return).
        if not f: return None

        # update object count
        cp = struct.pack('!i', self.count)

        # calculate the pack sha1sum
        for b in chunkyreader(f):
        packbin = sum.digest()
        fdatasync(f.fileno())

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  'objects/pack/pack-' + obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        os.fsync(self.parentfd)
        os.close(self.parentfd)
        auto_midx(os.path.join(self.repo_dir, 'objects/pack'))
        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)
849 def close(self, run_midx=True):
850 """Close the pack file and move it to its definitive path."""
851 return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        # Write a version-2 pack index for the accumulated idx buckets,
        # delegating the bulk of the work to the C helper.
        # NOTE(review): many lines are elided in this excerpt (the
        # ofs64_count accumulation, the seek calls, the idx_sum
        # accumulator, and the return of namebase).
        for entry in section:
            if entry[2] >= 2**31:

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        idx_f.truncate(index_len)
        fdatasync(idx_f.fileno())
        idx_map = mmap_readwrite(idx_f, close=False)
        count = _helpers.write_idx(filename, idx_map, idx, self.count)
        assert(count == self.count)
        idx_f = open(filename, 'a+b')
        b = idx_f.read(8 + 4*256)
        # The object-list sha (over the sorted shas only) names the pack.
        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            obj_list_sum.update(b)
        namebase = obj_list_sum.hexdigest()
        for b in chunkyreader(idx_f):
        idx_f.write(idx_sum.digest())
        fdatasync(idx_f.fileno())
def _gitenv(repo_dir = None):
    # Used as a subprocess preexec_fn factory: points GIT_DIR at repo_dir
    # so child git commands operate on the bup repository.
    # NOTE(review): the repo_dir default handling and the returned closure
    # are elided in this excerpt.
    os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.
    """
    # NOTE(review): the conditional guards around the argv.append/extend
    # calls and the empty-output check are elided in this excerpt.
    argv = ['git', 'show-ref']
    argv.append('--heads')
    argv.append('--tags')
    argv.extend(patterns)
    p = subprocess.Popen(argv,
                         preexec_fn = _gitenv(repo_dir),
                         stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    for d in out.split('\n'):
        (sha, name) = d.split(' ', 1)
        # Python 2 hex decode; yields the binary oid.
        yield (name, sha.decode('hex'))
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Take at most two matches so we can detect ambiguity.
    l = tuple(islice(refs, 2))
    # NOTE(review): the length checks and the return are elided in this
    # excerpt.
def rev_list_invocation(ref_or_refs, count=None, format=None):
    # Build the argv for a `git rev-list` call; accepts one ref (a string)
    # or an iterable of refs.
    # NOTE(review): the else-branches (refs = ref_or_refs, the count-None
    # case), the format guard, the per-ref loop, and the return are elided
    # in this excerpt.
    if isinstance(ref_or_refs, compat.str_type):
        refs = (ref_or_refs,)
    argv = ['git', 'rev-list']
    if isinstance(count, Integral):
        argv.extend(['-n', str(count)])
        raise ValueError('unexpected count argument %r' % count)
    argv.append('--pretty=format:' + format)
    # Refuse refs that would be parsed as options.
    assert not ref.startswith('-')
def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.
    """
    # NOTE(review): the Popen call is missing its closing arguments, and
    # the loop bodies that bind 's' are elided in this excerpt.
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
                         preexec_fn = _gitenv(repo_dir),
                         stdout = subprocess.PIPE)
    for line in p.stdout:
        line = p.stdout.readline()
    if not s.startswith('commit '):
        raise Exception('unexpected line ' + s)
    yield s, parse(p.stdout)
    line = p.stdout.readline()
    rv = p.wait()  # not fatal
    # Python 2 raise-with-message syntax.
    raise GitError, 'git rev-list returned error %d' % rv
def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs.  For now, every unique
    string in refs must resolve to a different commit or this
    function will fail.

    Returns a list of author_sec (UTC epoch seconds) values, one per ref.
    """
    # The accumulator, loop header, and return were missing, leaving
    # 'ref' and 'result' undefined.
    result = []
    for ref in refs:
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
    return result
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # NOTE(review): the guards around the ref-hit return, the pack-index
    # existence check, and the final return are elided in this excerpt.
    head = read_ref(committish, repo_dir=repo_dir)
    debug2("resolved from ref: commit = %s\n" % head.encode('hex'))

    pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
    if len(committish) == 40:
        hash = committish.decode('hex')
def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    # NOTE(review): two leading lines of this function are elided in this
    # excerpt. Only heads and tags may be updated through here.
    assert(refname.startswith('refs/heads/') \
           or refname.startswith('refs/tags/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv(repo_dir))
    _git_wait('git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert(refname.startswith('refs/'))
    argv = ['git', 'update-ref', '-d', refname]
    # Passing the old value makes the delete conditional on it.
    if oldvalue:
        argv.append(oldvalue)
    p = subprocess.Popen(argv, preexec_fn = _gitenv())
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # NOTE(review): the `global repodir` declaration and the guards
    # selecting between path, BUP_DIR, and ~/.bup are elided here.
    repodir = os.environ.get('BUP_DIR')
    repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    # NOTE(review): the guess_repo(path) call preceding this is elided.
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv())
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv())
    _git_wait('git config', p)
    p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
                         stdout=sys.stderr, preexec_fn = _gitenv())
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    # NOTE(review): this function is heavily elided in this excerpt; the
    # binding of 'top', the success return, and the sys.exit calls are
    # missing. A repo "probably exists" when objects/pack is a directory.
    pst = stat_if_exists(top + '/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
    top_st = stat_if_exists(top)
    log('error: repository %r does not exist (see "bup help init")\n'
    log('error: %r is not a repository\n' % top)
def is_suitable_git(ver_str):
    """Classify a `git --version` output (bytes) as 'suitable',
    'insufficient' (older than 1.5.6), or 'unrecognized'."""
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        # 1.5.6 and later 1.x releases are fine.
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        # Any other numeric major version (2.x+) is fine.
        return 'suitable'
    # NOTE(review): the original fall-through was elided in this excerpt;
    # treat anything unparseable as unrecognized so the caller reports it.
    return 'unrecognized'
def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    """
    # NOTE(review): the `global _git_great` declaration, the early-return
    # bodies, the sys.exit on insufficient versions, and the final
    # assignment/assert are elided in this excerpt.
    if _git_great is not None:
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
    ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
    if status == 'suitable':
class _AbortableIter:
    # Wraps an iterator so that abandoning it early can trigger a cleanup
    # callback (used by CatPipe to restart its subprocess).
    # NOTE(review): most of this class (self.it/self.done setup, __iter__,
    # the next() scaffolding, and the abort/__del__ bodies) is elided in
    # this excerpt.
    def __init__(self, it, onabort = None):
        self.onabort = onabort

        return next(self.it)
        except StopIteration as e:

        """Abort iteration and call the abortion callback, if needed."""
1198 """Link to 'git cat-file' that is used to retrieve blob data."""
1199 def __init__(self, repo_dir = None):
1200 require_suitable_git()
1201 self.repo_dir = repo_dir
1202 self.p = self.inprogress = None
1206 self.p.stdout.close()
1207 self.p.stdin.close()
1209 self.inprogress = None
1213 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1214 stdin=subprocess.PIPE,
1215 stdout=subprocess.PIPE,
1218 preexec_fn = _gitenv(self.repo_dir))
1221 """Yield (oidx, type, size), followed by the data referred to by ref.
1222 If ref does not exist, only yield (None, None, None).
1225 if not self.p or self.p.poll() != None:
1228 poll_result = self.p.poll()
1229 assert(poll_result == None)
1231 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1232 assert(not self.inprogress)
1233 assert(ref.find('\n') < 0)
1234 assert(ref.find('\r') < 0)
1235 assert(not ref.startswith('-'))
1236 self.inprogress = ref
1237 self.p.stdin.write('%s\n' % ref)
1238 self.p.stdin.flush()
1239 hdr = self.p.stdout.readline()
1240 if hdr.endswith(' missing\n'):
1241 self.inprogress = None
1242 yield None, None, None
1244 info = hdr.split(' ')
1245 if len(info) != 3 or len(info[0]) != 40:
1246 raise GitError('expected object (id, type, size), got %r' % info)
1247 oidx, typ, size = info
1249 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1250 onabort=self._abort)
1252 yield oidx, typ, size
1255 readline_result = self.p.stdout.readline()
1256 assert(readline_result == '\n')
1257 self.inprogress = None
1258 except Exception as e:
    def _join(self, it):
        # Recursively yield the blob contents reachable from one object:
        # blobs yield directly, trees and commits recurse via self.join.
        # NOTE(review): the type-dispatch structure (the blob branch, the
        # `elif typ == 'tree':` line, and the inner yields) is elided in
        # this excerpt.
        _, typ, _ = next(it)
        treefile = ''.join(it)
        for (mode, name, sha) in tree_decode(treefile):
            for blob in self.join(sha.encode('hex')):
        elif typ == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
        raise GitError('invalid object type %r: expected blob/tree/commit'
1282 """Generate a list of the content of all blobs that can be reached
1283 from an object. The hash given in 'id' must point to a blob, a tree
1284 or a commit. The content of all blobs that can be seen from trees or
1285 commits will be added to the list.
1288 for d in self._join(self.get(id)):
1290 except StopIteration:
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    # NOTE(review): the guards around the default assignment, the cache
    # miss branch storing into _cp, and the return are elided here.
    repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    cp = CatPipe(repo_dir)
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    # NOTE(review): the tags dict initialization, the name-stripping line
    # that defines 'name', the setdefault branch, and the return are
    # elided in this excerpt.
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(n.startswith('refs/tags/'))
        tags[c].append(name)  # more than one tag can point at 'c'
class MissingObject(KeyError):
    """Raised by walk_object() when a referenced object is not in the
    repository; .oid is the binary object id."""
    def __init__(self, oid):
        # Store the oid so callers can recover which object was missing;
        # this assignment was elided/absent in the previous text.
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % oid.encode('hex'))
WalkItem = namedtuple('WalkItem', ('oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'))
# 'path' holds the mangled path components.  When an item is a fragment
# of a chunked file, 'chunk_path' holds the chunked-subtree path of that
# fragment, e.g. ['', '2d3115e', ...]; the top-level item of a chunked
# file has chunk_path == [''].  So a chunk subtree of '/foo/bar/baz'
# might look like:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
1341 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1342 """Yield everything reachable from oidx via get_ref (which must behave
1343 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1344 returns true. Throw MissingObject if a hash encountered is
1345 missing from the repository, and don't read or return blob content
1346 in the data field unless include_data is set.
1349 # Maintain the pending stack on the heap to avoid stack overflow
1350 pending = [(oidx, [], [], None)]
1352 oidx, parent_path, chunk_path, mode = pending.pop()
1353 oid = oidx.decode('hex')
1354 if stop_at and stop_at(oidx):
1357 if (not include_data) and mode and stat.S_ISREG(mode):
1358 # If the object is a "regular file", then it's a leaf in
1359 # the graph, so we can skip reading the data if the caller
1360 # hasn't requested it.
1361 yield WalkItem(oid=oid, type='blob',
1362 chunk_path=chunk_path, path=parent_path,
1367 item_it = get_ref(oidx)
1368 get_oidx, typ, _ = next(item_it)
1370 raise MissingObject(oidx.decode('hex'))
1371 if typ not in ('blob', 'commit', 'tree'):
1372 raise Exception('unexpected repository object type %r' % typ)
1374 # FIXME: set the mode based on the type when the mode is None
1375 if typ == 'blob' and not include_data:
1376 # Dump data until we can ask cat_pipe not to fetch it
1377 for ignored in item_it:
1381 data = ''.join(item_it)
1383 yield WalkItem(oid=oid, type=typ,
1384 chunk_path=chunk_path, path=parent_path,
1386 data=(data if include_data else None))
1389 commit_items = parse_commit(data)
1390 for pid in commit_items.parents:
1391 pending.append((pid, parent_path, chunk_path, mode))
1392 pending.append((commit_items.tree, parent_path, chunk_path,
1393 hashsplit.GIT_MODE_TREE))
1395 for mode, name, ent_id in tree_decode(data):
1396 demangled, bup_type = demangle_name(name, mode)
1398 sub_path = parent_path
1399 sub_chunk_path = chunk_path + [name]
1401 sub_path = parent_path + [name]
1402 if bup_type == BUP_CHUNKED:
1403 sub_chunk_path = ['']
1405 sub_chunk_path = chunk_path
1406 pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,