1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from collections import namedtuple
9 from itertools import islice
10 from numbers import Integral
12 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
13 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
15 hostname, localtime, log, merge_iter,
16 mmap_read, mmap_readwrite,
18 progress, qprogress, shstr, stat_if_exists,
19 unlink, username, userfullname,
repodir = None # The default repository, once initialized

# Git pack object type codes <-> type names (see the git pack format docs).
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
33 class GitError(Exception):
def _git_wait(cmd, p):
    """Wait for subprocess `p` (spawned from `cmd`) to finish.

    Raises GitError, naming `cmd`, if the exit status is nonzero.
    """
    rv = p.wait()
    if rv != 0:
        raise GitError('%s returned %d' % (shstr(cmd), rv))
def _git_capture(argv):
    """Run `argv` with the repository environment and return its stdout.

    Raises GitError (via _git_wait) if the command exits nonzero.
    """
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r
def git_config_get(option, repo_dir=None):
    """Return the value of git config `option` for the repository, or None
    if the option is unset.

    Raises GitError if git config fails for any reason other than the
    option being unset (git exits 1 for "not set").
    """
    cmd = ('git', 'config', '--get', option)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         preexec_fn=_gitenv(repo_dir=repo_dir))
    r = p.stdout.read()
    rc = p.wait()
    if rc == 0:
        return r
    elif rc != 1:
        raise GitError('%s returned %d' % (cmd, rc))
    return None
def parse_tz_offset(s):
    """Parse a git timezone string like '+0130' or '-0500'.

    Returns the UTC offset in seconds (negative for west-of-UTC).
    """
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if s[0] == '-':
        return - tz_off
    return tz_off
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Regex pieces used to parse raw commit objects (see parse_commit below).
# Characters permitted at the ends / inside of author and committer fields.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
_content_char = r'[^\0\n<>]'
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    _start_end_char, _content_char, _start_end_char)
# Timezone like '+0130'; parent lines carry 40-hex-digit hashes.
_tz_rx = r'[-+]\d\d[0-5]\d'
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Matches an entire commit object: tree, parents, author, committer, message.
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
# Extracts each parent hash from the <parents> group above.
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
# The *_offset fields are timezone offsets in seconds (see parse_tz_offset).
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse a raw git commit object into a CommitInfo namedtuple.

    Raises Exception if `content` doesn't match the commit layout
    described by _commit_rx above.
    """
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
def get_commit_items(id, cp):
    """Fetch the commit named by `id` through cat-pipe `cp` and return it
    parsed as a CommitInfo."""
    items = cp.get(id)
    _, typ, _ = next(items)
    assert typ == 'commit'
    return parse_commit(''.join(items))
def _local_git_date_str(epoch_sec):
    """Return git's '<seconds> <+HHMM>' date string for the local timezone."""
    offset = utc_offset_str(epoch_sec)
    return '%d %s' % (epoch_sec, offset)
126 def _git_date_str(epoch_sec, tz_offset_sec):
127 offs = tz_offset_sec // 60
128 return '%d %s%02d%02d' \
130 '+' if offs >= 0 else '-',
def repo(sub = '', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    # Fall back to the module-global default repository.
    repo_dir = repo_dir or repodir
        raise GitError('You should call check_repo_or_die()')
    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, '.git')
    if os.path.exists(gd):
    return os.path.join(repo_dir, sub)

    # Abbreviate any 40-hex-digit sha in the string to its first 7 digits.
    return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
    # Make `path` relative to the repo (and its index-cache) for display,
    # then abbreviate any hashes in it.
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(''))
    if not fullrepo.endswith('/'):
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith('index-cache/'):
        path = path[len('index-cache/'):]
    return shorten_hash(path)

    # Every place pack indexes may live: the repo's own pack dir plus any
    # index-cache subdirectories.
    paths = [repo('objects/pack')]
    paths += glob.glob(repo('index-cache/*/.'))
def auto_midx(objdir):
    """Regenerate the .midx and bloom files for objdir, recording any
    failures via add_error() rather than raising."""
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        add_error('%r: returned %d' % (args, rv))

    # And the same again for the bloom filter.
    args = [path.exe(), 'bloom', '--dir', objdir]
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    # A regular file stored with a git tree mode was hashsplit into chunks.
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
    # Covers names ending in '.bup' (including with one trailing char).
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'


# demangle_name() result kinds; see its docstring.
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is
    one of:

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith('.bupm'):
        # Metadata entries: chunked only when they describe a directory.
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git object ids hash '<type> <len>\0' followed by the content.
    header = '%s %d\0' % (type, len(content))


def shalist_item_sort_key(ent):
    # Git sorts tree entries as if directory names ended with '/'.
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):


def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    for (mode,name,bin) in shalist:
        assert(mode+0 == mode)
        assert(len(bin) == 20)  # raw binary sha, not hex
        s = '%o %s\0%s' % (mode,name,bin)
        assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    while ofs < len(buf):
        # Each entry is '<octal mode> <name>\0' followed by a 20-byte sha.
        z = buf.find('\0', ofs)
        spl = buf[ofs:z].split(' ', 1)
        assert(len(spl) == 2)
        sha = buf[z+1:z+1+20]
        yield (int(mode, 8), name, sha)


def _encode_packobj(type, content, compression_level=1):
    """Yield a packfile-style encoding of content: a variable-length
    type+size header followed by zlib-compressed data."""
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    # First byte carries the low 4 bits of the size and the 3-bit type code.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    # High bit set means more size bytes follow (7 bits per byte).
    if sz: szbits |= 0x80
    z = zlib.compressobj(compression_level)
    yield z.compress(content)
def _encode_looseobj(type, content, compression_level=1):
    """Yield a git loose-object encoding: zlib-compressed '<type> <size>',
    a NUL, then the content."""
    z = zlib.compressobj(compression_level)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)


def _decode_looseobj(buf):
    """Decode a loose object, returning (type, content)."""
    s = zlib.decompress(buf)
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)


def _decode_packobj(buf):
    """Decode a packed object, returning (type, uncompressed content)."""
    # Bits 4-6 of the first byte carry the object type code.
    type = _typermap[(c & 0x70) >> 4]
    # The size continues 7 bits at a time, least-significant chunk first.
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        return self._ofs_from_idx(idx)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            # When the caller wants to know which pack held it, return the
            # index's basename instead of True.
            return want_source and os.path.basename(self.name) or True

        # fanout[255] is the total object count.
        return int(self.fanout[255])

    def _idx_from_hash(self, hash):
        # Binary-search the sorted sha table for `hash`.
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        # The fanout table bounds the search to entries sharing byte b1.
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
            mid = start + (end-start)/2
            v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # V1 layout: 256-entry fanout, then 24-byte (ofs32, sha) entries.
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        # First 4 bytes of each 24-byte entry are the pack offset.
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        # The remaining 20 bytes are the raw binary sha.
        return str(self.shatable[idx*24+4 : idx*24+24])

        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Magic '\377tOc' plus a version field of 2.
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]
        # V2 layout: header, fanout, shas, crcs, 32-bit offsets, then a
        # 64-bit offset table for entries that don't fit in 31 bits.
        self.sha_ofs = 8 + 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        # High bit set: low 31 bits index into the 64-bit offset table.
        idx64 = ofs & 0x7fffffff
        ofs = struct.unpack('!Q',
                            str(buffer(self.ofs64table, idx64*8, 8)))[0]

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
    def __init__(self, dir):
        # Each list mmaps a great deal of address space; allow only one.
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        self.do_bloom = False
        assert(_mpi_count == 0)

        return iter(idxmerge(self.packs))

        # Total object count across all contained indexes.
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        if hash in self.also:
        # A bloom filter miss means the object is definitely absent.
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                # "Maybe present": fall through to the real indexes.
                self.do_bloom = False
                _total_searches -= 1 # was counted by bloom
        for i in xrange(len(self.packs)):
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            # reorder so most recently used packs are searched first
            self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been relaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # Keep the already-open indexes that we're still allowed to use.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     '  used by %s\n') % (n, mxf))
                # Prefer larger, newer midxes; they may supersede smaller ones.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                for full in glob.glob(os.path.join(self.dir,'*.idx')):
                    except GitError as e:
        bfull = os.path.join(self.dir, 'bup.bloom')
        if self.bloom is None and os.path.exists(bfull):
            self.bloom = bloom.ShaBloom(bfull)
        self.packs = list(set(d.values()))
        # Search larger indexes first.
        self.packs.sort(reverse=True, key=lambda x: len(x))
        if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

        """Insert an additional object in the list."""
def open_idx(filename):
    """Open a .idx or .midx file and return the matching index object.

    Raises GitError for unrecognized headers or extensions."""
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        # The '\377tOc' magic marks a version >= 2 index.
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
                return PackIdxV2(filename, f)
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            # No magic at all: a legacy (version 1) index.
            return PackIdxV1(filename, f)
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return midx.PackMidx(filename)
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    # merge_iter performs a sorted merge across all of the indexes.
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default PackWriter objcache factory: an index list over the repo's
    pack directory."""
    pack_dir = repo('objects/pack')
    return PackIdxList(pack_dir)
# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.

    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None):
        self.repo_dir = repo()
        self.objcache_maker = objcache_maker
        self.compression_level = compression_level
        self.run_midx=run_midx
        self.on_pack_finish = on_pack_finish
        # Pack size limit: explicit argument, then git's pack.packSizeLimit,
        # then a 1 GB default.
        if not max_pack_size:
            max_pack_size = git_config_get('pack.packSizeLimit',
                                           repo_dir=self.repo_dir)
                max_pack_size = parse_num(max_pack_size)
        if not max_pack_size:
            # larger packs slow down pruning
            max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)
        # Create the pack as a mkstemp temp file under objects/; it's
        # renamed to its final 'pack-<sha>' name by _end().
        objdir = dir = os.path.join(self.repo_dir, 'objects')
        fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
        self.file = os.fdopen(fd, 'w+b')
        # Keep the directory open so _end() can fsync it after the rename.
        self.parentfd = os.open(objdir, os.O_RDONLY)
        assert(name.endswith('.pack'))
        self.filename = name[:-5]
        # Pack header: 'PACK', version 2, object count 0 (patched in _end).
        self.file.write('PACK\0\0\0\2\0\0\0\0')
        # In-memory index: one (sha, crc, offset) bucket per first sha byte.
        self.idx = list(list() for i in xrange(256))
    def _raw_write(self, datalist, sha):
        """Write datalist to the pack file as one chunk and record `sha`
        (with its crc and offset) in the in-memory index."""
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
            raise GitError, e, sys.exc_info()[2]
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
    def _update_idx(self, sha, crc, size):
        # Bucket by first sha byte; the offset is where this object began.
        self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
        # Roll over to a fresh pack once either limit is reached.
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0

    def _require_objcache(self):
        # Lazily create the object cache; fail if none can be made.
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
                "PackWriter not opened or can't check exists w/o objcache")
713 def exists(self, id, want_source=False):
714 """Return non-empty if an object is found in the object cache."""
715 self._require_objcache()
716 return self.objcache.exists(id, want_source=want_source)
    def just_write(self, sha, type, content):
        """Write an object to the pack file, bypassing the objcache."""
        self._write(sha, type, content)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self.just_write(sha, type, content)
            # Record the new object so later exists() calls see it.
            self._require_objcache()
            self.objcache.add(sha)
732 def new_blob(self, blob):
733 """Create a blob object in the pack with the supplied content."""
734 return self.maybe_write('blob', blob)
736 def new_tree(self, shalist):
737 """Create a tree object in the pack."""
738 content = tree_encode(shalist)
739 return self.maybe_write('tree', content)
    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        # Explicit tz offsets use _git_date_str; otherwise local time.
            adate_str = _git_date_str(adate_sec, adate_tz)
            adate_str = _local_git_date_str(adate_sec)
            cdate_str = _git_date_str(cdate_sec, cdate_tz)
            cdate_str = _local_git_date_str(cdate_sec)
        # Assemble the commit header lines, skipping absent fields.
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, adate_str))
        if committer: l.append('committer %s %s' % (committer, cdate_str))
        return self.maybe_write('commit', '\n'.join(l))
765 """Remove the pack file from disk."""
774 os.unlink(self.filename + '.pack')
    def _end(self, run_midx=True):
        """Finish the current pack: patch in the object count, append the
        pack checksum, write the .idx, and rename both files into their
        final 'pack-<sha>' names.  Returns None if no pack was open."""
        if not f: return None
        # update object count
        cp = struct.pack('!i', self.count)
        # calculate the pack sha1sum
        for b in chunkyreader(f):
        packbin = sum.digest()
        fdatasync(f.fileno())
        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  'objects/pack/pack-' + obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        # fsync the directory so the renames are durable.
        os.fsync(self.parentfd)
        os.close(self.parentfd)
            auto_midx(os.path.join(self.repo_dir, 'objects/pack'))
        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)
827 def close(self, run_midx=True):
828 """Close the pack file and move it to its definitive path."""
829 return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a version-2 .idx file for this pack; return the hex sha of
        the sorted object list (used to name the final pack)."""
        # Count entries whose offset needs the 64-bit overflow table.
            for entry in section:
                if entry[2] >= 2**31:
        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        idx_f.truncate(index_len)
        fdatasync(idx_f.fileno())
        # Fill in the mmap'd index via the C helper.
        idx_map = mmap_readwrite(idx_f, close=False)
        count = _helpers.write_idx(filename, idx_map, idx, self.count)
        assert(count == self.count)
        idx_f = open(filename, 'a+b')
        b = idx_f.read(8 + 4*256)
        # The pack's name is the sha of its sorted object list.
        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            obj_list_sum.update(b)
        namebase = obj_list_sum.hexdigest()
        for b in chunkyreader(idx_f):
        idx_f.write(idx_sum.digest())
        fdatasync(idx_f.fileno())
def _gitenv(repo_dir = None):
    """Return a function (suitable as a Popen preexec_fn) that points
    GIT_DIR at repo_dir."""
        os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.
    """
    argv = ['git', 'show-ref']
        argv.append('--heads')
        argv.append('--tags')
        argv.extend(patterns)
    p = subprocess.Popen(argv,
                         preexec_fn = _gitenv(repo_dir),
                         stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    # Each show-ref output line is '<sha> <refname>'.
    for d in out.split('\n'):
        (sha, name) = d.split(' ', 1)
        yield (name, sha.decode('hex'))
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Take at most two matches so ambiguity can be detected.
    l = tuple(islice(refs, 2))


def rev_list_invocation(ref_or_refs, count=None, format=None):
    # Build the 'git rev-list' argv for a single ref or a sequence of refs.
    if isinstance(ref_or_refs, compat.str_type):
        refs = (ref_or_refs,)
    argv = ['git', 'rev-list']
    if isinstance(count, Integral):
        argv.extend(['-n', str(count)])
        raise ValueError('unexpected count argument %r' % count)
        argv.append('--pretty=format:' + format)
        # Refuse refs that could be mistaken for options.
        assert not ref.startswith('-')
def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.
    """
    # parse and format must be supplied together.
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
                         preexec_fn = _gitenv(repo_dir),
                         stdout = subprocess.PIPE)
        for line in p.stdout:
        line = p.stdout.readline()
            if not s.startswith('commit '):
                raise Exception('unexpected line ' + s)
            yield s[7:], parse(p.stdout)
            line = p.stdout.readline()
    rv = p.wait() # not fatal
        raise GitError, 'git rev-list returned error %d' % rv
def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs. For now, every unique
    string in refs must resolve to a different commit or this
    function will fail."""
        # author_sec is the (UTC) epoch-seconds author timestamp.
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)


def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # First try committish as a ref name...
    head = read_ref(committish, repo_dir=repo_dir)
        debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
    # ...then as a full 40-hex-digit object id in the pack indexes.
    pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
    if len(committish) == 40:
            hash = committish.decode('hex')


def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    # Only branch heads and tags may be updated through here.
    assert(refname.startswith('refs/heads/') \
           or refname.startswith('refs/tags/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv(repo_dir))
    _git_wait('git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Delete the repository reference `refname` (see git update-ref(1)),
    optionally verifying that its current value equals `oldvalue`."""
    assert refname.startswith('refs/')
    argv = ['git', 'update-ref', '-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    p = subprocess.Popen(argv, preexec_fn=_gitenv())
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # Fall back to $BUP_DIR, then ~/.bup.
    repodir = os.environ.get('BUP_DIR')
            repodir = os.path.expanduser('~/.bup')


def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv())
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv())
    _git_wait('git config', p)
    # Enable the reflog so ref updates stay recoverable.
    p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
                         stdout=sys.stderr, preexec_fn = _gitenv())
    _git_wait('git config', p)


def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    # A usable repo must at least contain an objects/pack directory.
    pst = stat_if_exists(top + '/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
    top_st = stat_if_exists(top)
        log('error: repository %r does not exist (see "bup help init")\n'
        log('error: %r is not a repository\n' % top)
1095 """Get Git's version and ensure a usable version is installed.
1097 The returned version is formatted as an ordered tuple with each position
1098 representing a digit in the version tag. For example, the following tuple
1099 would represent version 1.6.6.9:
1101 ('1', '6', '6', '9')
1105 p = subprocess.Popen(['git', '--version'],
1106 stdout=subprocess.PIPE)
1107 gvs = p.stdout.read()
1108 _git_wait('git --version', p)
1109 m = re.match(r'git version (\S+.\S+)', gvs)
1111 raise GitError('git --version weird output: %r' % gvs)
1112 _ver = tuple(m.group(1).split('.'))
1113 needed = ('1','5', '3', '1')
1115 raise GitError('git version %s or higher is required; you have %s'
1116 % ('.'.join(needed), '.'.join(_ver)))
class _AbortableIter:
    # Wraps an iterator so that a partially-consumed stream can be cleanly
    # aborted, notifying `onabort` (used by CatPipe to restart cat-file).
    def __init__(self, it, onabort = None):
        self.onabort = onabort
            return next(self.it)
        except StopIteration as e:

        """Abort iteration and call the abortion callback, if needed."""


    """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self, repo_dir = None):
        self.repo_dir = repo_dir
        # 'git cat-file --batch' requires at least git 1.5.6.
        wanted = ('1','5','6')
            log('error: git version must be at least 1.5.6\n')
        self.p = self.inprogress = None

        # Tear down the running cat-file subprocess (e.g. after an abort).
        self.p.stdout.close()
        self.p.stdin.close()
        self.inprogress = None

        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  preexec_fn = _gitenv(self.repo_dir))
1179 """Yield (oidx, type, size), followed by the data referred to by ref.
1180 If ref does not exist, only yield (None, None, None).
1183 if not self.p or self.p.poll() != None:
1186 poll_result = self.p.poll()
1187 assert(poll_result == None)
1189 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1190 assert(not self.inprogress)
1191 assert(ref.find('\n') < 0)
1192 assert(ref.find('\r') < 0)
1193 assert(not ref.startswith('-'))
1194 self.inprogress = ref
1195 self.p.stdin.write('%s\n' % ref)
1196 self.p.stdin.flush()
1197 hdr = self.p.stdout.readline()
1198 if hdr.endswith(' missing\n'):
1199 self.inprogress = None
1200 yield None, None, None
1202 info = hdr.split(' ')
1203 if len(info) != 3 or len(info[0]) != 40:
1204 raise GitError('expected object (id, type, size), got %r' % spl)
1205 oidx, typ, size = info
1207 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1208 onabort=self._abort)
1210 yield oidx, typ, size
1213 readline_result = self.p.stdout.readline()
1214 assert(readline_result == '\n')
1215 self.inprogress = None
1216 except Exception as e:
    def _join(self, it):
        # Recursively expand an object stream into the contents of all
        # blobs reachable from it.
        _, typ, _ = next(it)
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
        elif typ == 'commit':
            # For a commit, join the tree named on its first header line.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
            raise GitError('invalid object type %r: expected blob/tree/commit'

        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
            for d in self._join(self.get(id)):
        except StopIteration:


def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    # One CatPipe is cached per absolute repo path.
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
        cp = CatPipe(repo_dir)


def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(n.startswith('refs/tags/'))
            tags[c].append(name)  # more than one tag can point at 'c'
class MissingObject(KeyError):
    # Raised when an object referenced during a repository walk is absent.
    def __init__(self, oid):
        KeyError.__init__(self, 'object %r is missing' % oid.encode('hex'))


# A single object yielded by walk_object().
WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
# chunked file will have a chunk_path of ['']. So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
1299 def walk_object(cat_pipe, oidx,
1302 """Yield everything reachable from oidx via cat_pipe as a WalkItem,
1303 stopping whenever stop_at(oidx) returns true. Throw MissingObject
1304 if a hash encountered is missing from the repository, and don't
1305 read or return blob content in the data field unless include_data
1308 # Maintain the pending stack on the heap to avoid stack overflow
1309 pending = [(oidx, [], [], None)]
1311 oidx, parent_path, chunk_path, mode = pending.pop()
1312 oid = oidx.decode('hex')
1313 if stop_at and stop_at(oidx):
1316 if (not include_data) and mode and stat.S_ISREG(mode):
1317 # If the object is a "regular file", then it's a leaf in
1318 # the graph, so we can skip reading the data if the caller
1319 # hasn't requested it.
1320 yield WalkItem(oid=oid, type='blob',
1321 chunk_path=chunk_path, path=parent_path,
1326 item_it = cat_pipe.get(oidx)
1327 get_oidx, typ, _ = next(item_it)
1329 raise MissingObject(oidx.decode('hex'))
1330 if typ not in ('blob', 'commit', 'tree'):
1331 raise Exception('unexpected repository object type %r' % typ)
1333 # FIXME: set the mode based on the type when the mode is None
1334 if typ == 'blob' and not include_data:
1335 # Dump data until we can ask cat_pipe not to fetch it
1336 for ignored in item_it:
1340 data = ''.join(item_it)
1342 yield WalkItem(oid=oid, type=typ,
1343 chunk_path=chunk_path, path=parent_path,
1345 data=(data if include_data else None))
1348 commit_items = parse_commit(data)
1349 for pid in commit_items.parents:
1350 pending.append((pid, parent_path, chunk_path, mode))
1351 pending.append((commit_items.tree, parent_path, chunk_path,
1352 hashsplit.GIT_MODE_TREE))
1354 for mode, name, ent_id in tree_decode(data):
1355 demangled, bup_type = demangle_name(name, mode)
1357 sub_path = parent_path
1358 sub_chunk_path = chunk_path + [name]
1360 sub_path = parent_path + [name]
1361 if bup_type == BUP_CHUNKED:
1362 sub_chunk_path = ['']
1364 sub_chunk_path = chunk_path
1365 pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,