1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
7 from collections import namedtuple
8 from itertools import islice
9 from numbers import Integral
11 from bup import _helpers, hashsplit, path, midx, bloom, xstat
12 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
14 hostname, localtime, log, merge_iter,
15 mmap_read, mmap_readwrite,
17 progress, qprogress, stat_if_exists,
18 unlink, username, userfullname,
# Module state: path of the default repository; set by guess_repo() /
# check_repo_or_die() and consumed by repo().
repodir = None # The default repository, once initialized

# Map between git object-type names and the numeric type codes stored in
# pack files (and the reverse map for decoding).
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
32 class GitError(Exception):
def _git_wait(cmd, p):
    """Wait for subprocess p; raise GitError if it exited nonzero."""
    rv = p.wait()
    if rv != 0:
        raise GitError('%s returned %d' % (cmd, rv))


def _git_capture(argv):
    """Run argv with GIT_DIR set (via _gitenv) and return its stdout."""
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r


def git_config_get(option, repo_dir=None):
    """Return the value of git config 'option' in repo_dir, or None if unset.

    git config exits 0 when the option is found and 1 when it is not;
    any other exit status raises GitError.
    """
    cmd = ('git', 'config', '--get', option)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         preexec_fn=_gitenv(repo_dir=repo_dir))
    r = p.stdout.read()
    rc = p.wait()
    if rc == 0:
        return r
    elif rc != 1:
        raise GitError('%s returned %d' % (cmd, rc))
    return None
def parse_tz_offset(s):
    """Return the UTC offset in seconds for a git '[+-]HHMM' timezone string."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    # A leading '-' means west of UTC; negate the magnitude.
    if s[0] == '-':
        return - tz_off
    return tz_off
68 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
69 # Make sure that's authoritative.
70 _start_end_char = r'[^ .,:;<>"\'\0\n]'
71 _content_char = r'[^\0\n<>]'
72 _safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
74 _start_end_char, _content_char, _start_end_char)
75 _tz_rx = r'[-+]\d\d[0-5]\d'
76 _parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
77 _commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
78 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
79 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)
81 (?P<message>(?:.|\n)*)''' % (_parent_rx,
82 _safe_str_rx, _safe_str_rx, _tz_rx,
83 _safe_str_rx, _safe_str_rx, _tz_rx))
84 _parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])
def parse_commit(content):
    """Parse a git commit object's text and return a CommitInfo namedtuple.

    Raises Exception if content does not match _commit_rx.
    """
    commit_match = re.match(_commit_rx, content)
    # Guard before raising: only complain when the regex did NOT match.
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
def get_commit_items(id, cp):
    """Fetch commit 'id' through cat-pipe 'cp' and return it as a CommitInfo."""
    items_it = cp.get(id)
    kind = next(items_it)
    assert(kind == 'commit')
    raw = ''.join(items_it)
    return parse_commit(raw)


def _local_git_date_str(epoch_sec):
    """Format epoch_sec in git date format using the local UTC offset."""
    offset = utc_offset_str(epoch_sec)
    return '%d %s' % (epoch_sec, offset)
124 def _git_date_str(epoch_sec, tz_offset_sec):
125 offs = tz_offset_sec // 60
126 return '%d %s%02d%02d' \
128 '+' if offs >= 0 else '-',
def repo(sub = '', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, '.git')
    if os.path.exists(gd):
        repo_dir = gd
    return os.path.join(repo_dir, sub)
def shorten_hash(s):
    """Abbreviate any 40-char lowercase-hex hashes in s to '<first 7>*'."""
    return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
                  r'\1\2*\3', s)


def repo_rel(path):
    """Return path relative to the repository (and its index-cache),
    with any embedded hashes shortened for display."""
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(''))
    if not fullrepo.endswith('/'):
        fullrepo += '/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith('index-cache/'):
        path = path[len('index-cache/'):]
    return shorten_hash(path)
def all_packdirs():
    """Return the local pack directory plus any index-cache pack directories."""
    paths = [repo('objects/pack')]
    paths += glob.glob(repo('index-cache/*/.'))
    return paths


def auto_midx(objdir):
    """Run 'bup midx --auto' and 'bup bloom' on objdir.

    Failures are recorded via add_error() instead of raising, since index
    regeneration is best-effort.
    """
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    try:
        # 'with' closes the /dev/null handle (the original version leaked it).
        with open('/dev/null', 'w') as null:
            rv = subprocess.call(args, stdout=null)
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        return
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
    try:
        with open('/dev/null', 'w') as null:
            rv = subprocess.call(args, stdout=null)
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        return
    if rv:
        add_error('%r: returned %d' % (args, rv))
193 def mangle_name(name, mode, gitmode):
194 """Mangle a file name to present an abstract name for segmented files.
195 Mangled file names will have the ".bup" extension added to them. If a
196 file's name already ends with ".bup", a ".bupl" extension is added to
197 disambiguate normal files from segmented ones.
199 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
200 assert(stat.S_ISDIR(gitmode))
202 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
203 return name + '.bupl'
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of
    the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith('.bupm'):
        # Metadata entry: chunked only when it describes a directory.
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    else:
        return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = '%s %d\0' % (type, len(content))
    sum = Sha1()
    sum.update(header)
    sum.update(content)
    return sum.digest()


def shalist_item_sort_key(ent):
    """Sort key for (mode, name, id) tree entries.

    Directories sort with a trailing '/', matching git's tree ordering.
    """
    (mode, name, id) = ent
    assert(mode+0 == mode)  # mode must be an integer
    if stat.S_ISDIR(mode):
        return name + '/'
    else:
        return name
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = '%o %s\0%s' % (mode,name,bin)
        assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return ''.join(l)
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        z = buf.find('\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(' ', 1)
        assert(len(spl) == 2)
        mode, name = spl
        # The 20-byte binary sha immediately follows the NUL terminator.
        sha = buf[z+1:z+1+20]
        ofs = z + 1 + 20
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    """Yield the pack encoding of content: a varint size header carrying the
    numeric type code, followed by the zlib-compressed content."""
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = ''
    sz = len(content)
    # First byte: low 4 bits of size plus the type code; high bit set means
    # more size bytes follow (7 bits each).
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()


def _encode_looseobj(type, content, compression_level=1):
    """Yield the loose-object encoding: zlib('<type> <len>\\0' + content)."""
    z = zlib.compressobj(compression_level)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()  # without the flush the zlib stream would be truncated


def _decode_looseobj(buf):
    """Decode a loose object; return (type, content)."""
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find('\0')
    assert(i > 0)
    l = s[:i].split(' ')
    assert(len(l) == 2)
    assert(l[1].isdigit())
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)


def _decode_packobj(buf):
    """Decode an object produced by _encode_packobj; return (type, content)."""
    assert(buf)
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    # Continue while the high bit indicates more size bytes.
    while c & 0x80:
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
339 def find_offset(self, hash):
340 """Get the offset of an object inside the index file."""
341 idx = self._idx_from_hash(hash)
343 return self._ofs_from_idx(idx)
346 def exists(self, hash, want_source=False):
347 """Return nonempty if the object exists in this index."""
348 if hash and (self._idx_from_hash(hash) != None):
349 return want_source and os.path.basename(self.name) or True
353 return int(self.fanout[255])
355 def _idx_from_hash(self, hash):
356 global _total_searches, _total_steps
358 assert(len(hash) == 20)
360 start = self.fanout[b1-1] # range -1..254
361 end = self.fanout[b1] # range 0..255
363 _total_steps += 1 # lookup table is a step
366 mid = start + (end-start)/2
367 v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # 256-entry fanout table of cumulative object counts, one per
        # possible first sha byte.
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 256*4
        # v1 entries are 24 bytes each: 4-byte offset + 20-byte sha.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*24+4 : idx*24+24])

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Magic '\377tOc' plus version 2.
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # v2 layout: shas, then crcs (4 bytes each), then 4-byte offsets,
        # then the 8-byte offsets for packs larger than 2 GiB.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
                               nsha*4)
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        # High bit set means the value is an index into the 64-bit table.
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        return ofs

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
438 def __init__(self, dir):
440 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
445 self.do_bloom = False
452 assert(_mpi_count == 0)
455 return iter(idxmerge(self.packs))
458 return sum(len(pack) for pack in self.packs)
460 def exists(self, hash, want_source=False):
461 """Return nonempty if the object exists in the index files."""
462 global _total_searches
464 if hash in self.also:
466 if self.do_bloom and self.bloom:
467 if self.bloom.exists(hash):
468 self.do_bloom = False
470 _total_searches -= 1 # was counted by bloom
472 for i in xrange(len(self.packs)):
474 _total_searches -= 1 # will be incremented by sub-pack
475 ix = p.exists(hash, want_source=want_source)
477 # reorder so most recently used packs are searched first
478 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.
        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        # NOTE(review): parts of this method look elided/garbled in this copy
        # ('midxl' and 'ix' are used before any visible assignment, and there
        # is a dangling elif below) -- reconcile with upstream bup before
        # trusting the control flow here.
        self.bloom = None # Always reopen the bloom as it may have been relaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # Keep known packs, dropping midxes when they are being skipped.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            for ix in self.packs:
                if isinstance(ix, midx.PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            for full in glob.glob(os.path.join(self.dir,'*.midx')):
                mx = midx.PackMidx(full)
                (mxd, mxf) = os.path.split(mx.name)
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n' +
                             ' used by %s\n') % (n, mxf))
            # Prefer the largest, newest midx files first.
            midxl.sort(key=lambda ix:
                       (-len(ix), -xstat.stat(ix.name).st_mtime))
            for sub in ix.idxnames:
                found = d.get(os.path.join(self.dir, sub))
                if not found or isinstance(found, PackIdx):
                    # doesn't exist, or exists but not in a midx
                for name in ix.idxnames:
                    d[os.path.join(self.dir, name)] = ix
            elif not ix.force_keep:
                debug1('midx: removing redundant: %s\n'
                       % os.path.basename(ix.name))
        for full in glob.glob(os.path.join(self.dir,'*.idx')):
            except GitError as e:
        bfull = os.path.join(self.dir, 'bup.bloom')
        if self.bloom is None and os.path.exists(bfull):
            self.bloom = bloom.ShaBloom(bfull)
        self.packs = list(set(d.values()))
        # Sort packs by descending object count (py2 cmp-style sort).
        self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
        if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
563 """Insert an additional object in the list."""
def open_idx(filename):
    """Open a .idx (v1 or v2) or .midx file and return the matching
    index object.  Raises GitError for unrecognized files."""
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            # v1 indexes have no magic; they start with the fanout table.
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # Honor final_progress; without this guard the parameter is dead.
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)


def _make_objcache():
    # Default objcache factory for PackWriter: an index of every local pack.
    return PackIdxList(repo('objects/pack'))
603 # bup-gc assumes that it can disable all PackWriter activities
604 # (bloom/midx/cache) via the constructor and close() arguments.
607 """Writes Git objects inside a pack file."""
608 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
609 run_midx=True, on_pack_finish=None,
610 max_pack_size=None, max_pack_objects=None):
611 self.repo_dir = repo()
618 self.objcache_maker = objcache_maker
620 self.compression_level = compression_level
621 self.run_midx=run_midx
622 self.on_pack_finish = on_pack_finish
623 if not max_pack_size:
624 max_pack_size = git_config_get('pack.packSizeLimit',
625 repo_dir=self.repo_dir)
626 if max_pack_size is not None:
627 max_pack_size = parse_num(max_pack_size)
628 if not max_pack_size:
629 # larger packs slow down pruning
630 max_pack_size = 1000 * 1000 * 1000
631 self.max_pack_size = max_pack_size
632 # cache memory usage is about 83 bytes per object
633 self.max_pack_objects = max_pack_objects if max_pack_objects \
634 else max(1, self.max_pack_size // 5000)
641 objdir = dir = os.path.join(self.repo_dir, 'objects')
642 fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
644 self.file = os.fdopen(fd, 'w+b')
649 self.parentfd = os.open(objdir, os.O_RDONLY)
655 assert(name.endswith('.pack'))
656 self.filename = name[:-5]
657 self.file.write('PACK\0\0\0\2\0\0\0\0')
658 self.idx = list(list() for i in xrange(256))
660 def _raw_write(self, datalist, sha):
663 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
664 # the file never has a *partial* blob. So let's make sure it's
665 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
666 # to our hashsplit algorithm.) f.write() does its own buffering,
667 # but that's okay because we'll flush it in _end().
668 oneblob = ''.join(datalist)
672 raise GitError, e, sys.exc_info()[2]
674 crc = zlib.crc32(oneblob) & 0xffffffff
675 self._update_idx(sha, crc, nw)
680 def _update_idx(self, sha, crc, size):
683 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
685 def _write(self, sha, type, content):
689 sha = calc_hash(type, content)
690 size, crc = self._raw_write(_encode_packobj(type, content,
691 self.compression_level),
693 if self.outbytes >= self.max_pack_size \
694 or self.count >= self.max_pack_objects:
698 def breakpoint(self):
699 """Clear byte and object counts and return the last processed id."""
700 id = self._end(self.run_midx)
701 self.outbytes = self.count = 0
704 def _require_objcache(self):
705 if self.objcache is None and self.objcache_maker:
706 self.objcache = self.objcache_maker()
707 if self.objcache is None:
709 "PackWriter not opened or can't check exists w/o objcache")
711 def exists(self, id, want_source=False):
712 """Return non-empty if an object is found in the object cache."""
713 self._require_objcache()
714 return self.objcache.exists(id, want_source=want_source)
716 def just_write(self, sha, type, content):
717 """Write an object to the pack file, bypassing the objcache. Fails if
719 self._write(sha, type, content)
721 def maybe_write(self, type, content):
722 """Write an object to the pack file if not present and return its id."""
723 sha = calc_hash(type, content)
724 if not self.exists(sha):
725 self.just_write(sha, type, content)
726 self._require_objcache()
727 self.objcache.add(sha)
730 def new_blob(self, blob):
731 """Create a blob object in the pack with the supplied content."""
732 return self.maybe_write('blob', blob)
734 def new_tree(self, shalist):
735 """Create a tree object in the pack."""
736 content = tree_encode(shalist)
737 return self.maybe_write('tree', content)
739 def new_commit(self, tree, parent,
740 author, adate_sec, adate_tz,
741 committer, cdate_sec, cdate_tz,
743 """Create a commit object in the pack. The date_sec values must be
744 epoch-seconds, and if a tz is None, the local timezone is assumed."""
746 adate_str = _git_date_str(adate_sec, adate_tz)
748 adate_str = _local_git_date_str(adate_sec)
750 cdate_str = _git_date_str(cdate_sec, cdate_tz)
752 cdate_str = _local_git_date_str(cdate_sec)
754 if tree: l.append('tree %s' % tree.encode('hex'))
755 if parent: l.append('parent %s' % parent.encode('hex'))
756 if author: l.append('author %s %s' % (author, adate_str))
757 if committer: l.append('committer %s %s' % (committer, cdate_str))
760 return self.maybe_write('commit', '\n'.join(l))
763 """Remove the pack file from disk."""
772 os.unlink(self.filename + '.pack')
    def _end(self, run_midx=True):
        """Finish the open pack: patch the object count into the header,
        append the pack sha1, write the .idx, and rename both into
        objects/pack.  Optionally regenerate the midx afterwards.

        NOTE(review): several statements appear to be missing from this
        fragment ('f', 'sum', and 'idx' are referenced but never assigned
        here); reconcile with upstream bup before relying on it.
        """
        if not f: return None
        # update object count
        cp = struct.pack('!i', self.count)
        # calculate the pack sha1sum
        for b in chunkyreader(f):
        packbin = sum.digest()
        fdatasync(f.fileno())
        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
        # Final name embeds the sha of the sorted object list.
        nameprefix = os.path.join(self.repo_dir,
                                  'objects/pack/pack-' + obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        # Flush the directory entry so the rename is durable.
        os.fsync(self.parentfd)
        os.close(self.parentfd)
        auto_midx(os.path.join(self.repo_dir, 'objects/pack'))
        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)
825 def close(self, run_midx=True):
826 """Close the pack file and move it to its definitive path."""
827 return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a v2 pack index for the in-memory 'idx' buckets next to the
        pack, then checksum it.

        NOTE(review): this fragment appears elided ('ofs64_count' and
        'idx_sum' are never assigned, and two loop bodies are missing);
        verify against upstream bup before editing.
        """
        for entry in section:
            if entry[2] >= 2**31:
        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        idx_f.truncate(index_len)
        fdatasync(idx_f.fileno())
        # The C helper fills the mmapped index in one pass.
        idx_map = mmap_readwrite(idx_f, close=False)
        count = _helpers.write_idx(filename, idx_map, idx, self.count)
        assert(count == self.count)
        idx_f = open(filename, 'a+b')
        b = idx_f.read(8 + 4*256)
        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            obj_list_sum.update(b)
        # namebase is the sha of the sorted object list (used for pack names).
        namebase = obj_list_sum.hexdigest()
        for b in chunkyreader(idx_f):
        idx_f.write(idx_sum.digest())
        fdatasync(idx_f.fileno())
def _gitenv(repo_dir = None):
    """Return a callable (for Popen's preexec_fn) that points GIT_DIR at
    repo_dir, defaulting to the current repository."""
    if not repo_dir:
        repo_dir = repo()
    def env():
        os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
    return env
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.
    """
    argv = ['git', 'show-ref']
    if limit_to_heads:
        argv.append('--heads')
    if limit_to_tags:
        argv.append('--tags')
    argv.append('--')
    if patterns:
        argv.extend(patterns)
    p = subprocess.Popen(argv,
                         preexec_fn = _gitenv(repo_dir),
                         stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Take at most two so the ambiguity assert below can fire.
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    return None
925 def rev_list(ref, count=None, repo_dir=None):
926 """Generate a list of reachable commits in reverse chronological order.
928 This generator walks through commits, from child to parent, that are
929 reachable via the specified ref and yields a series of tuples of the form
932 If count is a non-zero integer, limit the number of commits to "count"
935 assert(not ref.startswith('-'))
937 if isinstance(count, Integral):
938 opts += ['-n', str(count)]
941 argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
942 p = subprocess.Popen(argv,
943 preexec_fn = _gitenv(repo_dir),
944 stdout = subprocess.PIPE)
948 if s.startswith('commit '):
949 commit = s[7:].decode('hex')
953 rv = p.wait() # not fatal
955 raise GitError, 'git rev-list returned error %d' % rv
def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs. For now, every unique
    string in refs must resolve to a different commit or this
    function will fail."""
    result = []
    for ref in refs:
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
    return result
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
        return head

    pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            hash = committish.decode('hex')
        except TypeError:
            # Not valid hex after all.
            return None

        if pL.exists(hash):
            return hash

    return None
def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference.  Only refs under refs/heads or
    refs/tags may be updated."""
    if not oldval:
        oldval = ''
    assert(refname.startswith('refs/heads/') \
           or refname.startswith('refs/tags/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv(repo_dir))
    _git_wait('git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert(refname.startswith('refs/'))
    extra = [oldvalue] if oldvalue else []
    cmd = ['git', 'update-ref', '-d', refname] + extra
    p = subprocess.Popen(cmd, preexec_fn = _gitenv())
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        # Fall back to $BUP_DIR, then the per-user default.
        repodir = os.environ.get('BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo() # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv())
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv())
    _git_wait('git config', p)
    # Enable the reflog so ref updates are recoverable.
    p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
                         stdout=sys.stderr, preexec_fn = _gitenv())
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + '/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % top)
            sys.exit(15)
    log('error: %r is not a repository\n' % top)
    sys.exit(14)
1075 """Get Git's version and ensure a usable version is installed.
1077 The returned version is formatted as an ordered tuple with each position
1078 representing a digit in the version tag. For example, the following tuple
1079 would represent version 1.6.6.9:
1081 ('1', '6', '6', '9')
1085 p = subprocess.Popen(['git', '--version'],
1086 stdout=subprocess.PIPE)
1087 gvs = p.stdout.read()
1088 _git_wait('git --version', p)
1089 m = re.match(r'git version (\S+.\S+)', gvs)
1091 raise GitError('git --version weird output: %r' % gvs)
1092 _ver = tuple(m.group(1).split('.'))
1093 needed = ('1','5', '3', '1')
1095 raise GitError('git version %s or higher is required; you have %s'
1096 % ('.'.join(needed), '.'.join(_ver)))
1100 class _AbortableIter:
1101 def __init__(self, it, onabort = None):
1103 self.onabort = onabort
1111 return next(self.it)
1112 except StopIteration as e:
1120 """Abort iteration and call the abortion callback, if needed."""
1132 """Link to 'git cat-file' that is used to retrieve blob data."""
1133 def __init__(self, repo_dir = None):
1135 self.repo_dir = repo_dir
1136 wanted = ('1','5','6')
1138 log('error: git version must be at least 1.5.6\n')
1140 self.p = self.inprogress = None
1144 self.p.stdout.close()
1145 self.p.stdin.close()
1147 self.inprogress = None
1151 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1152 stdin=subprocess.PIPE,
1153 stdout=subprocess.PIPE,
1156 preexec_fn = _gitenv(self.repo_dir))
    def get(self, id, size=False):
        """Yield info about object id, and then if the object exists, all of
        the data referred to by the object. If size is false the info
        will just be the object type name. If size is true, the info
        will be (type, size). When the object does not exist, in both
        cases the type will be None.
        """
        # NOTE(review): this fragment appears elided (no try block matching
        # the 'except' below, 'sz' is never assigned, and the restart/yield
        # paths are missing); reconcile with upstream bup before editing.
        if not self.p or self.p.poll() != None:
            poll_result = self.p.poll()
            assert(poll_result == None)
            log('get: opening %r while %r is open\n' % (id, self.inprogress))
        # Only one request may be in flight on the batch pipe at a time.
        assert(not self.inprogress)
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        it = _AbortableIter(chunkyreader(self.p.stdout, sz),
                            onabort=self._abort)
            # cat-file terminates each object's data with a newline.
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception as e:
    def _join(self, it):
        """Recursively yield the blob data reachable from the object stream
        'it' (a get() result: blob, tree, or commit).

        NOTE(review): the leading 'type = next(it)' and the blob branch
        appear to be missing from this fragment (dangling elif below);
        reconcile with upstream bup.
        """
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                # Recurse into each tree entry by its hex sha.
                for blob in self.join(sha.encode('hex')):
        elif type == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
        raise GitError('invalid object type %r: expected blob/tree/commit'
1229 """Generate a list of the content of all blobs that can be reached
1230 from an object. The hash given in 'id' must point to a blob, a tree
1231 or a commit. The content of all blobs that can be seen from trees or
1232 commits will be added to the list.
1235 for d in self._join(self.get(id)):
1237 except StopIteration:
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp
    if not repo_dir:
        repo_dir = repodir or repo()
    # Cache one CatPipe per absolute repository path.
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(n.startswith('refs/tags/'))
        name = n[10:]
        if not c in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags
class MissingObject(KeyError):
    """Raised when a hash referenced during a walk is absent from the repo."""
    def __init__(self, id):
        self.id = id
        KeyError.__init__(self, 'object %r is missing' % id.encode('hex'))
WalkItem = namedtuple(
    'WalkItem',
    ['id', 'type', 'mode', 'path', 'chunk_path', 'data'])
1276 # The path is the mangled path, and if an item represents a fragment
1277 # of a chunked file, the chunk_path will be the chunked subtree path
1278 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1279 # chunked file will have a chunk_path of ['']. So some chunk subtree
1280 # of the file '/foo/bar/baz' might look like this:
1282 # item.path = ['foo', 'bar', 'baz.bup']
1283 # item.chunk_path = ['', '2d3115e', '016b097']
1284 # item.type = 'tree'
1288 def walk_object(cat_pipe, id,
1291 """Yield everything reachable from id via cat_pipe as a WalkItem,
1292 stopping whenever stop_at(id) returns true. Throw MissingObject
1293 if a hash encountered is missing from the repository, and don't
1294 read or return blob content in the data field unless include_data
1297 # Maintain the pending stack on the heap to avoid stack overflow
1298 pending = [(id, [], [], None)]
1300 id, parent_path, chunk_path, mode = pending.pop()
1301 if stop_at and stop_at(id):
1304 if (not include_data) and mode and stat.S_ISREG(mode):
1305 # If the object is a "regular file", then it's a leaf in
1306 # the graph, so we can skip reading the data if the caller
1307 # hasn't requested it.
1308 yield WalkItem(id=id, type='blob',
1309 chunk_path=chunk_path, path=parent_path,
1314 item_it = cat_pipe.get(id)
1315 type = next(item_it)
1317 raise MissingObject(id.decode('hex'))
1318 if type not in ('blob', 'commit', 'tree'):
1319 raise Exception('unexpected repository object type %r' % type)
1321 # FIXME: set the mode based on the type when the mode is None
1322 if type == 'blob' and not include_data:
1323 # Dump data until we can ask cat_pipe not to fetch it
1324 for ignored in item_it:
1328 data = ''.join(item_it)
1330 yield WalkItem(id=id, type=type,
1331 chunk_path=chunk_path, path=parent_path,
1333 data=(data if include_data else None))
1335 if type == 'commit':
1336 commit_items = parse_commit(data)
1337 for pid in commit_items.parents:
1338 pending.append((pid, parent_path, chunk_path, mode))
1339 pending.append((commit_items.tree, parent_path, chunk_path,
1340 hashsplit.GIT_MODE_TREE))
1341 elif type == 'tree':
1342 for mode, name, ent_id in tree_decode(data):
1343 demangled, bup_type = demangle_name(name, mode)
1345 sub_path = parent_path
1346 sub_chunk_path = chunk_path + [name]
1348 sub_path = parent_path + [name]
1349 if bup_type == BUP_CHUNKED:
1350 sub_chunk_path = ['']
1352 sub_chunk_path = chunk_path
1353 pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,