1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
7 from collections import namedtuple
8 from itertools import islice
9 from numbers import Integral
11 from bup import _helpers, hashsplit, path, midx, bloom, xstat
12 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
14 hostname, localtime, log, merge_iter,
15 mmap_read, mmap_readwrite,
17 progress, qprogress, stat_if_exists,
18 unlink, username, userfullname,
23 repodir = None # The default repository, once initialized
25 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
26 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
32 class GitError(Exception):
36 def _git_wait(cmd, p):
39 raise GitError('%s returned %d' % (cmd, rv))
41 def _git_capture(argv):
42 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
44 _git_wait(repr(argv), p)
47 def git_config_get(option, repo_dir=None):
48 cmd = ('git', 'config', '--get', option)
49 p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
50 preexec_fn=_gitenv(repo_dir=repo_dir))
56 raise GitError('%s returned %d' % (cmd, rc))
def parse_tz_offset(s):
    """Return a git timezone string like '+0130' or '-0500' as a UTC
    offset in seconds.  s[0] is the sign, s[1:3] hours, s[3:5] minutes."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    # The visible code computed the magnitude but never applied the
    # sign nor returned; a negative offset must be negated.
    if s[0] == '-':
        return - tz_off
    return tz_off
68 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
69 # Make sure that's authoritative.
70 _start_end_char = r'[^ .,:;<>"\'\0\n]'
71 _content_char = r'[^\0\n<>]'
72 _safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
74 _start_end_char, _content_char, _start_end_char)
75 _tz_rx = r'[-+]\d\d[0-5]\d'
76 _parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
77 _commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
78 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
79 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)
81 (?P<message>(?:.|\n)*)''' % (_parent_rx,
82 _safe_str_rx, _safe_str_rx, _tz_rx,
83 _safe_str_rx, _safe_str_rx, _tz_rx))
84 _parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
87 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
88 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
89 'author_name', 'author_mail',
90 'author_sec', 'author_offset',
91 'committer_name', 'committer_mail',
92 'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse the text of a git commit object into a CommitInfo namedtuple.

    Raises Exception if content doesn't match _commit_rx (i.e. isn't a
    well-formed commit).  The *_sec fields are (UTC) epoch seconds; the
    *_offset fields are UTC offsets in seconds.
    """
    commit_match = re.match(_commit_rx, content)
    # Guard before touching the match object -- re.match returns None
    # on failure, so dereferencing unconditionally would raise
    # AttributeError instead of this clearer error.
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
def get_commit_items(id, cp):
    """Fetch the commit named by id via cat-pipe cp and return its
    parsed CommitInfo."""
    it = cp.get(id)
    _, typ, _ = next(it)
    assert typ == 'commit'
    return parse_commit(''.join(it))
def _local_git_date_str(epoch_sec):
    """Return '<epoch_sec> <offset>' using the local timezone's UTC offset."""
    offset = utc_offset_str(epoch_sec)
    return '%d %s' % (epoch_sec, offset)
125 def _git_date_str(epoch_sec, tz_offset_sec):
126 offs = tz_offset_sec // 60
127 return '%d %s%02d%02d' \
129 '+' if offs >= 0 else '-',
134 def repo(sub = '', repo_dir=None):
135 """Get the path to the git repository or one of its subdirectories."""
137 repo_dir = repo_dir or repodir
139 raise GitError('You should call check_repo_or_die()')
141 # If there's a .git subdirectory, then the actual repo is in there.
142 gd = os.path.join(repo_dir, '.git')
143 if os.path.exists(gd):
146 return os.path.join(repo_dir, sub)
150 return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
155 full = os.path.abspath(path)
156 fullrepo = os.path.abspath(repo(''))
157 if not fullrepo.endswith('/'):
159 if full.startswith(fullrepo):
160 path = full[len(fullrepo):]
161 if path.startswith('index-cache/'):
162 path = path[len('index-cache/'):]
163 return shorten_hash(path)
167 paths = [repo('objects/pack')]
168 paths += glob.glob(repo('index-cache/*/.'))
172 def auto_midx(objdir):
173 args = [path.exe(), 'midx', '--auto', '--dir', objdir]
175 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
177 # make sure 'args' gets printed to help with debugging
178 add_error('%r: exception: %s' % (args, e))
181 add_error('%r: returned %d' % (args, rv))
183 args = [path.exe(), 'bloom', '--dir', objdir]
185 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
187 # make sure 'args' gets printed to help with debugging
188 add_error('%r: exception: %s' % (args, e))
191 add_error('%r: returned %d' % (args, rv))
194 def mangle_name(name, mode, gitmode):
195 """Mangle a file name to present an abstract name for segmented files.
196 Mangled file names will have the ".bup" extension added to them. If a
197 file's name already ends with ".bup", a ".bupl" extension is added to
198 disambiguate normal files from segmented ones.
200 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
201 assert(stat.S_ISDIR(gitmode))
203 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
204 return name + '.bupl'
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of
    the following:

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith('.bupm'):
        # A .bupm entry is chunked metadata when stored as a tree,
        # plain metadata when stored as a regular blob.
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    else:
        return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion: sha1 of
    '<type> <len>\\0' followed by the content itself."""
    header = '%s %d\0' % (type, len(content))
    # The visible code built the header but never fed it to the hash
    # nor returned anything; hash header + content and return the digest.
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
def shalist_item_sort_key(ent):
    """Sort key for a (mode, name, id) tree entry: directories sort with
    a trailing '/', matching git's tree-entry ordering."""
    (mode, name, id) = ent
    assert(mode+0 == mode)  # mode must be an integer
    # Both branches were elided in the visible code; git sorts tree
    # entries as if directory names ended in '/'.
    if stat.S_ISDIR(mode):
        return name + '/'
    else:
        return name
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)  # must be an integer mode
        assert(name)
        assert(len(bin) == 20)  # raw sha1 digest
        s = '%o %s\0%s' % (mode,name,bin)
        assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return ''.join(l)
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf.

    Each entry in a git tree is '<octal mode> <name>\\0' followed by a
    20-byte raw sha1.
    """
    ofs = 0
    while ofs < len(buf):
        z = buf.find('\0', ofs)
        assert(z > ofs)  # every entry must have a mode+name header
        spl = buf[ofs:z].split(' ', 1)
        assert(len(spl) == 2)
        mode,name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20  # advance past the NUL and the 20-byte sha
        yield (int(mode, 8), name, sha)
278 def _encode_packobj(type, content, compression_level=1):
279 if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
280 raise ValueError('invalid compression level %s' % compression_level)
283 szbits = (sz & 0x0f) | (_typemap[type]<<4)
286 if sz: szbits |= 0x80
292 z = zlib.compressobj(compression_level)
294 yield z.compress(content)
def _encode_looseobj(type, content, compression_level=1):
    """Yield the zlib-compressed chunks of a git loose object:
    '<type> <len>\\0' header followed by the content."""
    z = zlib.compressobj(compression_level)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    # Without flush() the compressor's buffered tail is lost and the
    # stream is truncated/undecompressable.
    yield z.flush()
305 def _decode_looseobj(buf):
307 s = zlib.decompress(buf)
314 assert(type in _typemap)
315 assert(sz == len(content))
316 return (type, content)
319 def _decode_packobj(buf):
322 type = _typermap[(c & 0x70) >> 4]
329 sz |= (c & 0x7f) << shift
333 return (type, zlib.decompress(buf[i+1:]))
340 def find_offset(self, hash):
341 """Get the offset of an object inside the index file."""
342 idx = self._idx_from_hash(hash)
344 return self._ofs_from_idx(idx)
347 def exists(self, hash, want_source=False):
348 """Return nonempty if the object exists in this index."""
349 if hash and (self._idx_from_hash(hash) != None):
350 return want_source and os.path.basename(self.name) or True
354 return int(self.fanout[255])
356 def _idx_from_hash(self, hash):
357 global _total_searches, _total_steps
359 assert(len(hash) == 20)
361 start = self.fanout[b1-1] # range -1..254
362 end = self.fanout[b1] # range 0..255
364 _total_steps += 1 # lookup table is a step
367 mid = start + (end-start)/2
368 v = self._idx_to_hash(mid)
378 class PackIdxV1(PackIdx):
379 """Object representation of a Git pack index (version 1) file."""
380 def __init__(self, filename, f):
382 self.idxnames = [self.name]
383 self.map = mmap_read(f)
384 self.fanout = list(struct.unpack('!256I',
385 str(buffer(self.map, 0, 256*4))))
386 self.fanout.append(0) # entry "-1"
387 nsha = self.fanout[255]
389 self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
391 def _ofs_from_idx(self, idx):
392 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
394 def _idx_to_hash(self, idx):
395 return str(self.shatable[idx*24+4 : idx*24+24])
398 for i in xrange(self.fanout[255]):
399 yield buffer(self.map, 256*4 + 24*i + 4, 20)
402 class PackIdxV2(PackIdx):
403 """Object representation of a Git pack index (version 2) file."""
404 def __init__(self, filename, f):
406 self.idxnames = [self.name]
407 self.map = mmap_read(f)
408 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
409 self.fanout = list(struct.unpack('!256I',
410 str(buffer(self.map, 8, 256*4))))
411 self.fanout.append(0) # entry "-1"
412 nsha = self.fanout[255]
413 self.sha_ofs = 8 + 256*4
414 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
415 self.ofstable = buffer(self.map,
416 self.sha_ofs + nsha*20 + nsha*4,
418 self.ofs64table = buffer(self.map,
419 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
421 def _ofs_from_idx(self, idx):
422 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
424 idx64 = ofs & 0x7fffffff
425 ofs = struct.unpack('!Q',
426 str(buffer(self.ofs64table, idx64*8, 8)))[0]
429 def _idx_to_hash(self, idx):
430 return str(self.shatable[idx*20:(idx+1)*20])
433 for i in xrange(self.fanout[255]):
434 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
439 def __init__(self, dir):
441 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
446 self.do_bloom = False
453 assert(_mpi_count == 0)
456 return iter(idxmerge(self.packs))
459 return sum(len(pack) for pack in self.packs)
461 def exists(self, hash, want_source=False):
462 """Return nonempty if the object exists in the index files."""
463 global _total_searches
465 if hash in self.also:
467 if self.do_bloom and self.bloom:
468 if self.bloom.exists(hash):
469 self.do_bloom = False
471 _total_searches -= 1 # was counted by bloom
473 for i in xrange(len(self.packs)):
475 _total_searches -= 1 # will be incremented by sub-pack
476 ix = p.exists(hash, want_source=want_source)
478 # reorder so most recently used packs are searched first
479 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
484 def refresh(self, skip_midx = False):
485 """Refresh the index list.
486 This method verifies if .midx files were superseded (e.g. all of its
487 contents are in another, bigger .midx file) and removes the superseded
490 If skip_midx is True, all work on .midx files will be skipped and .midx
491 files will be removed from the list.
493 The module-global variable 'ignore_midx' can force this function to
494 always act as if skip_midx was True.
496 self.bloom = None # Always reopen the bloom as it may have been relaced
497 self.do_bloom = False
498 skip_midx = skip_midx or ignore_midx
499 d = dict((p.name, p) for p in self.packs
500 if not skip_midx or not isinstance(p, midx.PackMidx))
501 if os.path.exists(self.dir):
504 for ix in self.packs:
505 if isinstance(ix, midx.PackMidx):
506 for name in ix.idxnames:
507 d[os.path.join(self.dir, name)] = ix
508 for full in glob.glob(os.path.join(self.dir,'*.midx')):
510 mx = midx.PackMidx(full)
511 (mxd, mxf) = os.path.split(mx.name)
513 for n in mx.idxnames:
514 if not os.path.exists(os.path.join(mxd, n)):
515 log(('warning: index %s missing\n' +
516 ' used by %s\n') % (n, mxf))
524 midxl.sort(key=lambda ix:
525 (-len(ix), -xstat.stat(ix.name).st_mtime))
528 for sub in ix.idxnames:
529 found = d.get(os.path.join(self.dir, sub))
530 if not found or isinstance(found, PackIdx):
531 # doesn't exist, or exists but not in a midx
536 for name in ix.idxnames:
537 d[os.path.join(self.dir, name)] = ix
538 elif not ix.force_keep:
539 debug1('midx: removing redundant: %s\n'
540 % os.path.basename(ix.name))
543 for full in glob.glob(os.path.join(self.dir,'*.idx')):
547 except GitError as e:
551 bfull = os.path.join(self.dir, 'bup.bloom')
552 if self.bloom is None and os.path.exists(bfull):
553 self.bloom = bloom.ShaBloom(bfull)
554 self.packs = list(set(d.values()))
555 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
556 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
560 debug1('PackIdxList: using %d index%s.\n'
561 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
564 """Insert an additional object in the list."""
568 def open_idx(filename):
569 if filename.endswith('.idx'):
570 f = open(filename, 'rb')
572 if header[0:4] == '\377tOc':
573 version = struct.unpack('!I', header[4:8])[0]
575 return PackIdxV2(filename, f)
577 raise GitError('%s: expected idx file version 2, got %d'
578 % (filename, version))
579 elif len(header) == 8 and header[0:4] < '\377tOc':
580 return PackIdxV1(filename, f)
582 raise GitError('%s: unrecognized idx file header' % filename)
583 elif filename.endswith('.midx'):
584 return midx.PackMidx(filename)
586 raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        # Honor the final_progress flag; the visible code printed the
        # final line unconditionally.
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default objcache factory: a PackIdxList over the current repo's
    pack directory."""
    return PackIdxList(repo('objects/pack'))
604 # bup-gc assumes that it can disable all PackWriter activities
605 # (bloom/midx/cache) via the constructor and close() arguments.
608 """Writes Git objects inside a pack file."""
609 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
610 run_midx=True, on_pack_finish=None,
611 max_pack_size=None, max_pack_objects=None):
612 self.repo_dir = repo()
619 self.objcache_maker = objcache_maker
621 self.compression_level = compression_level
622 self.run_midx=run_midx
623 self.on_pack_finish = on_pack_finish
624 if not max_pack_size:
625 max_pack_size = git_config_get('pack.packSizeLimit',
626 repo_dir=self.repo_dir)
627 if max_pack_size is not None:
628 max_pack_size = parse_num(max_pack_size)
629 if not max_pack_size:
630 # larger packs slow down pruning
631 max_pack_size = 1000 * 1000 * 1000
632 self.max_pack_size = max_pack_size
633 # cache memory usage is about 83 bytes per object
634 self.max_pack_objects = max_pack_objects if max_pack_objects \
635 else max(1, self.max_pack_size // 5000)
642 objdir = dir = os.path.join(self.repo_dir, 'objects')
643 fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
645 self.file = os.fdopen(fd, 'w+b')
650 self.parentfd = os.open(objdir, os.O_RDONLY)
656 assert(name.endswith('.pack'))
657 self.filename = name[:-5]
658 self.file.write('PACK\0\0\0\2\0\0\0\0')
659 self.idx = list(list() for i in xrange(256))
661 def _raw_write(self, datalist, sha):
664 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
665 # the file never has a *partial* blob. So let's make sure it's
666 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
667 # to our hashsplit algorithm.) f.write() does its own buffering,
668 # but that's okay because we'll flush it in _end().
669 oneblob = ''.join(datalist)
673 raise GitError, e, sys.exc_info()[2]
675 crc = zlib.crc32(oneblob) & 0xffffffff
676 self._update_idx(sha, crc, nw)
681 def _update_idx(self, sha, crc, size):
684 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
686 def _write(self, sha, type, content):
690 sha = calc_hash(type, content)
691 size, crc = self._raw_write(_encode_packobj(type, content,
692 self.compression_level),
694 if self.outbytes >= self.max_pack_size \
695 or self.count >= self.max_pack_objects:
699 def breakpoint(self):
700 """Clear byte and object counts and return the last processed id."""
701 id = self._end(self.run_midx)
702 self.outbytes = self.count = 0
705 def _require_objcache(self):
706 if self.objcache is None and self.objcache_maker:
707 self.objcache = self.objcache_maker()
708 if self.objcache is None:
710 "PackWriter not opened or can't check exists w/o objcache")
712 def exists(self, id, want_source=False):
713 """Return non-empty if an object is found in the object cache."""
714 self._require_objcache()
715 return self.objcache.exists(id, want_source=want_source)
717 def just_write(self, sha, type, content):
718 """Write an object to the pack file, bypassing the objcache. Fails if
720 self._write(sha, type, content)
722 def maybe_write(self, type, content):
723 """Write an object to the pack file if not present and return its id."""
724 sha = calc_hash(type, content)
725 if not self.exists(sha):
726 self.just_write(sha, type, content)
727 self._require_objcache()
728 self.objcache.add(sha)
731 def new_blob(self, blob):
732 """Create a blob object in the pack with the supplied content."""
733 return self.maybe_write('blob', blob)
735 def new_tree(self, shalist):
736 """Create a tree object in the pack."""
737 content = tree_encode(shalist)
738 return self.maybe_write('tree', content)
740 def new_commit(self, tree, parent,
741 author, adate_sec, adate_tz,
742 committer, cdate_sec, cdate_tz,
744 """Create a commit object in the pack. The date_sec values must be
745 epoch-seconds, and if a tz is None, the local timezone is assumed."""
747 adate_str = _git_date_str(adate_sec, adate_tz)
749 adate_str = _local_git_date_str(adate_sec)
751 cdate_str = _git_date_str(cdate_sec, cdate_tz)
753 cdate_str = _local_git_date_str(cdate_sec)
755 if tree: l.append('tree %s' % tree.encode('hex'))
756 if parent: l.append('parent %s' % parent.encode('hex'))
757 if author: l.append('author %s %s' % (author, adate_str))
758 if committer: l.append('committer %s %s' % (committer, cdate_str))
761 return self.maybe_write('commit', '\n'.join(l))
764 """Remove the pack file from disk."""
773 os.unlink(self.filename + '.pack')
780 def _end(self, run_midx=True):
782 if not f: return None
789 # update object count
791 cp = struct.pack('!i', self.count)
795 # calculate the pack sha1sum
798 for b in chunkyreader(f):
800 packbin = sum.digest()
802 fdatasync(f.fileno())
806 obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
807 nameprefix = os.path.join(self.repo_dir,
808 'objects/pack/pack-' + obj_list_sha)
809 if os.path.exists(self.filename + '.map'):
810 os.unlink(self.filename + '.map')
811 os.rename(self.filename + '.pack', nameprefix + '.pack')
812 os.rename(self.filename + '.idx', nameprefix + '.idx')
814 os.fsync(self.parentfd)
816 os.close(self.parentfd)
819 auto_midx(os.path.join(self.repo_dir, 'objects/pack'))
821 if self.on_pack_finish:
822 self.on_pack_finish(nameprefix)
826 def close(self, run_midx=True):
827 """Close the pack file and move it to its definitive path."""
828 return self._end(run_midx=run_midx)
830 def _write_pack_idx_v2(self, filename, idx, packbin):
833 for entry in section:
834 if entry[2] >= 2**31:
837 # Length: header + fan-out + shas-and-crcs + overflow-offsets
838 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
840 idx_f = open(filename, 'w+b')
842 idx_f.truncate(index_len)
843 fdatasync(idx_f.fileno())
844 idx_map = mmap_readwrite(idx_f, close=False)
846 count = _helpers.write_idx(filename, idx_map, idx, self.count)
847 assert(count == self.count)
854 idx_f = open(filename, 'a+b')
859 b = idx_f.read(8 + 4*256)
862 obj_list_sum = Sha1()
863 for b in chunkyreader(idx_f, 20*self.count):
865 obj_list_sum.update(b)
866 namebase = obj_list_sum.hexdigest()
868 for b in chunkyreader(idx_f):
870 idx_f.write(idx_sum.digest())
871 fdatasync(idx_f.fileno())
877 def _gitenv(repo_dir = None):
881 os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
885 def list_refs(patterns=None, repo_dir=None,
886 limit_to_heads=False, limit_to_tags=False):
887 """Yield (refname, hash) tuples for all repository refs unless
888 patterns are specified. In that case, only include tuples for
889 refs matching those patterns (cf. git-show-ref(1)). The limits
890 restrict the result items to refs/heads or refs/tags. If both
891 limits are specified, items from both sources will be included.
894 argv = ['git', 'show-ref']
896 argv.append('--heads')
898 argv.append('--tags')
901 argv.extend(patterns)
902 p = subprocess.Popen(argv,
903 preexec_fn = _gitenv(repo_dir),
904 stdout = subprocess.PIPE)
905 out = p.stdout.read().strip()
906 rv = p.wait() # not fatal
910 for d in out.split('\n'):
911 (sha, name) = d.split(' ', 1)
912 yield (name, sha.decode('hex'))
915 def read_ref(refname, repo_dir = None):
916 """Get the commit id of the most recent commit made on a given ref."""
917 refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
918 l = tuple(islice(refs, 2))
926 def rev_list(ref, count=None, parse=None, format=None, repo_dir=None):
927 """Yield information about commits as per "git rev-list". If a format
928 is not provided, yield one hex hash at a time. If a format is
929 provided, pass it to rev-list and call parse(git_stdout) for each
930 commit with the stream positioned just after the rev-list "commit
931 HASH" header line. When a format is provided yield (oidx,
932 parse(git_stdout)) for each commit.
935 assert bool(parse) == bool(format)
936 assert not ref.startswith('-')
937 argv = ['git', 'rev-list']
938 if isinstance(count, Integral):
939 argv.extend(['-n', str(count)])
943 argv.append('--pretty=format:' + format)
947 p = subprocess.Popen(argv,
948 preexec_fn = _gitenv(repo_dir),
949 stdout = subprocess.PIPE)
951 for line in p.stdout:
954 line = p.stdout.readline()
957 if not s.startswith('commit '):
958 raise Exception('unexpected line ' + s)
959 yield s[7:], parse(p.stdout)
960 line = p.stdout.readline()
962 rv = p.wait() # not fatal
964 raise GitError, 'git rev-list returned error %d' % rv
967 def get_commit_dates(refs, repo_dir=None):
968 """Get the dates for the specified commit refs. For now, every unique
969 string in refs must resolve to a different commit or this
970 function will fail."""
973 commit = get_commit_items(ref, cp(repo_dir))
974 result.append(commit.author_sec)
978 def rev_parse(committish, repo_dir=None):
979 """Resolve the full hash for 'committish', if it exists.
981 Should be roughly equivalent to 'git rev-parse'.
983 Returns the hex value of the hash if it is found, None if 'committish' does
984 not correspond to anything.
986 head = read_ref(committish, repo_dir=repo_dir)
988 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
991 pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
993 if len(committish) == 40:
995 hash = committish.decode('hex')
1005 def update_ref(refname, newval, oldval, repo_dir=None):
1006 """Update a repository reference."""
1009 assert(refname.startswith('refs/heads/') \
1010 or refname.startswith('refs/tags/'))
1011 p = subprocess.Popen(['git', 'update-ref', refname,
1012 newval.encode('hex'), oldval.encode('hex')],
1013 preexec_fn = _gitenv(repo_dir))
1014 _git_wait('git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1)).
    If oldvalue is given, git verifies the ref currently has that value."""
    assert(refname.startswith('refs/'))
    argv = ['git', 'update-ref', '-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    p = subprocess.Popen(argv, preexec_fn = _gitenv())
    _git_wait('git update-ref', p)
1026 def guess_repo(path=None):
1027 """Set the path value in the global variable "repodir".
1028 This makes bup look for an existing bup repository, but not fail if a
1029 repository doesn't exist. Usually, if you are interacting with a bup
1030 repository, you would not be calling this function but using
1031 check_repo_or_die().
1037 repodir = os.environ.get('BUP_DIR')
1039 repodir = os.path.expanduser('~/.bup')
1042 def init_repo(path=None):
1043 """Create the Git bare repository for bup in a given path."""
1045 d = repo() # appends a / to the path
1046 parent = os.path.dirname(os.path.dirname(d))
1047 if parent and not os.path.exists(parent):
1048 raise GitError('parent directory "%s" does not exist\n' % parent)
1049 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1050 raise GitError('"%s" exists but is not a directory\n' % d)
1051 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1052 preexec_fn = _gitenv())
1053 _git_wait('git init', p)
1054 # Force the index version configuration in order to ensure bup works
1055 # regardless of the version of the installed Git binary.
1056 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1057 stdout=sys.stderr, preexec_fn = _gitenv())
1058 _git_wait('git config', p)
1060 p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
1061 stdout=sys.stderr, preexec_fn = _gitenv())
1062 _git_wait('git config', p)
1065 def check_repo_or_die(path=None):
1066 """Check to see if a bup repository probably exists, and abort if not."""
1069 pst = stat_if_exists(top + '/objects/pack')
1070 if pst and stat.S_ISDIR(pst.st_mode):
1073 top_st = stat_if_exists(top)
1075 log('error: repository %r does not exist (see "bup help init")\n'
1078 log('error: %r is not a repository\n' % top)
1084 """Get Git's version and ensure a usable version is installed.
1086 The returned version is formatted as an ordered tuple with each position
1087 representing a digit in the version tag. For example, the following tuple
1088 would represent version 1.6.6.9:
1090 ('1', '6', '6', '9')
1094 p = subprocess.Popen(['git', '--version'],
1095 stdout=subprocess.PIPE)
1096 gvs = p.stdout.read()
1097 _git_wait('git --version', p)
1098 m = re.match(r'git version (\S+.\S+)', gvs)
1100 raise GitError('git --version weird output: %r' % gvs)
1101 _ver = tuple(m.group(1).split('.'))
1102 needed = ('1','5', '3', '1')
1104 raise GitError('git version %s or higher is required; you have %s'
1105 % ('.'.join(needed), '.'.join(_ver)))
1109 class _AbortableIter:
1110 def __init__(self, it, onabort = None):
1112 self.onabort = onabort
1120 return next(self.it)
1121 except StopIteration as e:
1129 """Abort iteration and call the abortion callback, if needed."""
1141 """Link to 'git cat-file' that is used to retrieve blob data."""
1142 def __init__(self, repo_dir = None):
1144 self.repo_dir = repo_dir
1145 wanted = ('1','5','6')
1147 log('error: git version must be at least 1.5.6\n')
1149 self.p = self.inprogress = None
1153 self.p.stdout.close()
1154 self.p.stdin.close()
1156 self.inprogress = None
1160 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1161 stdin=subprocess.PIPE,
1162 stdout=subprocess.PIPE,
1165 preexec_fn = _gitenv(self.repo_dir))
1168 """Yield (oidx, type, size), followed by the data referred to by ref.
1169 If ref does not exist, only yield (None, None, None).
1172 if not self.p or self.p.poll() != None:
1175 poll_result = self.p.poll()
1176 assert(poll_result == None)
1178 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1179 assert(not self.inprogress)
1180 assert(ref.find('\n') < 0)
1181 assert(ref.find('\r') < 0)
1182 assert(not ref.startswith('-'))
1183 self.inprogress = ref
1184 self.p.stdin.write('%s\n' % ref)
1185 self.p.stdin.flush()
1186 hdr = self.p.stdout.readline()
1187 if hdr.endswith(' missing\n'):
1188 self.inprogress = None
1189 yield None, None, None
1191 info = hdr.split(' ')
1192 if len(info) != 3 or len(info[0]) != 40:
1193 raise GitError('expected object (id, type, size), got %r' % spl)
1194 oidx, typ, size = info
1196 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1197 onabort=self._abort)
1199 yield oidx, typ, size
1202 readline_result = self.p.stdout.readline()
1203 assert(readline_result == '\n')
1204 self.inprogress = None
1205 except Exception as e:
1209 def _join(self, it):
1210 _, typ, _ = next(it)
1215 treefile = ''.join(it)
1216 for (mode, name, sha) in tree_decode(treefile):
1217 for blob in self.join(sha.encode('hex')):
1219 elif typ == 'commit':
1220 treeline = ''.join(it).split('\n')[0]
1221 assert(treeline.startswith('tree '))
1222 for blob in self.join(treeline[5:]):
1225 raise GitError('invalid object type %r: expected blob/tree/commit'
1229 """Generate a list of the content of all blobs that can be reached
1230 from an object. The hash given in 'id' must point to a blob, a tree
1231 or a commit. The content of all blobs that can be seen from trees or
1232 commits will be added to the list.
1235 for d in self._join(self.get(id)):
1237 except StopIteration:
1243 def cp(repo_dir=None):
1244 """Create a CatPipe object or reuse the already existing one."""
1247 repo_dir = repodir or repo()
1248 repo_dir = os.path.abspath(repo_dir)
1249 cp = _cp.get(repo_dir)
1251 cp = CatPipe(repo_dir)
1256 def tags(repo_dir = None):
1257 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1259 for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
1260 assert(n.startswith('refs/tags/'))
1264 tags[c].append(name) # more than one tag can point at 'c'
1268 class MissingObject(KeyError):
1269 def __init__(self, oid):
1271 KeyError.__init__(self, 'object %r is missing' % oid.encode('hex'))
1274 WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
1275 'path', 'chunk_path', 'data'])
1276 # The path is the mangled path, and if an item represents a fragment
1277 # of a chunked file, the chunk_path will be the chunked subtree path
1278 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1279 # chunked file will have a chunk_path of ['']. So some chunk subtree
1280 # of the file '/foo/bar/baz' might look like this:
1282 # item.path = ['foo', 'bar', 'baz.bup']
1283 # item.chunk_path = ['', '2d3115e', '016b097']
1284 # item.type = 'tree'
1288 def walk_object(cat_pipe, oidx,
1291 """Yield everything reachable from oidx via cat_pipe as a WalkItem,
1292 stopping whenever stop_at(oidx) returns true. Throw MissingObject
1293 if a hash encountered is missing from the repository, and don't
1294 read or return blob content in the data field unless include_data
1297 # Maintain the pending stack on the heap to avoid stack overflow
1298 pending = [(oidx, [], [], None)]
1300 oidx, parent_path, chunk_path, mode = pending.pop()
1301 oid = oidx.decode('hex')
1302 if stop_at and stop_at(oidx):
1305 if (not include_data) and mode and stat.S_ISREG(mode):
1306 # If the object is a "regular file", then it's a leaf in
1307 # the graph, so we can skip reading the data if the caller
1308 # hasn't requested it.
1309 yield WalkItem(oid=oid, type='blob',
1310 chunk_path=chunk_path, path=parent_path,
1315 item_it = cat_pipe.get(oidx)
1316 get_oidx, typ, _ = next(item_it)
1318 raise MissingObject(oidx.decode('hex'))
1319 if typ not in ('blob', 'commit', 'tree'):
1320 raise Exception('unexpected repository object type %r' % typ)
1322 # FIXME: set the mode based on the type when the mode is None
1323 if typ == 'blob' and not include_data:
1324 # Dump data until we can ask cat_pipe not to fetch it
1325 for ignored in item_it:
1329 data = ''.join(item_it)
1331 yield WalkItem(oid=oid, type=typ,
1332 chunk_path=chunk_path, path=parent_path,
1334 data=(data if include_data else None))
1337 commit_items = parse_commit(data)
1338 for pid in commit_items.parents:
1339 pending.append((pid, parent_path, chunk_path, mode))
1340 pending.append((commit_items.tree, parent_path, chunk_path,
1341 hashsplit.GIT_MODE_TREE))
1343 for mode, name, ent_id in tree_decode(data):
1344 demangled, bup_type = demangle_name(name, mode)
1346 sub_path = parent_path
1347 sub_chunk_path = chunk_path + [name]
1349 sub_path = parent_path + [name]
1350 if bup_type == BUP_CHUNKED:
1351 sub_chunk_path = ['']
1353 sub_chunk_path = chunk_path
1354 pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,