1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
7 from collections import namedtuple
8 from itertools import islice
9 from numbers import Integral
11 from bup import _helpers, hashsplit, path, midx, bloom, xstat
12 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
14 hostname, localtime, log, merge_iter,
15 mmap_read, mmap_readwrite,
17 progress, qprogress, stat_if_exists,
18 unlink, username, userfullname,
23 repodir = None # The default repository, once initialized
# Map git object type names to/from the numeric type codes used in the
# pack-object encoding (see _encode_packobj/_decode_packobj below).
25 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
26 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
# Module-wide exception type raised for git-related failures.
32 class GitError(Exception):
# Wait for git subprocess `p` to exit; raise GitError (naming `cmd`) on a
# nonzero exit status.
36 def _git_wait(cmd, p):
39 raise GitError('%s returned %d' % (cmd, rv))
# Run argv as a git subprocess (GIT_DIR pointed at the default repo via
# _gitenv), return its captured stdout, and raise GitError on failure.
41 def _git_capture(argv):
42 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
44 _git_wait(repr(argv), p)
# Return the value of a single git-config `option` for the repository at
# repo_dir (or the default repo); raises GitError on an unexpected
# git exit status.
47 def git_config_get(option, repo_dir=None):
48 cmd = ('git', 'config', '--get', option)
49 p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
50 preexec_fn=_gitenv(repo_dir=repo_dir))
56 raise GitError('%s returned %d' % (cmd, rc))
# Parse a git timezone string of the form [-+]HHMM (hours/minutes).
60 def parse_tz_offset(s):
61 """UTC offset in seconds."""
62 tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
68 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
69 # Make sure that's authoritative.
# Building blocks for _commit_rx: the characters legal at the ends and in
# the middle of names/emails, and a [-+]HHMM timezone.
70 _start_end_char = r'[^ .,:;<>"\'\0\n]'
71 _content_char = r'[^\0\n<>]'
72 _safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
74 _start_end_char, _content_char, _start_end_char)
75 _tz_rx = r'[-+]\d\d[0-5]\d'
76 _parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Matches an entire commit object: tree, parents, author/committer lines,
# then the free-form message.
77 _commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
78 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
79 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)
81 (?P<message>(?:.|\n)*)''' % (_parent_rx,
82 _safe_str_rx, _safe_str_rx, _tz_rx,
83 _safe_str_rx, _safe_str_rx, _tz_rx))
# Extracts each parent hash from the concatenated "parent ..." lines.
84 _parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
87 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
# Parsed form of a commit object, produced by parse_commit() below.
88 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
89 'author_name', 'author_mail',
90 'author_sec', 'author_offset',
91 'committer_name', 'committer_mail',
92 'committer_sec', 'committer_offset',
# Parse the raw content of a commit object into a CommitInfo namedtuple;
# raises a plain Exception if the content doesn't match _commit_rx.
95 def parse_commit(content):
96 commit_match = re.match(_commit_rx, content)
98 raise Exception('cannot parse commit %r' % content)
99 matches = commit_match.groupdict()
100 return CommitInfo(tree=matches['tree'],
101 parents=re.findall(_parent_hash_rx, matches['parents']),
102 author_name=matches['author_name'],
103 author_mail=matches['author_mail'],
104 author_sec=int(matches['asec']),
105 author_offset=parse_tz_offset(matches['atz']),
106 committer_name=matches['committer_name'],
107 committer_mail=matches['committer_mail'],
108 committer_sec=int(matches['csec']),
109 committer_offset=parse_tz_offset(matches['ctz']),
110 message=matches['message'])
def get_commit_items(id, cp):
    """Fetch the commit object named by id through cat-pipe cp.

    Asserts that the fetched object really is a commit, then returns
    the parsed CommitInfo namedtuple.
    """
    items = cp.get(id)
    _, typ, _ = next(items)
    assert(typ == 'commit')
    return parse_commit(''.join(items))
def _local_git_date_str(epoch_sec):
    """Format epoch_sec as a git date string using the local UTC offset."""
    offset = utc_offset_str(epoch_sec)
    return '%d %s' % (epoch_sec, offset)
# Format (epoch seconds, tz offset in seconds) as a git "<sec> [-+]HHMM"
# date string.
125 def _git_date_str(epoch_sec, tz_offset_sec):
126 offs = tz_offset_sec // 60
127 return '%d %s%02d%02d' \
129 '+' if offs >= 0 else '-',
134 def repo(sub = '', repo_dir=None):
135 """Get the path to the git repository or one of its subdirectories."""
# Fall back to the module-global repodir; callers must have set it up
# (normally via check_repo_or_die) before calling.
137 repo_dir = repo_dir or repodir
139 raise GitError('You should call check_repo_or_die()')
141 # If there's a .git subdirectory, then the actual repo is in there.
142 gd = os.path.join(repo_dir, '.git')
143 if os.path.exists(gd):
146 return os.path.join(repo_dir, sub)
150 return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
155 full = os.path.abspath(path)
156 fullrepo = os.path.abspath(repo(''))
157 if not fullrepo.endswith('/'):
159 if full.startswith(fullrepo):
160 path = full[len(fullrepo):]
161 if path.startswith('index-cache/'):
162 path = path[len('index-cache/'):]
163 return shorten_hash(path)
167 paths = [repo('objects/pack')]
168 paths += glob.glob(repo('index-cache/*/.'))
# Rebuild the .midx and bloom indexes for objdir by shelling out to
# "bup midx --auto" and "bup bloom".  Failures are recorded via
# add_error rather than raised, so the caller can keep going.
172 def auto_midx(objdir):
173 args = [path.exe(), 'midx', '--auto', '--dir', objdir]
175 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
177 # make sure 'args' gets printed to help with debugging
178 add_error('%r: exception: %s' % (args, e))
181 add_error('%r: returned %d' % (args, rv))
183 args = [path.exe(), 'bloom', '--dir', objdir]
185 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
187 # make sure 'args' gets printed to help with debugging
188 add_error('%r: exception: %s' % (args, e))
191 add_error('%r: returned %d' % (args, rv))
194 def mangle_name(name, mode, gitmode):
195 """Mangle a file name to present an abstract name for segmented files.
196 Mangled file names will have the ".bup" extension added to them. If a
197 file's name already ends with ".bup", a ".bupl" extension is added to
198 disambiguate normal files from segmented ones.
# A regular file stored as a git tree (gitmode is a dir) is a chunked file.
200 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
201 assert(stat.S_ISDIR(gitmode))
# Names that would collide with mangled names get '.bupl' so demangling
# round-trips correctly.
203 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
204 return name + '.bupl'
# Content classification codes returned by demangle_name().
209 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
210 def demangle_name(name, mode):
211 """Remove name mangling from a file name, if necessary.
213 The return value is a tuple (demangled_filename,mode), where mode is one of
216 * BUP_NORMAL : files that should be read as-is from the repository
217 * BUP_CHUNKED : files that were chunked and need to be reassembled
219 For more information on the name mangling algorithm, see mangle_name()
221 if name.endswith('.bupl'):
222 return (name[:-5], BUP_NORMAL)
223 elif name.endswith('.bup'):
224 return (name[:-4], BUP_CHUNKED)
225 elif name.endswith('.bupm'):
227 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
229 return (name, BUP_NORMAL)
232 def calc_hash(type, content):
233 """Calculate some content's hash in the Git fashion."""
# Git object ids are SHA-1 over "<type> <len>\0" + content.
234 header = '%s %d\0' % (type, len(content))
# Sort key reproducing git's tree-entry ordering, where directories sort
# as if their name ended with '/'.
240 def shalist_item_sort_key(ent):
241 (mode, name, id) = ent
242 assert(mode+0 == mode)
243 if stat.S_ISDIR(mode):
249 def tree_encode(shalist):
250 """Generate a git tree object from (mode,name,hash) tuples."""
# Entries must be in git's canonical order or the tree hash won't match.
251 shalist = sorted(shalist, key = shalist_item_sort_key)
253 for (mode,name,bin) in shalist:
255 assert(mode+0 == mode)
257 assert(len(bin) == 20)
# Entry format: "<octal mode> <name>\0" followed by the raw 20-byte sha.
258 s = '%o %s\0%s' % (mode,name,bin)
259 assert(s[0] != '0') # 0-padded octal is not acceptable in a git tree
264 def tree_decode(buf):
265 """Generate a list of (mode,name,hash) from the git tree object in buf."""
267 while ofs < len(buf):
# Each entry is "<octal mode> <name>\0" followed by a raw 20-byte sha.
268 z = buf.find('\0', ofs)
270 spl = buf[ofs:z].split(' ', 1)
271 assert(len(spl) == 2)
273 sha = buf[z+1:z+1+20]
275 yield (int(mode, 8), name, sha)
# Yield the pack-object encoding of content: a variable-length size
# header carrying the type code from _typemap, followed by a zlib stream.
278 def _encode_packobj(type, content, compression_level=1):
279 if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
280 raise ValueError('invalid compression level %s' % compression_level)
# First header byte: low 4 bits of size plus the type in bits 4-6;
# the 0x80 continuation bit marks more size bytes to follow.
283 szbits = (sz & 0x0f) | (_typemap[type]<<4)
286 if sz: szbits |= 0x80
292 z = zlib.compressobj(compression_level)
294 yield z.compress(content)
# Yield the loose-object encoding: zlib("<type> <len>\0" + content).
298 def _encode_looseobj(type, content, compression_level=1):
299 z = zlib.compressobj(compression_level)
300 yield z.compress('%s %d\0' % (type, len(content)))
301 yield z.compress(content)
# Inverse of _encode_looseobj: inflate buf, split off the
# "<type> <len>\0" header, and return (type, content).
305 def _decode_looseobj(buf):
307 s = zlib.decompress(buf)
314 assert(type in _typemap)
315 assert(sz == len(content))
316 return (type, content)
# Inverse of _encode_packobj: decode the varint size/type header, then
# inflate the remainder.  Returns (type, content).
319 def _decode_packobj(buf):
# Type code lives in bits 4-6 of the first byte.
322 type = _typermap[(c & 0x70) >> 4]
329 sz |= (c & 0x7f) << shift
333 return (type, zlib.decompress(buf[i+1:]))
340 def find_offset(self, hash):
341 """Get the offset of an object inside the index file."""
342 idx = self._idx_from_hash(hash)
# NOTE(review): presumably falls through to return None when the hash
# is not present — confirm against _idx_from_hash.
344 return self._ofs_from_idx(idx)
347 def exists(self, hash, want_source=False):
348 """Return nonempty if the object exists in this index."""
# With want_source, returns this index's basename so the caller can
# tell which pack the object lives in.
349 if hash and (self._idx_from_hash(hash) != None):
350 return want_source and os.path.basename(self.name) or True
354 return int(self.fanout[255])
# Find the table index of a 20-byte hash via the fanout table plus a
# binary search; updates the module-level search/step counters.
356 def _idx_from_hash(self, hash):
357 global _total_searches, _total_steps
359 assert(len(hash) == 20)
# The fanout table gives the [start, end) row range for the hash's
# first byte.
361 start = self.fanout[b1-1] # range -1..254
362 end = self.fanout[b1] # range 0..255
364 _total_steps += 1 # lookup table is a step
# Binary search within the bucket.
367 mid = start + (end-start)/2
368 v = self._idx_to_hash(mid)
378 class PackIdxV1(PackIdx):
379 """Object representation of a Git pack index (version 1) file."""
380 def __init__(self, filename, f):
382 self.idxnames = [self.name]
383 self.map = mmap_read(f)
# V1 layout: a 256-entry fanout table, then 24-byte rows of
# (4-byte offset, 20-byte sha).
384 self.fanout = list(struct.unpack('!256I',
385 str(buffer(self.map, 0, 256*4))))
386 self.fanout.append(0) # entry "-1"
387 nsha = self.fanout[255]
389 self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
391 def _ofs_from_idx(self, idx):
# Offset is the first 4 bytes of the 24-byte row.
392 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
394 def _idx_to_hash(self, idx):
395 return str(self.shatable[idx*24+4 : idx*24+24])
398 for i in xrange(self.fanout[255]):
399 yield buffer(self.map, 256*4 + 24*i + 4, 20)
402 class PackIdxV2(PackIdx):
403 """Object representation of a Git pack index (version 2) file."""
404 def __init__(self, filename, f):
406 self.idxnames = [self.name]
407 self.map = mmap_read(f)
# Magic '\377tOc' plus version 2.
408 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
409 self.fanout = list(struct.unpack('!256I',
410 str(buffer(self.map, 8, 256*4))))
411 self.fanout.append(0) # entry "-1"
412 nsha = self.fanout[255]
# V2 layout after header+fanout: sha table, crc table, 4-byte offset
# table, then an 8-byte table for offsets >= 2GB.
413 self.sha_ofs = 8 + 256*4
414 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
415 self.ofstable = buffer(self.map,
416 self.sha_ofs + nsha*20 + nsha*4,
418 self.ofs64table = buffer(self.map,
419 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
421 def _ofs_from_idx(self, idx):
422 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
# High bit set means the low 31 bits index into the 64-bit table.
424 idx64 = ofs & 0x7fffffff
425 ofs = struct.unpack('!Q',
426 str(buffer(self.ofs64table, idx64*8, 8)))[0]
429 def _idx_to_hash(self, idx):
430 return str(self.shatable[idx*20:(idx+1)*20])
433 for i in xrange(self.fanout[255]):
434 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
# Build a merged view over all pack indexes (.idx/.midx) in `dir`.
439 def __init__(self, dir):
441 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
446 self.do_bloom = False
453 assert(_mpi_count == 0)
456 return iter(idxmerge(self.packs))
459 return sum(len(pack) for pack in self.packs)
461 def exists(self, hash, want_source=False):
462 """Return nonempty if the object exists in the index files."""
463 global _total_searches
465 if hash in self.also:
467 if self.do_bloom and self.bloom:
# The bloom filter can only prove absence; on a possible hit, fall
# through to the real indexes.
468 if self.bloom.exists(hash):
469 self.do_bloom = False
471 _total_searches -= 1 # was counted by bloom
473 for i in xrange(len(self.packs)):
475 _total_searches -= 1 # will be incremented by sub-pack
476 ix = p.exists(hash, want_source=want_source)
478 # reorder so most recently used packs are searched first
479 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
484 def refresh(self, skip_midx = False):
485 """Refresh the index list.
486 This method verifies if .midx files were superseded (e.g. all of its
487 contents are in another, bigger .midx file) and removes the superseded
490 If skip_midx is True, all work on .midx files will be skipped and .midx
491 files will be removed from the list.
493 The module-global variable 'ignore_midx' can force this function to
494 always act as if skip_midx was True.
496 self.bloom = None # Always reopen the bloom as it may have been replaced
497 self.do_bloom = False
498 skip_midx = skip_midx or ignore_midx
# Map each index filename to its open index object, dropping midxes
# entirely when they're being skipped.
499 d = dict((p.name, p) for p in self.packs
500 if not skip_midx or not isinstance(p, midx.PackMidx))
501 if os.path.exists(self.dir):
504 for ix in self.packs:
505 if isinstance(ix, midx.PackMidx):
506 for name in ix.idxnames:
507 d[os.path.join(self.dir, name)] = ix
508 for full in glob.glob(os.path.join(self.dir,'*.midx')):
510 mx = midx.PackMidx(full)
511 (mxd, mxf) = os.path.split(mx.name)
513 for n in mx.idxnames:
514 if not os.path.exists(os.path.join(mxd, n)):
515 log(('warning: index %s missing\n' +
516 ' used by %s\n') % (n, mxf))
# Prefer larger, then newer, midx files so redundant ones get dropped.
524 midxl.sort(key=lambda ix:
525 (-len(ix), -xstat.stat(ix.name).st_mtime))
528 for sub in ix.idxnames:
529 found = d.get(os.path.join(self.dir, sub))
530 if not found or isinstance(found, PackIdx):
531 # doesn't exist, or exists but not in a midx
536 for name in ix.idxnames:
537 d[os.path.join(self.dir, name)] = ix
538 elif not ix.force_keep:
539 debug1('midx: removing redundant: %s\n'
540 % os.path.basename(ix.name))
543 for full in glob.glob(os.path.join(self.dir,'*.idx')):
547 except GitError as e:
551 bfull = os.path.join(self.dir, 'bup.bloom')
552 if self.bloom is None and os.path.exists(bfull):
553 self.bloom = bloom.ShaBloom(bfull)
554 self.packs = list(set(d.values()))
555 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
# Only use the bloom filter if it covers everything the indexes do.
556 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
560 debug1('PackIdxList: using %d index%s.\n'
561 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
564 """Insert an additional object in the list."""
# Open a pack index file by name, dispatching on extension and version
# header.  Returns a PackIdxV1, PackIdxV2, or midx.PackMidx.
568 def open_idx(filename):
569 if filename.endswith('.idx'):
570 f = open(filename, 'rb')
# '\377tOc' magic marks a v2+ index; anything else is treated as v1.
572 if header[0:4] == '\377tOc':
573 version = struct.unpack('!I', header[4:8])[0]
575 return PackIdxV2(filename, f)
577 raise GitError('%s: expected idx file version 2, got %d'
578 % (filename, version))
579 elif len(header) == 8 and header[0:4] < '\377tOc':
580 return PackIdxV1(filename, f)
582 raise GitError('%s: unrecognized idx file header' % filename)
583 elif filename.endswith('.midx'):
584 return midx.PackMidx(filename)
586 raise GitError('idx filenames must end with .idx or .midx')
589 def idxmerge(idxlist, final_progress=True):
590 """Generate a list of all the objects reachable in a PackIdxList."""
# Progress callbacks for merge_iter; pfinal only prints when requested.
591 def pfunc(count, total):
592 qprogress('Reading indexes: %.2f%% (%d/%d)\r'
593 % (count*100.0/total, count, total))
594 def pfinal(count, total):
596 progress('Reading indexes: %.2f%% (%d/%d), done.\n'
597 % (100, total, total))
598 return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default objcache factory: a PackIdxList over the repo's pack dir."""
    pack_dir = repo('objects/pack')
    return PackIdxList(pack_dir)
604 # bup-gc assumes that it can disable all PackWriter activities
605 # (bloom/midx/cache) via the constructor and close() arguments.
608 """Writes Git objects inside a pack file."""
# Set up a PackWriter.  No pack file is created yet; see the lazily
# invoked open/_open machinery elsewhere in this class.
609 def __init__(self, objcache_maker=_make_objcache, compression_level=1,
610 run_midx=True, on_pack_finish=None,
611 max_pack_size=None, max_pack_objects=None):
612 self.repo_dir = repo()
619 self.objcache_maker = objcache_maker
621 self.compression_level = compression_level
622 self.run_midx=run_midx
623 self.on_pack_finish = on_pack_finish
# Pack size limit: explicit argument, then git config, then a default.
624 if not max_pack_size:
625 max_pack_size = git_config_get('pack.packSizeLimit',
626 repo_dir=self.repo_dir)
627 if max_pack_size is not None:
628 max_pack_size = parse_num(max_pack_size)
629 if not max_pack_size:
630 # larger packs slow down pruning
631 max_pack_size = 1000 * 1000 * 1000
632 self.max_pack_size = max_pack_size
633 # cache memory usage is about 83 bytes per object
634 self.max_pack_objects = max_pack_objects if max_pack_objects \
635 else max(1, self.max_pack_size // 5000)
642 objdir = dir = os.path.join(self.repo_dir, 'objects')
643 fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
645 self.file = os.fdopen(fd, 'w+b')
650 self.parentfd = os.open(objdir, os.O_RDONLY)
656 assert(name.endswith('.pack'))
657 self.filename = name[:-5]
658 self.file.write('PACK\0\0\0\2\0\0\0\0')
659 self.idx = list(list() for i in xrange(256))
# Write the already-encoded object bytes to the pack file in a single
# write, then record (sha, crc, offset) in the in-memory index.
661 def _raw_write(self, datalist, sha):
664 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
665 # the file never has a *partial* blob. So let's make sure it's
666 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
667 # to our hashsplit algorithm.) f.write() does its own buffering,
668 # but that's okay because we'll flush it in _end().
669 oneblob = ''.join(datalist)
# Re-raise any write failure as GitError, preserving the traceback
# (Python 2 three-argument raise).
673 raise GitError, e, sys.exc_info()[2]
675 crc = zlib.crc32(oneblob) & 0xffffffff
676 self._update_idx(sha, crc, nw)
# Record the object in the in-memory index, bucketed by the sha's first
# byte; the offset is the file position where the object began.
681 def _update_idx(self, sha, crc, size):
684 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
# Encode and append one object to the pack, computing its sha if not
# supplied; roll over to a new pack when size/object limits are hit.
686 def _write(self, sha, type, content):
690 sha = calc_hash(type, content)
691 size, crc = self._raw_write(_encode_packobj(type, content,
692 self.compression_level),
694 if self.outbytes >= self.max_pack_size \
695 or self.count >= self.max_pack_objects:
699 def breakpoint(self):
700 """Clear byte and object counts and return the last processed id."""
# Finish the current pack (possibly running midx) and start fresh.
701 id = self._end(self.run_midx)
702 self.outbytes = self.count = 0
# Lazily construct the object cache; raise GitError if no maker was
# configured (e.g. PackWriter not opened for existence checks).
705 def _require_objcache(self):
706 if self.objcache is None and self.objcache_maker:
707 self.objcache = self.objcache_maker()
708 if self.objcache is None:
710 "PackWriter not opened or can't check exists w/o objcache")
712 def exists(self, id, want_source=False):
713 """Return non-empty if an object is found in the object cache."""
714 self._require_objcache()
715 return self.objcache.exists(id, want_source=want_source)
717 def just_write(self, sha, type, content):
718 """Write an object to the pack file, bypassing the objcache. Fails if
720 self._write(sha, type, content)
722 def maybe_write(self, type, content):
723 """Write an object to the pack file if not present and return its id."""
724 sha = calc_hash(type, content)
725 if not self.exists(sha):
726 self.just_write(sha, type, content)
# Record the new object so later duplicates are detected.
727 self._require_objcache()
728 self.objcache.add(sha)
731 def new_blob(self, blob):
732 """Create a blob object in the pack with the supplied content."""
733 return self.maybe_write('blob', blob)
735 def new_tree(self, shalist):
736 """Create a tree object in the pack."""
737 content = tree_encode(shalist)
738 return self.maybe_write('tree', content)
740 def new_commit(self, tree, parent,
741 author, adate_sec, adate_tz,
742 committer, cdate_sec, cdate_tz,
744 """Create a commit object in the pack. The date_sec values must be
745 epoch-seconds, and if a tz is None, the local timezone is assumed."""
747 adate_str = _git_date_str(adate_sec, adate_tz)
749 adate_str = _local_git_date_str(adate_sec)
751 cdate_str = _git_date_str(cdate_sec, cdate_tz)
753 cdate_str = _local_git_date_str(cdate_sec)
# Assemble the commit header lines; each field is optional.
755 if tree: l.append('tree %s' % tree.encode('hex'))
756 if parent: l.append('parent %s' % parent.encode('hex'))
757 if author: l.append('author %s %s' % (author, adate_str))
758 if committer: l.append('committer %s %s' % (committer, cdate_str))
761 return self.maybe_write('commit', '\n'.join(l))
764 """Remove the pack file from disk."""
773 os.unlink(self.filename + '.pack')
# Finalize the current pack file: fix up the object count in the header,
# compute the pack checksum, write the .idx, and rename both files to
# their content-addressed names.  Returns None if no pack was open.
780 def _end(self, run_midx=True):
782 if not f: return None
789 # update object count
791 cp = struct.pack('!i', self.count)
795 # calculate the pack sha1sum
798 for b in chunkyreader(f):
800 packbin = sum.digest()
802 fdatasync(f.fileno())
806 obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
807 nameprefix = os.path.join(self.repo_dir,
808 'objects/pack/pack-' + obj_list_sha)
809 if os.path.exists(self.filename + '.map'):
810 os.unlink(self.filename + '.map')
811 os.rename(self.filename + '.pack', nameprefix + '.pack')
812 os.rename(self.filename + '.idx', nameprefix + '.idx')
# Flush the directory entry so the renames are durable.
814 os.fsync(self.parentfd)
816 os.close(self.parentfd)
819 auto_midx(os.path.join(self.repo_dir, 'objects/pack'))
821 if self.on_pack_finish:
822 self.on_pack_finish(nameprefix)
826 def close(self, run_midx=True):
827 """Close the pack file and move it to its definitive path."""
828 return self._end(run_midx=run_midx)
# Write a version-2 .idx file for the objects in `idx`, returning the
# hex sha of the sorted object list (used to name the pack).
830 def _write_pack_idx_v2(self, filename, idx, packbin):
# Count objects whose offsets need the 64-bit overflow table.
833 for entry in section:
834 if entry[2] >= 2**31:
837 # Length: header + fan-out + shas-and-crcs + overflow-offsets
838 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
840 idx_f = open(filename, 'w+b')
842 idx_f.truncate(index_len)
843 fdatasync(idx_f.fileno())
# The C helper fills in the mmap'd index body.
844 idx_map = mmap_readwrite(idx_f, close=False)
846 count = _helpers.write_idx(filename, idx_map, idx, self.count)
847 assert(count == self.count)
854 idx_f = open(filename, 'a+b')
# Re-read the file to compute the object-list sha and trailing checksum.
859 b = idx_f.read(8 + 4*256)
862 obj_list_sum = Sha1()
863 for b in chunkyreader(idx_f, 20*self.count):
865 obj_list_sum.update(b)
866 namebase = obj_list_sum.hexdigest()
868 for b in chunkyreader(idx_f):
870 idx_f.write(idx_sum.digest())
871 fdatasync(idx_f.fileno())
# Used as a subprocess preexec_fn factory: points GIT_DIR at the chosen
# repository before git runs.  NOTE(review): call sites pass _gitenv(...)
# itself as preexec_fn, so this apparently returns a closure that does
# the environment assignment — confirm against the elided lines.
877 def _gitenv(repo_dir = None):
881 os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
885 def list_refs(patterns=None, repo_dir=None,
886 limit_to_heads=False, limit_to_tags=False):
887 """Yield (refname, hash) tuples for all repository refs unless
888 patterns are specified. In that case, only include tuples for
889 refs matching those patterns (cf. git-show-ref(1)). The limits
890 restrict the result items to refs/heads or refs/tags. If both
891 limits are specified, items from both sources will be included.
894 argv = ['git', 'show-ref']
896 argv.append('--heads')
898 argv.append('--tags')
901 argv.extend(patterns)
902 p = subprocess.Popen(argv,
903 preexec_fn = _gitenv(repo_dir),
904 stdout = subprocess.PIPE)
905 out = p.stdout.read().strip()
906 rv = p.wait() # not fatal
# Each output line is "<hex sha> <refname>".
910 for d in out.split('\n'):
911 (sha, name) = d.split(' ', 1)
912 yield (name, sha.decode('hex'))
915 def read_ref(refname, repo_dir = None):
916 """Get the commit id of the most recent commit made on a given ref."""
917 refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
# Take up to two matches so ambiguity can be detected below.
918 l = tuple(islice(refs, 2))
926 def rev_list(ref, count=None, repo_dir=None):
927 """Generate a list of reachable commits in reverse chronological order.
929 This generator walks through commits, from child to parent, that are
930 reachable via the specified ref and yields a series of tuples of the form
933 If count is a non-zero integer, limit the number of commits to "count"
# Guard against ref being interpreted as a git option.
936 assert(not ref.startswith('-'))
938 if isinstance(count, Integral):
939 opts += ['-n', str(count)]
# %at prints the author timestamp used as the first tuple element.
942 argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
943 p = subprocess.Popen(argv,
944 preexec_fn = _gitenv(repo_dir),
945 stdout = subprocess.PIPE)
949 if s.startswith('commit '):
950 commit = s[7:].decode('hex')
954 rv = p.wait() # not fatal
956 raise GitError, 'git rev-list returned error %d' % rv
959 def get_commit_dates(refs, repo_dir=None):
960 """Get the dates for the specified commit refs. For now, every unique
961 string in refs must resolve to a different commit or this
962 function will fail."""
# Dates are the commits' author_sec values (UTC epoch seconds).
965 commit = get_commit_items(ref, cp(repo_dir))
966 result.append(commit.author_sec)
970 def rev_parse(committish, repo_dir=None):
971 """Resolve the full hash for 'committish', if it exists.
973 Should be roughly equivalent to 'git rev-parse'.
975 Returns the hex value of the hash if it is found, None if 'committish' does
976 not correspond to anything.
# First try committish as a ref name.
978 head = read_ref(committish, repo_dir=repo_dir)
980 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
# Otherwise, if it looks like a full 40-char hex id, check the indexes.
983 pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
985 if len(committish) == 40:
987 hash = committish.decode('hex')
# Move refname from oldval to newval via git update-ref, which verifies
# the old value before updating.
997 def update_ref(refname, newval, oldval, repo_dir=None):
998 """Update a repository reference."""
1001 assert(refname.startswith('refs/heads/') \
1002 or refname.startswith('refs/tags/'))
1003 p = subprocess.Popen(['git', 'update-ref', refname,
1004 newval.encode('hex'), oldval.encode('hex')],
1005 preexec_fn = _gitenv(repo_dir))
1006 _git_wait('git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Delete the reference `refname` (see git-update-ref(1)).

    If oldvalue is given, git verifies the ref's current value matches
    it before deleting.
    """
    assert(refname.startswith('refs/'))
    argv = ['git', 'update-ref', '-d', refname]
    if oldvalue:
        argv.append(oldvalue)
    p = subprocess.Popen(argv, preexec_fn=_gitenv())
    _git_wait('git update-ref', p)
1018 def guess_repo(path=None):
1019 """Set the path value in the global variable "repodir".
1020 This makes bup look for an existing bup repository, but not fail if a
1021 repository doesn't exist. Usually, if you are interacting with a bup
1022 repository, you would not be calling this function but using
1023 check_repo_or_die().
# Precedence: explicit path argument, then $BUP_DIR, then ~/.bup.
1029 repodir = os.environ.get('BUP_DIR')
1031 repodir = os.path.expanduser('~/.bup')
1034 def init_repo(path=None):
1035 """Create the Git bare repository for bup in a given path."""
# guess_repo()/repo() pick the destination; validate it before init.
1037 d = repo() # appends a / to the path
1038 parent = os.path.dirname(os.path.dirname(d))
1039 if parent and not os.path.exists(parent):
1040 raise GitError('parent directory "%s" does not exist\n' % parent)
1041 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1042 raise GitError('"%s" exists but is not a directory\n' % d)
1043 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1044 preexec_fn = _gitenv())
1045 _git_wait('git init', p)
1046 # Force the index version configuration in order to ensure bup works
1047 # regardless of the version of the installed Git binary.
1048 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1049 stdout=sys.stderr, preexec_fn = _gitenv())
1050 _git_wait('git config', p)
1052 p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
1053 stdout=sys.stderr, preexec_fn = _gitenv())
1054 _git_wait('git config', p)
1057 def check_repo_or_die(path=None):
1058 """Check to see if a bup repository probably exists, and abort if not."""
# A directory with objects/pack is treated as a valid repository.
1061 pst = stat_if_exists(top + '/objects/pack')
1062 if pst and stat.S_ISDIR(pst.st_mode):
1065 top_st = stat_if_exists(top)
1067 log('error: repository %r does not exist (see "bup help init")\n'
1070 log('error: %r is not a repository\n' % top)
1076 """Get Git's version and ensure a usable version is installed.
1078 The returned version is formatted as an ordered tuple with each position
1079 representing a digit in the version tag. For example, the following tuple
1080 would represent version 1.6.6.9:
1082 ('1', '6', '6', '9')
1086 p = subprocess.Popen(['git', '--version'],
1087 stdout=subprocess.PIPE)
1088 gvs = p.stdout.read()
1089 _git_wait('git --version', p)
1090 m = re.match(r'git version (\S+.\S+)', gvs)
1092 raise GitError('git --version weird output: %r' % gvs)
1093 _ver = tuple(m.group(1).split('.'))
1094 needed = ('1','5', '3', '1')
1096 raise GitError('git version %s or higher is required; you have %s'
1097 % ('.'.join(needed), '.'.join(_ver)))
# Iterator wrapper that can be aborted mid-stream; CatPipe uses this to
# recover when a caller stops consuming an object's data early.
1101 class _AbortableIter:
1102 def __init__(self, it, onabort = None):
1104 self.onabort = onabort
1112 return next(self.it)
1113 except StopIteration as e:
1121 """Abort iteration and call the abortion callback, if needed."""
1133 """Link to 'git cat-file' that is used to retrieve blob data."""
1134 def __init__(self, repo_dir = None):
1136 self.repo_dir = repo_dir
# git cat-file --batch requires at least git 1.5.6.
1137 wanted = ('1','5','6')
1139 log('error: git version must be at least 1.5.6\n')
# The cat-file subprocess is started lazily; see the restart machinery.
1141 self.p = self.inprogress = None
1145 self.p.stdout.close()
1146 self.p.stdin.close()
1148 self.inprogress = None
1152 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1153 stdin=subprocess.PIPE,
1154 stdout=subprocess.PIPE,
1157 preexec_fn = _gitenv(self.repo_dir))
1160 """Yield (oidx, type, size), followed by the data referred to by ref.
1161 If ref does not exist, only yield (None, None, None).
1164 if not self.p or self.p.poll() != None:
1167 poll_result = self.p.poll()
1168 assert(poll_result == None)
1170 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1171 assert(not self.inprogress)
1172 assert(ref.find('\n') < 0)
1173 assert(ref.find('\r') < 0)
1174 assert(not ref.startswith('-'))
1175 self.inprogress = ref
1176 self.p.stdin.write('%s\n' % ref)
1177 self.p.stdin.flush()
1178 hdr = self.p.stdout.readline()
1179 if hdr.endswith(' missing\n'):
1180 self.inprogress = None
1181 yield None, None, None
1183 info = hdr.split(' ')
1184 if len(info) != 3 or len(info[0]) != 40:
1185 raise GitError('expected object (id, type, size), got %r' % spl)
1186 oidx, typ, size = info
1188 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1189 onabort=self._abort)
1191 yield oidx, typ, size
1194 readline_result = self.p.stdout.readline()
1195 assert(readline_result == '\n')
1196 self.inprogress = None
1197 except Exception as e:
# Recursively yield the blob contents reachable from the object stream
# `it`: blobs directly, trees entry by entry, and a commit via its tree.
1201 def _join(self, it):
1202 _, typ, _ = next(it)
1207 treefile = ''.join(it)
1208 for (mode, name, sha) in tree_decode(treefile):
1209 for blob in self.join(sha.encode('hex')):
1211 elif typ == 'commit':
# The commit's first line is "tree <hex sha>".
1212 treeline = ''.join(it).split('\n')[0]
1213 assert(treeline.startswith('tree '))
1214 for blob in self.join(treeline[5:]):
1217 raise GitError('invalid object type %r: expected blob/tree/commit'
1221 """Generate a list of the content of all blobs that can be reached
1222 from an object. The hash given in 'id' must point to a blob, a tree
1223 or a commit. The content of all blobs that can be seen from trees or
1224 commits will be added to the list.
1227 for d in self._join(self.get(id)):
1229 except StopIteration:
1235 def cp(repo_dir=None):
1236 """Create a CatPipe object or reuse the already existing one."""
# One CatPipe is cached per absolute repository path in _cp.
1239 repo_dir = repodir or repo()
1240 repo_dir = os.path.abspath(repo_dir)
1241 cp = _cp.get(repo_dir)
1243 cp = CatPipe(repo_dir)
1248 def tags(repo_dir = None):
1249 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1251 for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
1252 assert(n.startswith('refs/tags/'))
1256 tags[c].append(name) # more than one tag can point at 'c'
# Raised when an object referenced during a repository walk is absent.
1260 class MissingObject(KeyError):
1261 def __init__(self, oid):
1263 KeyError.__init__(self, 'object %r is missing' % oid.encode('hex'))
# One traversal result yielded by walk_object() below.
1266 WalkItem = namedtuple('WalkItem', ['id', 'type', 'mode',
1267 'path', 'chunk_path', 'data'])
1268 # The path is the mangled path, and if an item represents a fragment
1269 # of a chunked file, the chunk_path will be the chunked subtree path
1270 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1271 # chunked file will have a chunk_path of ['']. So some chunk subtree
1272 # of the file '/foo/bar/baz' might look like this:
1274 # item.path = ['foo', 'bar', 'baz.bup']
1275 # item.chunk_path = ['', '2d3115e', '016b097']
1276 # item.type = 'tree'
1280 def walk_object(cat_pipe, oidx,
1283 """Yield everything reachable from oidx via cat_pipe as a WalkItem,
1284 stopping whenever stop_at(oidx) returns true. Throw MissingObject
1285 if a hash encountered is missing from the repository, and don't
1286 read or return blob content in the data field unless include_data
1289 # Maintain the pending stack on the heap to avoid stack overflow
1290 pending = [(oidx, [], [], None)]
1292 oidx, parent_path, chunk_path, mode = pending.pop()
1293 if stop_at and stop_at(oidx):
1296 if (not include_data) and mode and stat.S_ISREG(mode):
1297 # If the object is a "regular file", then it's a leaf in
1298 # the graph, so we can skip reading the data if the caller
1299 # hasn't requested it.
1300 yield WalkItem(id=oidx, type='blob',
1301 chunk_path=chunk_path, path=parent_path,
1306 item_it = cat_pipe.get(oidx)
1307 get_oidx, typ, _ = next(item_it)
1309 raise MissingObject(oidx.decode('hex'))
1310 if typ not in ('blob', 'commit', 'tree'):
1311 raise Exception('unexpected repository object type %r' % typ)
1313 # FIXME: set the mode based on the type when the mode is None
1314 if typ == 'blob' and not include_data:
1315 # Dump data until we can ask cat_pipe not to fetch it
1316 for ignored in item_it:
1320 data = ''.join(item_it)
1322 yield WalkItem(id=oidx, type=typ,
1323 chunk_path=chunk_path, path=parent_path,
1325 data=(data if include_data else None))
# For commits: queue each parent and the commit's own tree.
1328 commit_items = parse_commit(data)
1329 for pid in commit_items.parents:
1330 pending.append((pid, parent_path, chunk_path, mode))
1331 pending.append((commit_items.tree, parent_path, chunk_path,
1332 hashsplit.GIT_MODE_TREE))
# For trees: queue each entry, tracking chunked-file subtree paths.
1334 for mode, name, ent_id in tree_decode(data):
1335 demangled, bup_type = demangle_name(name, mode)
1337 sub_path = parent_path
1338 sub_chunk_path = chunk_path + [name]
1340 sub_path = parent_path + [name]
1341 if bup_type == BUP_CHUNKED:
1342 sub_chunk_path = ['']
1344 sub_chunk_path = chunk_path
1345 pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,