1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
from collections import namedtuple
from itertools import islice

from bup import _helpers, hashsplit, path, midx, bloom, xstat
from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
                         fdatasync,
                         hostname, localtime, log, merge_iter,
                         mmap_read, mmap_readwrite,
                         progress, qprogress, stat_if_exists,
                         unlink, username, userfullname,
                         utc_offset_str)

max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object

verbose = 0
ignore_midx = 0
repodir = None  # The default repository, once initialized
_typemap = {'blob': 3, 'tree': 2, 'commit': 1, 'tag': 4}
_typermap = {3: 'blob', 2: 'tree', 1: 'commit', 4: 'tag'}

_total_searches = 0
_total_steps = 0

class GitError(Exception):
    pass


def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%s returned %d' % (cmd, rv))

def _git_capture(argv):
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r

def parse_tz_offset(s):
    """Parse a Git-style timezone string (e.g. '+0500') and return the
    UTC offset in seconds."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if s[0] == '-':
        return - tz_off
    return tz_off
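# Illustrative examples (not in the original source); '+0530' is five and a
# half hours east of UTC:
#
#   >>> parse_tz_offset('+0530')
#   19800
#   >>> parse_tz_offset('-0100')
#   -3600
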
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
_content_char = r'[^\0\n<>]'
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _content_char, _start_end_char)
_tz_rx = r'[-+]\d\d[0-5]\d'
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx))
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])

def parse_commit(content):
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
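# Illustrative example (not in the original source): a minimal commit body
# as produced by 'git cat-file commit <id>':
#
#   >>> text = ('tree ' + '0' * 40 + '\n'
#   ...         'author A U Thor <a@example.com> 1234567890 +0000\n'
#   ...         'committer A U Thor <a@example.com> 1234567890 +0000\n'
#   ...         '\n'
#   ...         'Fix everything.\n')
#   >>> ci = parse_commit(text)
#   >>> ci.author_sec, ci.committer_offset, ci.message
#   (1234567890, 0, 'Fix everything.\n')
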
def get_commit_items(id, cp):
    commit_it = cp.get(id)
    assert(commit_it.next() == 'commit')
    commit_content = ''.join(commit_it)
    return parse_commit(commit_content)

def _local_git_date_str(epoch_sec):
    return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))

def _git_date_str(epoch_sec, tz_offset_sec):
    offs = tz_offset_sec // 60
    return '%d %s%02d%02d' \
        % (epoch_sec,
           '+' if offs >= 0 else '-',
           abs(offs) // 60,
           abs(offs) % 60)
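# Illustrative examples (not in the original source):
#
#   >>> _git_date_str(1234567890, 19800)   # UTC+05:30
#   '1234567890 +0530'
#   >>> _git_date_str(1234567890, -3600)   # UTC-01:00
#   '1234567890 -0100'
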
def repo(sub = '', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, '.git')
    if os.path.exists(gd):
        repo_dir = gd

    return os.path.join(repo_dir, sub)

def shorten_hash(s):
    return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
                  r'\1\2*\3', s)

def repo_rel(path):
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(''))
    if not fullrepo.endswith('/'):
        fullrepo += '/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith('index-cache/'):
        path = path[len('index-cache/'):]
    return shorten_hash(path)

def all_packdirs():
    paths = [repo('objects/pack')]
    paths += glob.glob(repo('index-cache/*/.'))
    return paths

def auto_midx(objdir):
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        raise
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        raise
    if rv:
        add_error('%r: returned %d' % (args, rv))

def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
    else:
        return name
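# Illustrative examples (not in the original source); 0100644 is a regular
# file mode, 040000 a tree:
#
#   >>> mangle_name('foo', 0100644, 040000)    # regular file stored chunked
#   'foo.bup'
#   >>> mangle_name('foo.bup', 0100644, 0100644)
#   'foo.bupl'
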
(BUP_NORMAL, BUP_CHUNKED) = (0, 1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is one
    of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name().
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith('.bupm'):
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    else:
        return (name, BUP_NORMAL)
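# Illustrative examples (not in the original source):
#
#   >>> demangle_name('foo.bup', 040000) == ('foo', BUP_CHUNKED)
#   True
#   >>> demangle_name('foo.bupl', 0100644) == ('foo', BUP_NORMAL)
#   True
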
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = '%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
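# Illustrative example (not in the original source): the well-known hash of
# an empty Git blob:
#
#   >>> calc_hash('blob', '').encode('hex')
#   'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
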
def shalist_item_sort_key(ent):
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
        # git sorts directory names as if they ended with '/'
        return name + '/'
    else:
        return name

def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = '%o %s\0%s' % (mode,name,bin)
        assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return ''.join(l)

def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        z = buf.find('\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(' ', 1)
        assert(len(spl) == 2)
        mode, name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)
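# Illustrative round trip (not in the original source), using a dummy
# 20-byte sha:
#
#   >>> buf = tree_encode([(0100644, 'a', '\0' * 20)])
#   >>> [(oct(mode), name) for mode, name, sha in tree_decode(buf)]
#   [('0100644', 'a')]
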
def _encode_packobj(type, content, compression_level=1):
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = ''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()

def _encode_looseobj(type, content, compression_level=1):
    z = zlib.compressobj(compression_level)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()

def _decode_looseobj(buf):
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find('\0')
    assert(i > 0)
    l = s[:i].split(' ')
    assert(len(l) == 2)
    assert(l[1].isdigit())
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)

def _decode_packobj(buf):
    assert(buf)
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
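# Illustrative round trip (not in the original source):
#
#   >>> _decode_packobj(''.join(_encode_packobj('blob', 'hello')))
#   ('blob', 'hello')
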
class PackIdx:
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx != None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True
        return None

    def __len__(self):
        return int(self.fanout[255])

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = ord(hash[0])
        start = self.fanout[b1-1]  # range -1..254
        end = self.fanout[b1]  # range 0..255
        want = str(hash)
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end-start)/2
            v = self._idx_to_hash(mid)
            if v < want:
                start = mid+1
            elif v > want:
                end = mid
            else:  # got it!
                return mid
        return None

class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*24+4 : idx*24+24])

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)

class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
                               nsha*4)
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        return ofs

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)

_mpi_count = 0

class PackIdxList:
    def __init__(self, dir):
        global _mpi_count
        assert(_mpi_count == 0)  # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of
        their contents are in another, bigger .midx file) and removes the
        superseded files.

        If skip_midx is True, all work on .midx files will be skipped and
        .midx files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None  # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in glob.glob(os.path.join(self.dir, '*.midx')):
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            mx.close()
                            unlink(full)
                        else:
                            midxl.append(mx)
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        ix.close()
                        unlink(ix.name)
            for full in glob.glob(os.path.join(self.dir, '*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs) != 1 and 'es' or ''))

553 """Insert an additional object in the list."""
def open_idx(filename):
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')

def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)

def _make_objcache():
    return PackIdxList(repo('objects/pack'))

# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.

class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None):
        self.file = None
        self.parentfd = None
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.idx = None
        self.objcache = None
        self.objcache_maker = objcache_maker
        self.compression_level = compression_level
        self.run_midx = run_midx
        self.on_pack_finish = on_pack_finish

    def _open(self):
        if not self.file:
            objdir = repo('objects')
            fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                os.close(fd)
                os.unlink(name)
                raise
            try:
                self.parentfd = os.open(objdir, os.O_RDONLY)
            except:
                f = self.file
                self.file = None
                f.close()
                raise
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            self.file.write('PACK\0\0\0\2\0\0\0\0')
            self.idx = list(list() for i in xrange(256))

    def _raw_write(self, datalist, sha):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        assert(sha)
        if self.file:
            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def just_write(self, sha, type, content):
        """Write an object to the pack file, bypassing the objcache."""
        self._write(sha, type, content)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self.just_write(sha, type, content)
            self._require_objcache()
            self.objcache.add(sha)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write('tree', content)

    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
                   msg):
        """Create a commit object in the pack.  The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        if adate_tz is not None:
            adate_str = _git_date_str(adate_sec, adate_tz)
        else:
            adate_str = _local_git_date_str(adate_sec)
        if cdate_tz is not None:
            cdate_str = _git_date_str(cdate_sec, cdate_tz)
        else:
            cdate_str = _local_git_date_str(cdate_sec)
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, adate_str))
        if committer: l.append('committer %s %s' % (committer, cdate_str))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))
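    # Illustrative sketch (not from the original file; the hashes shown are
    # placeholders): the text assembled above is the same format that
    # parse_commit() accepts, e.g.
    #
    #   tree 9c2ca98f...
    #   parent 1f9530c8...
    #   author A U Thor <a@example.com> 1234567890 +0000
    #   committer A U Thor <a@example.com> 1234567890 +0000
    #
    #   <message>
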
738 """Remove the pack file from disk."""
747 os.unlink(self.filename + '.pack')
    def _end(self, run_midx=True):
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)

        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(repo('objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)
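    # A minimal usage sketch (illustrative, not from the original file;
    # assumes check_repo_or_die() has found an initialized repository):
    #
    #   w = PackWriter()
    #   sha = w.new_blob('hello world')   # deduplicated via the objcache
    #   w.close()                         # finish pack, write .idx, maybe midx
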
    def _write_pack_idx_v2(self, filename, idx, packbin):
        ofs64_count = 0
        for section in idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, idx, self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()

def _gitenv(repo_dir = None):
    if not repo_dir:
        repo_dir = repo()
    def env():
        os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
    return env

def list_refs(refnames=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    refnames are specified.  In that case, only include tuples for
    those refs.  The limits restrict the result items to refs/heads or
    refs/tags.  If both limits are specified, items from both sources
    will be included.
    """
    argv = ['git', 'show-ref']
    if limit_to_heads:
        argv.append('--heads')
    if limit_to_tags:
        argv.append('--tags')
    argv.append('--')
    if refnames:
        argv += refnames
    p = subprocess.Popen(argv,
                         preexec_fn = _gitenv(repo_dir),
                         stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))
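# Illustrative usage (not in the original source):
#
#   for name, sha in list_refs(limit_to_heads=True):
#       print name, sha.encode('hex')
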
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(refnames=[refname], repo_dir=repo_dir, limit_to_heads=True)
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    else:
        return None

def rev_list(ref, count=None, repo_dir=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date, hash).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    assert(not ref.startswith('-'))
    opts = []
    if count:
        opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
    p = subprocess.Popen(argv,
                         preexec_fn = _gitenv(repo_dir),
                         stdout = subprocess.PIPE)
    commit = None
    for row in p.stdout:
        s = row.strip()
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        else:
            date = int(s)
            yield (date, commit)
    rv = p.wait()  # not fatal
    if rv:
        raise GitError, 'git rev-list returned error %d' % rv

def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs.  For now, every unique
    string in refs must resolve to a different commit or this
    function will fail."""
    result = []
    for ref in refs:
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
    return result

def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
        return head

    pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            hash = committish.decode('hex')
        except TypeError:
            return None
        if pL.exists(hash):
            return hash

    return None

def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    if not oldval:
        oldval = ''
    assert(refname.startswith('refs/heads/') \
           or refname.startswith('refs/tags/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv(repo_dir))
    _git_wait('git update-ref', p)

def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert(refname.startswith('refs/'))
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen(['git', 'update-ref', '-d', refname] + oldvalue,
                         preexec_fn = _gitenv())
    _git_wait('git update-ref', p)

def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = os.environ.get('BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser('~/.bup')

def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv())
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv())
    _git_wait('git config', p)
    # Enable the reflog.
    p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
                         stdout=sys.stderr, preexec_fn = _gitenv())
    _git_wait('git config', p)

def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + '/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % top)
            sys.exit(15)
    log('error: %r is not a repository\n' % top)
    sys.exit(14)

1048 """Get Git's version and ensure a usable version is installed.
1050 The returned version is formatted as an ordered tuple with each position
1051 representing a digit in the version tag. For example, the following tuple
1052 would represent version 1.6.6.9:
1054 ('1', '6', '6', '9')
1058 p = subprocess.Popen(['git', '--version'],
1059 stdout=subprocess.PIPE)
1060 gvs = p.stdout.read()
1061 _git_wait('git --version', p)
1062 m = re.match(r'git version (\S+.\S+)', gvs)
1064 raise GitError('git --version weird output: %r' % gvs)
1065 _ver = tuple(m.group(1).split('.'))
1066 needed = ('1','5', '3', '1')
1068 raise GitError('git version %s or higher is required; you have %s'
1069 % ('.'.join(needed), '.'.join(_ver)))
class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def next(self):
        try:
            return self.it.next()
        except StopIteration as e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()

class MissingObject(KeyError):
    def __init__(self, id):
        self.id = id
        KeyError.__init__(self, 'object %r is missing' % id.encode('hex'))

1111 """Link to 'git cat-file' that is used to retrieve blob data."""
1112 def __init__(self, repo_dir = None):
1114 self.repo_dir = repo_dir
1115 wanted = ('1','5','6')
1118 log('warning: git version < %s; bup will be slow.\n'
1121 self.get = self._slow_get
1123 self.p = self.inprogress = None
1124 self.get = self._fast_get
    def _abort(self):
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv(self.repo_dir))

    def _fast_get(self, id):
        if not self.p or self.p.poll() != None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result == None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open\n'
                % (id, self.inprogress))
        assert(not self.inprogress)
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise MissingObject(id.decode('hex'))
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        try:
            yield type
            for blob in it:
                yield blob
            readline_result = self.p.stdout.readline()
            assert(readline_result == '\n')
            self.inprogress = None
        except Exception as e:
            it.abort()
            raise

    def _slow_get(self, id):
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        yield type

        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv(self.repo_dir))
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        type = it.next()
        if type == 'blob':
            for blob in it:
                yield blob
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif type == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % type)

1214 """Generate a list of the content of all blobs that can be reached
1215 from an object. The hash given in 'id' must point to a blob, a tree
1216 or a commit. The content of all blobs that can be seen from trees or
1217 commits will be added to the list.
1220 for d in self._join(self.get(id)):
1222 except StopIteration:
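# Illustrative usage (not in the original source; some_hex_id is a
# placeholder for a 40-character hex object id):
#
#   cat = CatPipe()
#   it = cat.get(some_hex_id)   # first item yielded is the object type
#   type = it.next()
#   data = ''.join(it)
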
_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp

def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(n.startswith('refs/tags/'))
        name = n[10:]
        if not c in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags
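# Illustrative usage (not in the original source; commit_binsha is a
# placeholder for a 20-byte binary commit hash):
#
#   names = tags().get(commit_binsha, [])
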
WalkItem = namedtuple('WalkItem', ['id', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...

def walk_object(cat_pipe, id,
                stop_at=None,
                include_data=None):
    """Yield everything reachable from id via cat_pipe as a WalkItem,
    stopping whenever stop_at(id) returns true.  Throw MissingObject
    if a hash encountered is missing from the repository, and don't
    read or return blob content in the data field unless include_data
    is set.
    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(id, [], [], None)]
    while len(pending):
        id, parent_path, chunk_path, mode = pending.pop()
        if stop_at and stop_at(id):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(id=id, type='blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = cat_pipe.get(id)
        type = item_it.next()
        if type not in ('blob', 'commit', 'tree'):
            raise Exception('unexpected repository object type %r' % type)

        # FIXME: set the mode based on the type when the mode is None
        if type == 'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = ''.join(item_it)

        yield WalkItem(id=id, type=type,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if type == 'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif type == 'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = ['']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,
                                mode))
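# Illustrative usage (not in the original source; ref_hex is a placeholder
# for a 40-character hex commit id):
#
#   for item in walk_object(cp(), ref_hex, include_data=True):
#       log('%s %s\n' % (item.type, '/'.join(item.path)))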