1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
7 from collections import namedtuple
8 from itertools import islice
10 from bup import _helpers, hashsplit, path, midx, bloom, xstat
11 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
13 hostname, localtime, log, merge_iter,
14 mmap_read, mmap_readwrite,
15 progress, qprogress, unlink, username, userfullname,
19 max_pack_size = 1000*1000*1000 # larger packs will slow down pruning
20 max_pack_objects = 200*1000 # cache memory usage is about 83 bytes per object
26 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
27 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
33 class GitError(Exception):
def parse_tz_offset(s):
    """Return a git timezone string like '+0130' as a UTC offset in seconds.

    s is '[-+]HHMM'; a leading '-' makes the result negative.
    """
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if s[0] == '-':
        return - tz_off
    return tz_off
45 # FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
46 # Make sure that's authoritative.
47 _start_end_char = r'[^ .,:;<>"\'\0\n]'
48 _content_char = r'[^\0\n<>]'
49 _safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
51 _start_end_char, _content_char, _start_end_char)
52 _tz_rx = r'[-+]\d\d[0-5]\d'
53 _parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
54 _commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
55 (?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
56 committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)
58 (?P<message>(?:.|\n)*)''' % (_parent_rx,
59 _safe_str_rx, _safe_str_rx, _tz_rx,
60 _safe_str_rx, _safe_str_rx, _tz_rx))
61 _parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
64 # Note that the author_sec and committer_sec values are (UTC) epoch seconds.
65 CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
66 'author_name', 'author_mail',
67 'author_sec', 'author_offset',
68 'committer_name', 'committer_mail',
69 'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse the text of a git commit object into a CommitInfo namedtuple.

    Raises Exception if content doesn't match _commit_rx.
    """
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    # The *_sec fields are (UTC) epoch seconds; *_offset are tz seconds.
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
def get_commit_items(id, cp):
    """Fetch commit id through cat-pipe cp and return its parsed CommitInfo."""
    item_it = cp.get(id)
    assert(item_it.next() == 'commit')
    return parse_commit(''.join(item_it))
def _local_git_date_str(epoch_sec):
    """Return git's '<epoch-sec> <tz>' date string using the local TZ offset."""
    # utc_offset_str (from bup.helpers) renders the local offset for epoch_sec.
    return '%d %s' % (epoch_sec, utc_offset_str(epoch_sec))
101 def _git_date_str(epoch_sec, tz_offset_sec):
102 offs = tz_offset_sec // 60
103 return '%d %s%02d%02d' \
105 '+' if offs >= 0 else '-',
def repo(sub = '', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    global repodir
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, '.git')
    if os.path.exists(gd):
        repo_dir = gd
    return os.path.join(repo_dir, sub)
126 return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',
131 full = os.path.abspath(path)
132 fullrepo = os.path.abspath(repo(''))
133 if not fullrepo.endswith('/'):
135 if full.startswith(fullrepo):
136 path = full[len(fullrepo):]
137 if path.startswith('index-cache/'):
138 path = path[len('index-cache/'):]
139 return shorten_hash(path)
143 paths = [repo('objects/pack')]
144 paths += glob.glob(repo('index-cache/*/.'))
def auto_midx(objdir):
    """Run 'bup midx --auto' and 'bup bloom' over objdir, best-effort.

    Failures are recorded via add_error() rather than raised, except OS-level
    exec failures which are re-raised after being logged.
    """
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        raise
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        raise
    if rv:
        add_error('%r: returned %d' % (args, rv))
170 def mangle_name(name, mode, gitmode):
171 """Mangle a file name to present an abstract name for segmented files.
172 Mangled file names will have the ".bup" extension added to them. If a
173 file's name already ends with ".bup", a ".bupl" extension is added to
174 disambiguate normal files from segmented ones.
176 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
177 assert(stat.S_ISDIR(gitmode))
179 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
180 return name + '.bupl'
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of
    the following:

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith('.bupm'):
        # Metadata entries: chunked when the entry is a directory (tree).
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    else:
        return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes '<type> <size>\0' followed by the raw content.
    header = '%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
def shalist_item_sort_key(ent):
    """Sort key for (mode,name,hash) tree entries, matching git's ordering:
    directories sort as if their name had a trailing slash."""
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
        return name + '/'
    else:
        return name
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key = shalist_item_sort_key)
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = '%o %s\0%s' % (mode,name,bin)
        assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return ''.join(l)
def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        # Each entry is '<octal-mode> <name>\0<20-byte-sha>'.
        z = buf.find('\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(' ', 1)
        assert(len(spl) == 2)
        mode,name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    """Yield the pack-format encoding of one object: a variable-length
    size/type header followed by zlib-compressed content."""
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = ''
    sz = len(content)
    # Low nibble of the first byte is size bits 0-3; bits 4-6 are the type.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80  # continuation bit: more size bytes follow
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()
def _encode_looseobj(type, content, compression_level=1):
    """Yield the loose-object encoding: zlib('<type> <size>\\0' + content).

    The final flush() is required or the zlib stream would be truncated.
    """
    z = zlib.compressobj(compression_level)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()
def _decode_looseobj(buf):
    """Decode a loose git object; return (type, content)."""
    s = zlib.decompress(buf)
    # Header is '<type> <size>\0' before the content.
    i = s.find('\0')
    assert(i > 0)
    l = s[:i].split(' ')
    assert(len(l) == 2)
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    """Decode a pack-encoded object; return (type, content)."""
    assert(buf)
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    # Size is a little-endian base-128 varint: low nibble first, then
    # 7 bits per continuation byte while the 0x80 bit is set.
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
316 def find_offset(self, hash):
317 """Get the offset of an object inside the index file."""
318 idx = self._idx_from_hash(hash)
320 return self._ofs_from_idx(idx)
323 def exists(self, hash, want_source=False):
324 """Return nonempty if the object exists in this index."""
325 if hash and (self._idx_from_hash(hash) != None):
326 return want_source and os.path.basename(self.name) or True
330 return int(self.fanout[255])
332 def _idx_from_hash(self, hash):
333 global _total_searches, _total_steps
335 assert(len(hash) == 20)
337 start = self.fanout[b1-1] # range -1..254
338 end = self.fanout[b1] # range 0..255
340 _total_steps += 1 # lookup table is a step
343 mid = start + (end-start)/2
344 v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # v1 layout: 256-entry fanout, then 24-byte (ofs32, sha) entries.
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        # First 4 bytes of each entry are the pack offset.
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        # Remaining 20 bytes are the sha1.
        return str(self.shatable[idx*24+4 : idx*24+24])

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Magic '\377tOc' followed by version 2.
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        # v2 layout: shas, then crcs (4 bytes each), then 32-bit offsets,
        # then the 64-bit offset overflow table.
        self.sha_ofs = 8 + 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
                               nsha*4)
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        if ofs & 0x80000000:
            # High bit set: the real 64-bit offset is in the overflow table.
            idx64 = ofs & 0x7fffffff
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        return ofs

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
415 def __init__(self, dir):
417 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
422 self.do_bloom = False
429 assert(_mpi_count == 0)
432 return iter(idxmerge(self.packs))
435 return sum(len(pack) for pack in self.packs)
437 def exists(self, hash, want_source=False):
438 """Return nonempty if the object exists in the index files."""
439 global _total_searches
441 if hash in self.also:
443 if self.do_bloom and self.bloom:
444 if self.bloom.exists(hash):
445 self.do_bloom = False
447 _total_searches -= 1 # was counted by bloom
449 for i in xrange(len(self.packs)):
451 _total_searches -= 1 # will be incremented by sub-pack
452 ix = p.exists(hash, want_source=want_source)
454 # reorder so most recently used packs are searched first
455 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
460 def refresh(self, skip_midx = False):
461 """Refresh the index list.
462 This method verifies if .midx files were superseded (e.g. all of its
463 contents are in another, bigger .midx file) and removes the superseded
466 If skip_midx is True, all work on .midx files will be skipped and .midx
467 files will be removed from the list.
469 The module-global variable 'ignore_midx' can force this function to
470 always act as if skip_midx was True.
472 self.bloom = None # Always reopen the bloom as it may have been relaced
473 self.do_bloom = False
474 skip_midx = skip_midx or ignore_midx
475 d = dict((p.name, p) for p in self.packs
476 if not skip_midx or not isinstance(p, midx.PackMidx))
477 if os.path.exists(self.dir):
480 for ix in self.packs:
481 if isinstance(ix, midx.PackMidx):
482 for name in ix.idxnames:
483 d[os.path.join(self.dir, name)] = ix
484 for full in glob.glob(os.path.join(self.dir,'*.midx')):
486 mx = midx.PackMidx(full)
487 (mxd, mxf) = os.path.split(mx.name)
489 for n in mx.idxnames:
490 if not os.path.exists(os.path.join(mxd, n)):
491 log(('warning: index %s missing\n' +
492 ' used by %s\n') % (n, mxf))
500 midxl.sort(key=lambda ix:
501 (-len(ix), -xstat.stat(ix.name).st_mtime))
504 for sub in ix.idxnames:
505 found = d.get(os.path.join(self.dir, sub))
506 if not found or isinstance(found, PackIdx):
507 # doesn't exist, or exists but not in a midx
512 for name in ix.idxnames:
513 d[os.path.join(self.dir, name)] = ix
514 elif not ix.force_keep:
515 debug1('midx: removing redundant: %s\n'
516 % os.path.basename(ix.name))
519 for full in glob.glob(os.path.join(self.dir,'*.idx')):
523 except GitError as e:
527 bfull = os.path.join(self.dir, 'bup.bloom')
528 if self.bloom is None and os.path.exists(bfull):
529 self.bloom = bloom.ShaBloom(bfull)
530 self.packs = list(set(d.values()))
531 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
532 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
536 debug1('PackIdxList: using %d index%s.\n'
537 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
540 """Insert an additional object in the list."""
def open_idx(filename):
    """Open a .idx or .midx file and return the matching index object.

    Raises GitError for unrecognized headers or unsupported versions.
    """
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            # No magic prefix: a version-1 (legacy) index.
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    # Default objcache factory for PackWriter: index over all local packs.
    return PackIdxList(repo('objects/pack'))
580 # bup-gc assumes that it can disable all PackWriter activities
581 # (bloom/midx/cache) via the constructor and close() arguments.
584 """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None):
        # objcache_maker: zero-arg factory for the existence-check index;
        # bup-gc disables all objcache/midx/bloom work via these arguments.
        self.objcache_maker = objcache_maker
        # zlib level (0-9) applied to every object written into the pack.
        self.compression_level = compression_level
        self.run_midx=run_midx
        # Optional callback invoked when a pack is finished and renamed.
        self.on_pack_finish = on_pack_finish
604 objdir = dir=repo('objects')
605 fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
607 self.file = os.fdopen(fd, 'w+b')
612 self.parentfd = os.open(objdir, os.O_RDONLY)
618 assert(name.endswith('.pack'))
619 self.filename = name[:-5]
620 self.file.write('PACK\0\0\0\2\0\0\0\0')
621 self.idx = list(list() for i in xrange(256))
623 def _raw_write(self, datalist, sha):
626 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
627 # the file never has a *partial* blob. So let's make sure it's
628 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
629 # to our hashsplit algorithm.) f.write() does its own buffering,
630 # but that's okay because we'll flush it in _end().
631 oneblob = ''.join(datalist)
635 raise GitError, e, sys.exc_info()[2]
637 crc = zlib.crc32(oneblob) & 0xffffffff
638 self._update_idx(sha, crc, nw)
    def _update_idx(self, sha, crc, size):
        # Bucket by the sha's first byte (mirrors the .idx fanout layout);
        # each entry is (sha, crc, offset-of-object-start-in-pack).
        self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
648 def _write(self, sha, type, content):
652 sha = calc_hash(type, content)
653 size, crc = self._raw_write(_encode_packobj(type, content,
654 self.compression_level),
656 if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
660 def breakpoint(self):
661 """Clear byte and object counts and return the last processed id."""
662 id = self._end(self.run_midx)
663 self.outbytes = self.count = 0
666 def _require_objcache(self):
667 if self.objcache is None and self.objcache_maker:
668 self.objcache = self.objcache_maker()
669 if self.objcache is None:
671 "PackWriter not opened or can't check exists w/o objcache")
673 def exists(self, id, want_source=False):
674 """Return non-empty if an object is found in the object cache."""
675 self._require_objcache()
676 return self.objcache.exists(id, want_source=want_source)
    def just_write(self, sha, type, content):
        """Write an object to the pack file, bypassing the objcache.  Fails if
        sha exists in the objcache."""
        # NOTE(review): the tail of this docstring was truncated in this copy;
        # verify the exact wording against upstream.
        self._write(sha, type, content)
683 def maybe_write(self, type, content):
684 """Write an object to the pack file if not present and return its id."""
685 sha = calc_hash(type, content)
686 if not self.exists(sha):
687 self.just_write(sha, type, content)
688 self._require_objcache()
689 self.objcache.add(sha)
692 def new_blob(self, blob):
693 """Create a blob object in the pack with the supplied content."""
694 return self.maybe_write('blob', blob)
696 def new_tree(self, shalist):
697 """Create a tree object in the pack."""
698 content = tree_encode(shalist)
699 return self.maybe_write('tree', content)
701 def new_commit(self, tree, parent,
702 author, adate_sec, adate_tz,
703 committer, cdate_sec, cdate_tz,
705 """Create a commit object in the pack. The date_sec values must be
706 epoch-seconds, and if a tz is None, the local timezone is assumed."""
708 adate_str = _git_date_str(adate_sec, adate_tz)
710 adate_str = _local_git_date_str(adate_sec)
712 cdate_str = _git_date_str(cdate_sec, cdate_tz)
714 cdate_str = _local_git_date_str(cdate_sec)
716 if tree: l.append('tree %s' % tree.encode('hex'))
717 if parent: l.append('parent %s' % parent.encode('hex'))
718 if author: l.append('author %s %s' % (author, adate_str))
719 if committer: l.append('committer %s %s' % (committer, cdate_str))
722 return self.maybe_write('commit', '\n'.join(l))
725 """Remove the pack file from disk."""
734 os.unlink(self.filename + '.pack')
741 def _end(self, run_midx=True):
743 if not f: return None
750 # update object count
752 cp = struct.pack('!i', self.count)
756 # calculate the pack sha1sum
759 for b in chunkyreader(f):
761 packbin = sum.digest()
763 fdatasync(f.fileno())
767 obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
769 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
770 if os.path.exists(self.filename + '.map'):
771 os.unlink(self.filename + '.map')
772 os.rename(self.filename + '.pack', nameprefix + '.pack')
773 os.rename(self.filename + '.idx', nameprefix + '.idx')
775 os.fsync(self.parentfd)
777 os.close(self.parentfd)
780 auto_midx(repo('objects/pack'))
782 if self.on_pack_finish:
783 self.on_pack_finish(nameprefix)
    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        # Delegates to _end(), which finalizes the pack/idx files.
        return self._end(run_midx=run_midx)
791 def _write_pack_idx_v2(self, filename, idx, packbin):
794 for entry in section:
795 if entry[2] >= 2**31:
798 # Length: header + fan-out + shas-and-crcs + overflow-offsets
799 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
801 idx_f = open(filename, 'w+b')
803 idx_f.truncate(index_len)
804 fdatasync(idx_f.fileno())
805 idx_map = mmap_readwrite(idx_f, close=False)
807 count = _helpers.write_idx(filename, idx_map, idx, self.count)
808 assert(count == self.count)
815 idx_f = open(filename, 'a+b')
820 b = idx_f.read(8 + 4*256)
823 obj_list_sum = Sha1()
824 for b in chunkyreader(idx_f, 20*self.count):
826 obj_list_sum.update(b)
827 namebase = obj_list_sum.hexdigest()
829 for b in chunkyreader(idx_f):
831 idx_f.write(idx_sum.digest())
832 fdatasync(idx_f.fileno())
838 def _gitenv(repo_dir = None):
842 os.environ['GIT_DIR'] = os.path.abspath(repo_dir)
846 def list_refs(refnames=None, repo_dir=None,
847 limit_to_heads=False, limit_to_tags=False):
848 """Yield (refname, hash) tuples for all repository refs unless
849 refnames are specified. In that case, only include tuples for
850 those refs. The limits restrict the result items to refs/heads or
851 refs/tags. If both limits are specified, items from both sources
855 argv = ['git', 'show-ref']
857 argv.append('--heads')
859 argv.append('--tags')
863 p = subprocess.Popen(argv,
864 preexec_fn = _gitenv(repo_dir),
865 stdout = subprocess.PIPE)
866 out = p.stdout.read().strip()
867 rv = p.wait() # not fatal
871 for d in out.split('\n'):
872 (sha, name) = d.split(' ', 1)
873 yield (name, sha.decode('hex'))
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(refnames=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Take at most two items so we can detect an ambiguous refname.
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    else:
        return None
887 def rev_list(ref, count=None, repo_dir=None):
888 """Generate a list of reachable commits in reverse chronological order.
890 This generator walks through commits, from child to parent, that are
891 reachable via the specified ref and yields a series of tuples of the form
894 If count is a non-zero integer, limit the number of commits to "count"
897 assert(not ref.startswith('-'))
900 opts += ['-n', str(atoi(count))]
901 argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
902 p = subprocess.Popen(argv,
903 preexec_fn = _gitenv(repo_dir),
904 stdout = subprocess.PIPE)
908 if s.startswith('commit '):
909 commit = s[7:].decode('hex')
913 rv = p.wait() # not fatal
915 raise GitError, 'git rev-list returned error %d' % rv
def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs.  For now, every unique
       string in refs must resolve to a different commit or this
       function will fail."""
    result = []
    for ref in refs:
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
    return result
929 def rev_parse(committish, repo_dir=None):
930 """Resolve the full hash for 'committish', if it exists.
932 Should be roughly equivalent to 'git rev-parse'.
934 Returns the hex value of the hash if it is found, None if 'committish' does
935 not correspond to anything.
937 head = read_ref(committish, repo_dir=repo_dir)
939 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
942 pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
944 if len(committish) == 40:
946 hash = committish.decode('hex')
def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    # Only branch heads and tags may be updated through this helper.
    assert(refname.startswith('refs/heads/') \
           or refname.startswith('refs/tags/'))
    # newval/oldval are binary shas; 'git update-ref' takes hex strings.
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv(repo_dir))
    _git_wait('git update-ref', p)
def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert(refname.startswith('refs/'))
    cmd = ['git', 'update-ref', '-d', refname]
    if oldvalue:
        # Pass the expected old value so git can verify before deleting.
        cmd.append(oldvalue)
    p = subprocess.Popen(cmd, preexec_fn = _gitenv())
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die()."""
    global repodir
    if path:
        repodir = path
    if not repodir:
        # Fall back to $BUP_DIR, then the default ~/.bup location.
        repodir = os.environ.get('BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv())
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv())
    _git_wait('git config', p)
    p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
                         stdout=sys.stderr, preexec_fn = _gitenv())
    _git_wait('git config', p)
1016 def check_repo_or_die(path=None):
1017 """Make sure a bup repository exists, and abort if not.
1018 If the path to a particular repository was not specified, this function
1019 initializes the default repository automatically.
1023 os.stat(repo('objects/pack/.'))
1024 except OSError as e:
1025 if e.errno == errno.ENOENT:
1026 log('error: %r is not a bup repository; run "bup init"\n'
1030 log('error: %s\n' % e)
1036 """Get Git's version and ensure a usable version is installed.
1038 The returned version is formatted as an ordered tuple with each position
1039 representing a digit in the version tag. For example, the following tuple
1040 would represent version 1.6.6.9:
1042 ('1', '6', '6', '9')
1046 p = subprocess.Popen(['git', '--version'],
1047 stdout=subprocess.PIPE)
1048 gvs = p.stdout.read()
1049 _git_wait('git --version', p)
1050 m = re.match(r'git version (\S+.\S+)', gvs)
1052 raise GitError('git --version weird output: %r' % gvs)
1053 _ver = tuple(m.group(1).split('.'))
1054 needed = ('1','5', '3', '1')
1056 raise GitError('git version %s or higher is required; you have %s'
1057 % ('.'.join(needed), '.'.join(_ver)))
1061 def _git_wait(cmd, p):
1064 raise GitError('%s returned %d' % (cmd, rv))
def _git_capture(argv):
    """Run argv with GIT_DIR set and return its captured stdout.

    Raises GitError if the command exits nonzero.
    """
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv())
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r
class _AbortableIter:
    """Wrap an iterator so that any failure (or early disposal) triggers an
    abort callback exactly once."""
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def next(self):
        try:
            return self.it.next()
        except StopIteration as e:
            self.done = True
            raise
        except:
            # Any other error means the stream is in an unknown state.
            self.abort()
            raise

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()
class MissingObject(KeyError):
    """Raised when a requested object id is absent from the repository."""
    def __init__(self, id):
        # Keep the binary id available to callers catching this exception.
        self.id = id
        KeyError.__init__(self, 'object %r is missing' % id.encode('hex'))
1112 """Link to 'git cat-file' that is used to retrieve blob data."""
1113 def __init__(self, repo_dir = None):
1115 self.repo_dir = repo_dir
1116 wanted = ('1','5','6')
1119 log('warning: git version < %s; bup will be slow.\n'
1122 self.get = self._slow_get
1124 self.p = self.inprogress = None
1125 self.get = self._fast_get
1129 self.p.stdout.close()
1130 self.p.stdin.close()
1132 self.inprogress = None
1136 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1137 stdin=subprocess.PIPE,
1138 stdout=subprocess.PIPE,
1141 preexec_fn = _gitenv(self.repo_dir))
1143 def _fast_get(self, id):
1144 if not self.p or self.p.poll() != None:
1147 poll_result = self.p.poll()
1148 assert(poll_result == None)
1150 log('_fast_get: opening %r while %r is open\n'
1151 % (id, self.inprogress))
1152 assert(not self.inprogress)
1153 assert(id.find('\n') < 0)
1154 assert(id.find('\r') < 0)
1155 assert(not id.startswith('-'))
1156 self.inprogress = id
1157 self.p.stdin.write('%s\n' % id)
1158 self.p.stdin.flush()
1159 hdr = self.p.stdout.readline()
1160 if hdr.endswith(' missing\n'):
1161 self.inprogress = None
1162 raise MissingObject(id.decode('hex'))
1163 spl = hdr.split(' ')
1164 if len(spl) != 3 or len(spl[0]) != 40:
1165 raise GitError('expected blob, got %r' % spl)
1166 (hex, type, size) = spl
1168 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1169 onabort = self._abort)
1174 readline_result = self.p.stdout.readline()
1175 assert(readline_result == '\n')
1176 self.inprogress = None
1177 except Exception as e:
1181 def _slow_get(self, id):
1182 assert(id.find('\n') < 0)
1183 assert(id.find('\r') < 0)
1184 assert(id[0] != '-')
1185 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1188 p = subprocess.Popen(['git', 'cat-file', type, id],
1189 stdout=subprocess.PIPE,
1190 preexec_fn = _gitenv(self.repo_dir))
1191 for blob in chunkyreader(p.stdout):
1193 _git_wait('git cat-file', p)
1195 def _join(self, it):
1200 elif type == 'tree':
1201 treefile = ''.join(it)
1202 for (mode, name, sha) in tree_decode(treefile):
1203 for blob in self.join(sha.encode('hex')):
1205 elif type == 'commit':
1206 treeline = ''.join(it).split('\n')[0]
1207 assert(treeline.startswith('tree '))
1208 for blob in self.join(treeline[5:]):
1211 raise GitError('invalid object type %r: expected blob/tree/commit'
1215 """Generate a list of the content of all blobs that can be reached
1216 from an object. The hash given in 'id' must point to a blob, a tree
1217 or a commit. The content of all blobs that can be seen from trees or
1218 commits will be added to the list.
1221 for d in self._join(self.get(id)):
1223 except StopIteration:
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp
    if not repo_dir:
        repo_dir = repo()
    # Normalize so equivalent paths share one cached CatPipe.
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp
def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(n.startswith('refs/tags/'))
        name = n[10:]
        if not c in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags
1254 WalkItem = namedtuple('WalkItem', ['id', 'type', 'mode',
1255 'path', 'chunk_path', 'data'])
1256 # The path is the mangled path, and if an item represents a fragment
1257 # of a chunked file, the chunk_path will be the chunked subtree path
1258 # for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
1259 # chunked file will have a chunk_path of ['']. So some chunk subtree
1260 # of the file '/foo/bar/baz' might look like this:
1262 # item.path = ['foo', 'bar', 'baz.bup']
1263 # item.chunk_path = ['', '2d3115e', '016b097']
1264 # item.type = 'tree'
1268 def walk_object(cat_pipe, id,
1271 """Yield everything reachable from id via cat_pipe as a WalkItem,
1272 stopping whenever stop_at(id) returns true. Throw MissingObject
1273 if a hash encountered is missing from the repository, and don't
1274 read or return blob content in the data field unless include_data
1277 # Maintain the pending stack on the heap to avoid stack overflow
1278 pending = [(id, [], [], None)]
1280 id, parent_path, chunk_path, mode = pending.pop()
1281 if stop_at and stop_at(id):
1284 if (not include_data) and mode and stat.S_ISREG(mode):
1285 # If the object is a "regular file", then it's a leaf in
1286 # the graph, so we can skip reading the data if the caller
1287 # hasn't requested it.
1288 yield WalkItem(id=id, type='blob',
1289 chunk_path=chunk_path, path=parent_path,
1294 item_it = cat_pipe.get(id)
1295 type = item_it.next()
1296 if type not in ('blob', 'commit', 'tree'):
1297 raise Exception('unexpected repository object type %r' % type)
1299 # FIXME: set the mode based on the type when the mode is None
1300 if type == 'blob' and not include_data:
1301 # Dump data until we can ask cat_pipe not to fetch it
1302 for ignored in item_it:
1306 data = ''.join(item_it)
1308 yield WalkItem(id=id, type=type,
1309 chunk_path=chunk_path, path=parent_path,
1311 data=(data if include_data else None))
1313 if type == 'commit':
1314 commit_items = parse_commit(data)
1315 for pid in commit_items.parents:
1316 pending.append((pid, parent_path, chunk_path, mode))
1317 pending.append((commit_items.tree, parent_path, chunk_path,
1318 hashsplit.GIT_MODE_TREE))
1319 elif type == 'tree':
1320 for mode, name, ent_id in tree_decode(data):
1321 demangled, bup_type = demangle_name(name, mode)
1323 sub_path = parent_path
1324 sub_chunk_path = chunk_path + [name]
1326 sub_path = parent_path + [name]
1327 if bup_type == BUP_CHUNKED:
1328 sub_chunk_path = ['']
1330 sub_chunk_path = chunk_path
1331 pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,