1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, zlib, time, subprocess, struct, stat, re, tempfile
7 from bup.helpers import *
8 from bup import _helpers
14 home_repodir = os.path.expanduser('~/.bup')
17 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
18 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
class GitError(Exception):
    pass


def repo(sub = ''):
    """Get the path to the git repository or one of its subdirectories."""
    global repodir
    if not repodir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):
        repodir = gd

    return os.path.join(repodir, sub)
def auto_midx(objdir):
    main_exe = os.environ.get('BUP_MAIN_EXE') or sys.argv[0]
    args = [main_exe, 'midx', '--auto', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    if rv:
        add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
    else:
        return name
(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one
    of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
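
# A minimal usage sketch (illustrative, not part of the library): mangling and
# demangling round-trip. A chunked regular file gains '.bup'; a name that could
# be mistaken for a mangled one gains '.bupl':
#
#   mangle_name('foo', 0100644, 040000) == 'foo.bup'   # file stored as a tree
#   demangle_name('foo.bup')            == ('foo', BUP_CHUNKED)
#   demangle_name('foo.bupl')           == ('foo', BUP_NORMAL)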
def _encode_packobj(type, content):
    szout = ''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(1)
    yield szout
    yield z.compress(content)
    yield z.flush()
def _encode_looseobj(type, content):
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()
def _decode_looseobj(buf):
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find('\0')
    assert(i > 0)
    l = s[:i].split(' ')
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    assert(buf)
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
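
# Quick sanity sketch (illustrative only): the encoder and decoder above
# round-trip. As in Git's pack format, the first byte packs the object type
# into bits 4-6 plus the low 4 bits of the size; remaining size bits follow
# 7 per byte, high bit set while more bytes remain:
#
#   raw = ''.join(_encode_packobj('blob', 'hello'))
#   assert _decode_packobj(raw) == ('blob', 'hello')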
class PackIdx:
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx != None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash):
        """Return nonempty if the object exists in this index."""
        return hash and (self._idx_from_hash(hash) != None) and True or None

    def __len__(self):
        return int(self.fanout[255])

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = ord(hash[0])
        start = self.fanout[b1-1]   # range -1..254
        end = self.fanout[b1]       # range 0..255
        want = str(hash)
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end-start)/2
            v = self._idx_to_hash(mid)
            if v < want:
                start = mid+1
            elif v > want:
                end = mid
            else:  # got it!
                return mid
        return None
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.shatable = buffer(self.map, 256*4, nsha*24)

    def _ofs_from_idx(self, idx):
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*24+4 : idx*24+24])

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
        self.ofstable = buffer(self.map,
                               8 + 256*4 + nsha*20 + nsha*4,
                               nsha*4)
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        return ofs

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
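
# For reference, the .idx v2 layout that the offsets above walk through:
#
#   8 bytes    magic '\377tOc' plus version number 2
#   256*4      fanout table (cumulative object counts per first sha byte)
#   nsha*20    sorted SHA-1 table
#   nsha*4     CRC32 table
#   nsha*4     32-bit offset table (high bit set => index into 64-bit table)
#   ...        64-bit offset table, then pack/idx trailer checksums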

extract_bits = _helpers.extract_bits

241 """Wrapper which contains data from multiple index files.
242 Multiple index (.midx) files constitute a wrapper around index (.idx) files
243 and make it possible for bup to expand Git's indexing capabilities to vast
246 def __init__(self, filename):
248 self.force_keep = False
249 assert(filename.endswith('.midx'))
250 self.map = mmap_read(open(filename))
251 if str(self.map[0:4]) != 'MIDX':
252 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
253 self.force_keep = True
254 return self._init_failed()
255 ver = struct.unpack('!I', self.map[4:8])[0]
256 if ver < MIDX_VERSION:
257 log('Warning: ignoring old-style (v%d) midx %r\n'
259 self.force_keep = False # old stuff is boring
260 return self._init_failed()
261 if ver > MIDX_VERSION:
262 log('Warning: ignoring too-new (v%d) midx %r\n'
264 self.force_keep = True # new stuff is exciting
265 return self._init_failed()
267 self.bits = _helpers.firstword(self.map[8:12])
268 self.entries = 2**self.bits
269 self.fanout = buffer(self.map, 12, self.entries*4)
270 shaofs = 12 + self.entries*4
271 nsha = self._fanget(self.entries-1)
272 self.shalist = buffer(self.map, shaofs, nsha*20)
273 self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
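
    # The .midx layout implied by the reads above (a bup-specific format):
    #
    #   4 bytes        'MIDX' magic
    #   4 bytes        version number (MIDX_VERSION)
    #   4 bytes        self.bits
    #   2**bits * 4    fanout table over the top 'bits' of each sha
    #   nsha * 20      sorted sha table
    #   ...            '\0'-separated list of contributing .idx names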
    def _init_failed(self):
        self.bits = 0
        self.entries = 1
        self.fanout = buffer('\0\0\0\0')
        self.shalist = buffer('\0'*20)
        self.idxnames = []

    def _fanget(self, i):
        start = i*4
        s = self.fanout[start:start+4]
        return _helpers.firstword(s)

    def _get(self, i):
        return str(self.shalist[i*20:(i+1)*20])
    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches, _total_steps
        _total_searches += 1
        want = str(hash)
        el = extract_bits(want, self.bits)
        if el:
            start = self._fanget(el-1)
            startv = el << (32-self.bits)
        else:
            start = 0
            startv = 0
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1  # lookup table is a step
        hashv = _helpers.firstword(hash)
        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
        while start < end:
            _total_steps += 1
            #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
            mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
            #print '  %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
            v = self._get(mid)
            #print '    %08x' % self._num(v)
            if v < want:
                start = mid+1
                startv = _helpers.firstword(v)
            elif v > want:
                end = mid
                endv = _helpers.firstword(v)
            else:  # got it!
                return True
        return None
    def __iter__(self):
        for i in xrange(self._fanget(self.entries-1)):
            yield buffer(self.shalist, i*20, 20)

    def __len__(self):
        return int(self._fanget(self.entries-1))
_mpi_count = 0
class PackIdxList:
    def __init__(self, dir):
        global _mpi_count
        assert(_mpi_count == 0)  # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = {}
        self.packs = []
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            if p.exists(hash):
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return p.name
        return None
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies whether .midx files have been superseded (e.g.
        their contents are all contained in another, bigger .midx file) and
        removes the superseded files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        skip_midx = skip_midx or ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                for ix in self.packs:
                    if isinstance(ix, PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for f in os.listdir(self.dir):
                    full = os.path.join(self.dir, f)
                    if f.endswith('.midx') and not d.get(full):
                        mx = PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = 0
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     '  used by %s\n') % (n, mxf))
                                broken += 1
                        if not broken:
                            midxl.append(mx)
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
                for ix in midxl:
                    any = 0
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            d[ix.name] = ix
                            for name in ix.idxnames:
                                d[os.path.join(self.dir, name)] = ix
                            any += 1
                            break
                    if not any and not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        unlink(ix.name)
            for f in os.listdir(self.dir):
                full = os.path.join(self.dir, f)
                if f.endswith('.idx') and not d.get(full):
                    ix = open_idx(full)
                    d[full] = ix
        self.packs = list(set(d.values()))
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
430 """Insert an additional object in the list."""
434 """Remove all additional objects from the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = '%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
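
# Sanity sketch: this matches 'git hash-object'. For example, the empty blob
# hashes only the header 'blob 0\0':
#
#   calc_hash('blob', '').encode('hex')
#     == 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'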
def _shalist_sort_key(ent):
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):
        return name + '/'
    else:
        return name
def open_idx(filename):
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        else:
            return PackIdxV1(filename, f)
    elif filename.endswith('.midx'):
        return PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    total = sum(len(i) for i in idxlist)
    iters = (iter(i) for i in idxlist)
    heap = [(next(it), it) for it in iters]
    heapq.heapify(heap)
    count = 0
    last = None
    while heap:
        if (count % 10024) == 0:
            progress('Reading indexes: %.2f%% (%d/%d)\r'
                     % (count*100.0/total, count, total))
        (e, it) = heap[0]
        if e != last:
            yield e
            last = e
        count += 1
        e = next(it)
        if e:
            heapq.heapreplace(heap, (e, it))
        else:
            heapq.heappop(heap)
    if final_progress:
        log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
500 """Writes Git objects insid a pack file."""
501 def __init__(self, objcache_maker=None):
506 self.objcache_maker = objcache_maker
512 def _make_objcache(self):
513 if self.objcache == None:
514 if self.objcache_maker:
515 self.objcache = self.objcache_maker()
517 self.objcache = PackIdxList(repo('objects/pack'))
521 self._make_objcache()
522 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
523 self.file = os.fdopen(fd, 'w+b')
524 assert(name.endswith('.pack'))
525 self.filename = name[:-5]
526 self.file.write('PACK\0\0\0\2\0\0\0\0')
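            # 12-byte pack header: 'PACK' magic, version 2, then a 32-bit
            # object count that is still zero here; _end() seeks back and
            # patches in the real count before checksumming.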
    def _raw_write(self, datalist):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        f.write(oneblob)
        self.outbytes += len(oneblob)
        self.count += 1

    def _write(self, bin, type, content):
        if verbose:
            log('>')
        self._raw_write(_encode_packobj(type, content))
        return bin
    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def write(self, type, content):
        """Write an object in this pack file."""
        return self._write(calc_hash(type, content), type, content)
    def exists(self, id):
        """Return non-empty if an object is found in the object cache."""
        if not self.objcache:
            self._make_objcache()
        return self.objcache.exists(id)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        bin = calc_hash(type, content)
        if not self.exists(bin):
            self._write(bin, type, content)
            self.objcache.add(bin)
        return bin
    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        shalist = sorted(shalist, key = _shalist_sort_key)
        l = []
        for (mode,name,bin) in shalist:
            assert(mode)
            assert(mode[0] != '0')
            assert(name)
            assert(len(bin) == 20)
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))
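
    # Each tree entry serialized above follows Git's canonical form:
    # '<octal mode> <name>\0' followed by the raw 20-byte sha, with entries
    # sorted so that directory names compare as if they ended in '/'.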
    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))
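
    # The buffer built above matches Git's commit object layout:
    #
    #   tree <hex sha>
    #   parent <hex sha>            (omitted for a first commit)
    #   author <who> <date> <tz>
    #   committer <who> <date> <tz>
    #   <blank line>
    #   <message>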
    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
                                  msg)
        return commit
607 """Remove the pack file from disk."""
612 os.unlink(self.filename + '.pack')
    def _end(self):
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        while 1:
            b = f.read(65536)
            sum.update(b)
            if not b:
                break
        f.write(sum.digest())
        f.close()

        p = subprocess.Popen(['git', 'index-pack', '-v',
                              '--index-version=2',
                              self.filename + '.pack'],
                             preexec_fn = _gitenv,
                             stdout = subprocess.PIPE)
        out = p.stdout.read().strip()
        _git_wait('git index-pack', p)
        if not out:
            raise GitError('git index-pack produced no output')
        nameprefix = repo('objects/pack/%s' % out)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        auto_midx(repo('objects/pack'))
        return nameprefix
656 """Close the pack file and move it to its definitive path."""
def _git_date(date):
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
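
# e.g. _git_date(1234567890) -> '1234567890 -0800'; the timezone suffix
# depends on the local TZ setting, so it will vary between machines.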
def _gitenv():
    os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    if refname:
        argv += [refname]
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    else:
        return None
def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date,commit).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    assert(not ref.startswith('-'))
    opts = []
    if count:
        opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    commit = None
    for row in p.stdout:
        s = row.strip()
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        else:
            date = int(s)
            yield (date, commit)
    rv = p.wait()  # not fatal
    if rv:
        raise GitError, 'git rev-list returned error %d' % rv
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    for (date, commit) in rev_list(ref, count=1):
        return date
    raise GitError, 'no such commit %r' % ref
def rev_parse(committish):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish)
    if head:
        debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
        return head

    pL = PackIdxList(repo('objects/pack'))

    if len(committish) == 40:
        try:
            hash = committish.decode('hex')
        except TypeError:
            return None

        if pL.exists(hash):
            return hash

    return None
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    if not oldval:
        oldval = ''
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = os.environ.get('BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    guess_repo(path)
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            init_repo()
        else:
            log('error: %r is not a bup/git repository\n' % repo())
            sys.exit(15)
817 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
819 while ofs < len(buf):
820 z = buf[ofs:].find('\0')
822 spl = buf[ofs:ofs+z].split(' ', 1)
823 assert(len(spl) == 2)
824 sha = buf[ofs+z+1:ofs+z+1+20]
826 yield (spl[0], spl[1], sha)
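
# Illustrative only: given one raw tree entry, treeparse yields its pieces.
#
#   buf = '100644 hello.txt\0' + '\xaa'*20
#   list(treeparse(buf)) == [('100644', 'hello.txt', '\xaa'*20)]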
831 """Get Git's version and ensure a usable version is installed.
833 The returned version is formatted as an ordered tuple with each position
834 representing a digit in the version tag. For example, the following tuple
835 would represent version 1.6.6.9:
841 p = subprocess.Popen(['git', '--version'],
842 stdout=subprocess.PIPE)
843 gvs = p.stdout.read()
844 _git_wait('git --version', p)
845 m = re.match(r'git version (\S+.\S+)', gvs)
847 raise GitError('git --version weird output: %r' % gvs)
848 _ver = tuple(m.group(1).split('.'))
849 needed = ('1','5', '3', '1')
851 raise GitError('git version %s or higher is required; you have %s'
852 % ('.'.join(needed), '.'.join(_ver)))
def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%s returned %d' % (cmd, rv))
def _git_capture(argv):
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r
class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def next(self):
        try:
            return self.it.next()
        except StopIteration, e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()
901 """Link to 'git cat-file' that is used to retrieve blob data."""
904 wanted = ('1','5','6')
907 log('warning: git version < %s; bup will be slow.\n'
910 self.get = self._slow_get
912 self.p = self.inprogress = None
913 self.get = self._fast_get
917 self.p.stdout.close()
920 self.inprogress = None
924 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
925 stdin=subprocess.PIPE,
926 stdout=subprocess.PIPE,
928 preexec_fn = _gitenv)
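
    # 'git cat-file --batch' reads one object name per line on stdin and
    # replies with '<sha> <type> <size>\n', then the raw object content,
    # then a trailing newline; _fast_get() parses exactly that framing.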
    def _fast_get(self, id):
        if not self.p or self.p.poll() != None:
            self._restart()
        assert(self.p)
        assert(self.p.poll() == None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open'
                % (id, self.inprogress))
        assert(not self.inprogress)
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        try:
            yield type
            for blob in it:
                yield blob
            assert(self.p.stdout.readline() == '\n')
            self.inprogress = None
        except Exception, e:
            it.abort()
            raise
    def _slow_get(self, id):
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        yield type

        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)
    def _join(self, it):
        type = it.next()
        if type == 'blob':
            for blob in it:
                yield blob
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif type == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % type)
999 """Generate a list of the content of all blobs that can be reached
1000 from an object. The hash given in 'id' must point to a blob, a tree
1001 or a commit. The content of all blobs that can be seen from trees or
1002 commits will be added to the list.
1005 for d in self._join(self.get(id)):
1007 except StopIteration: