1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, zlib, time, subprocess, struct, stat, re, tempfile
7 from bup.helpers import *
8 from bup import _helpers
14 home_repodir = os.path.expanduser('~/.bup')
17 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
18 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
class GitError(Exception):
    pass


def repo(sub = ''):
    """Get the path to the git repository or one of its subdirectories."""
    global repodir
    if not repodir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):
        repodir = gd

    return os.path.join(repodir, sub)

def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
    else:
        return name

(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one
    of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)

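# Round-trip sketch (illustrative, not part of the original module): a regular
# file that bup stored as a git tree (i.e. it was chunked) gets '.bup'
# appended, while a file whose real name already ends in '.bup' gets '.bupl'
# so it cannot be mistaken for a chunked one:
#
#   mangle_name('foo', stat.S_IFREG | 0644, stat.S_IFDIR | 0755)        => 'foo.bup'
#   demangle_name('foo.bup')                                            => ('foo', BUP_CHUNKED)
#   mangle_name('notes.bup', stat.S_IFREG | 0644, stat.S_IFREG | 0644)  => 'notes.bupl'
#   demangle_name('notes.bupl')                                         => ('notes', BUP_NORMAL)
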
def _encode_packobj(type, content):
    szout = ''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(1)
    yield szout
    yield z.compress(content)
    yield z.flush()

def _encode_looseobj(type, content):
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()

def _decode_looseobj(buf):
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find('\0')
    assert(i > 0)
    l = s[:i].split(' ')
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)

def _decode_packobj(buf):
    assert(buf)
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
        if not (c & 0x80):
            break
    return (type, zlib.decompress(buf[i+1:]))

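# Round-trip sketch (illustrative, not part of the original module): the pack
# encoding is a variable-length header (object type in bits 4-6, size in the
# low 4 bits plus 7 more bits per continuation byte) followed by the
# zlib-compressed content:
#
#   packed = ''.join(_encode_packobj('blob', 'hello world'))
#   _decode_packobj(packed)   => ('blob', 'hello world')
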
133 """Object representation of a Git pack index file."""
134 def __init__(self, filename):
136 self.idxnames = [self.name]
137 self.map = mmap_read(open(filename))
138 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
139 self.fanout = list(struct.unpack('!256I',
140 str(buffer(self.map, 8, 256*4))))
141 self.fanout.append(0) # entry "-1"
142 nsha = self.fanout[255]
143 self.ofstable = buffer(self.map,
144 8 + 256*4 + nsha*20 + nsha*4,
146 self.ofs64table = buffer(self.map,
147 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        return ofs

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = ord(hash[0])
        start = self.fanout[b1-1]  # range -1..254
        end = self.fanout[b1]      # range 0..255
        buf = buffer(self.map, 8 + 256*4, end*20)
        want = str(hash)
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end-start)/2
            v = str(buf[mid*20:(mid+1)*20])
            if v < want:
                start = mid+1
            elif v > want:
                end = mid
            else: # got it!
                return mid
        return None

    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx != None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash):
        """Return nonempty if the object exists in this index."""
        return hash and (self._idx_from_hash(hash) != None) and True or None

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)

    def __len__(self):
        return int(self.fanout[255])

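# A PackIdx answers "is this 20-byte binary sha1 in the pack, and at what
# offset?" using the 256-entry fanout table plus a binary search over the
# sorted sha1 list.  A minimal sketch, assuming a hypothetical .idx path
# (see open_idx() below):
#
#   # pi = open_idx('/path/to/objects/pack/pack-1234abcd.idx')  # hypothetical
#   # if pi.exists(sha):                 # sha is a 20-byte binary digest
#   #     ofs = pi.find_offset(sha)      # byte offset inside the .pack file
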
extract_bits = _helpers.extract_bits

202 """Wrapper which contains data from multiple index files.
203 Multiple index (.midx) files constitute a wrapper around index (.idx) files
204 and make it possible for bup to expand Git's indexing capabilities to vast
207 def __init__(self, filename):
209 self.force_keep = False
210 assert(filename.endswith('.midx'))
211 self.map = mmap_read(open(filename))
212 if str(self.map[0:4]) != 'MIDX':
213 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
214 self.force_keep = True
215 return self._init_failed()
216 ver = struct.unpack('!I', self.map[4:8])[0]
217 if ver < MIDX_VERSION:
218 log('Warning: ignoring old-style (v%d) midx %r\n'
220 self.force_keep = False # old stuff is boring
221 return self._init_failed()
222 if ver > MIDX_VERSION:
223 log('Warning: ignoring too-new (v%d) midx %r\n'
225 self.force_keep = True # new stuff is exciting
226 return self._init_failed()
228 self.bits = _helpers.firstword(self.map[8:12])
229 self.entries = 2**self.bits
230 self.fanout = buffer(self.map, 12, self.entries*4)
231 shaofs = 12 + self.entries*4
232 nsha = self._fanget(self.entries-1)
233 self.shalist = buffer(self.map, shaofs, nsha*20)
234 self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
    def _init_failed(self):
        self.bits = 0
        self.entries = 1
        self.fanout = buffer('\0\0\0\0')
        self.shalist = buffer('\0'*20)
        self.idxnames = []

    def _fanget(self, i):
        start = i*4
        s = self.fanout[start:start+4]
        return _helpers.firstword(s)

    def _get(self, i):
        return str(self.shalist[i*20:(i+1)*20])

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches, _total_steps
        _total_searches += 1
        want = str(hash)
        el = extract_bits(want, self.bits)
        if el:
            start = self._fanget(el-1)
            startv = el << (32-self.bits)
        else:
            start = 0
            startv = 0
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1  # lookup table is a step
        hashv = _helpers.firstword(hash)
        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
        while start < end:
            _total_steps += 1
            #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
            mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
            #print '  %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
            v = self._get(mid)
            #print '    %08x' % self._num(v)
            if v < want:
                start = mid+1
                startv = _helpers.firstword(v)
            elif v > want:
                end = mid
                endv = _helpers.firstword(v)
            else: # got it!
                return True
        return None

    def __iter__(self):
        for i in xrange(self._fanget(self.entries-1)):
            yield buffer(self.shalist, i*20, 20)

    def __len__(self):
        return int(self._fanget(self.entries-1))

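# A .midx file merges the sha1 lists of several .idx files into one sorted,
# mmap'd list so a membership test only has to consult a single file.  A
# PackMidx is used through the same interface as a PackIdx; a minimal sketch,
# assuming a hypothetical midx path (see open_idx() below):
#
#   # midx = open_idx('/path/to/objects/pack/bup.midx')   # hypothetical path
#   # midx.exists(some_20_byte_binary_sha)                # True or None
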
_mpi_count = 0
class PackIdxList:
    def __init__(self, dir):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = {}
        self.packs = []
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            if p.exists(hash):
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of their
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        skip_midx = skip_midx or ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                for ix in self.packs:
                    if isinstance(ix, PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for f in os.listdir(self.dir):
                    full = os.path.join(self.dir, f)
                    if f.endswith('.midx') and not d.get(full):
                        mx = PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = 0
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     '  used by %s\n') % (n, mxf))
                                broken += 1
                        if not broken:
                            midxl.append(mx)
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
                for ix in midxl:
                    any = 0
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            d[ix.name] = ix
                            for name in ix.idxnames:
                                d[os.path.join(self.dir, name)] = ix
                            any += 1
                            break
                    if not any and not ix.force_keep:
                        log('midx: removing redundant: %s\n'
                            % os.path.basename(ix.name))
                        unlink(ix.name)
            for f in os.listdir(self.dir):
                full = os.path.join(self.dir, f)
                if f.endswith('.idx') and not d.get(full):
                    ix = PackIdx(full)
                    d[full] = ix
            self.packs = list(set(d.values()))
        log('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

391 """Insert an additional object in the list."""
395 """Remove all additional objects from the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = '%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()

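# The hash is the same one 'git hash-object' would compute: the sha1 of
# "<type> <size>\0" followed by the content.  For example (sketch, not part
# of the original module), the well-known empty-blob id:
#
#   calc_hash('blob', '').encode('hex')
#       => 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
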
def _shalist_sort_key(ent):
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):
        return name + '/'
    else:
        return name

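# Git sorts tree entries as if directory names had a trailing '/', so a file
# 'foo.c' sorts before a directory 'foo'.  A minimal sketch (not part of the
# original module):
#
#   sorted([('100644', 'foo.c', 'x'*20), ('40000', 'foo', 'y'*20)],
#          key=_shalist_sort_key)
#       => [('100644', 'foo.c', ...), ('40000', 'foo', ...)]
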
def open_idx(filename):
    if filename.endswith('.idx'):
        return PackIdx(filename)
    elif filename.endswith('.midx'):
        return PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')

def idxmerge(idxlist):
    """Generate a list of all the objects reachable in a PackIdxList."""
    total = sum(len(i) for i in idxlist)
    iters = (iter(i) for i in idxlist)
    heap = [(next(it), it) for it in iters]
    heapq.heapify(heap)
    count = 0
    last = None
    while heap:
        if (count % 10024) == 0:
            progress('Reading indexes: %.2f%% (%d/%d)\r'
                     % (count*100.0/total, count, total))
        (e, it) = heap[0]
        if e != last:
            yield e
            last = e
        count += 1
        e = next(it)
        if e:
            heapq.heapreplace(heap, (e, it))
        else:
            heapq.heappop(heap)
    log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))

450 """Writes Git objects insid a pack file."""
451 def __init__(self, objcache_maker=None):
456 self.objcache_maker = objcache_maker
462 def _make_objcache(self):
463 if self.objcache == None:
464 if self.objcache_maker:
465 self.objcache = self.objcache_maker()
467 self.objcache = PackIdxList(repo('objects/pack'))
471 self._make_objcache()
472 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
473 self.file = os.fdopen(fd, 'w+b')
474 assert(name.endswith('.pack'))
475 self.filename = name[:-5]
476 self.file.write('PACK\0\0\0\2\0\0\0\0')
    def _raw_write(self, datalist):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        f.write(oneblob)
        self.outbytes += len(oneblob)
        self.count += 1

    def _write(self, bin, type, content):
        if verbose:
            log('>')
        self._raw_write(_encode_packobj(type, content))
        return bin

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def write(self, type, content):
        """Write an object in this pack file."""
        return self._write(calc_hash(type, content), type, content)

    def exists(self, id):
        """Return non-empty if an object is found in the object cache."""
        if not self.objcache:
            self._make_objcache()
        return self.objcache.exists(id)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        bin = calc_hash(type, content)
        if not self.exists(bin):
            self._write(bin, type, content)
            self.objcache.add(bin)
        return bin

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        shalist = sorted(shalist, key = _shalist_sort_key)
        l = []
        for (mode,name,bin) in shalist:
            assert(mode)
            assert(mode != '0')
            assert(mode[0] != '0')
            assert(name)
            assert(len(bin) == 20)
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))

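    # Each tree entry uses git's raw format, "<octal mode> <name>\0" followed
    # by the 20-byte binary sha1, and the entries must already be in git's
    # sort order (see _shalist_sort_key).  A minimal sketch (not part of the
    # original module):
    #
    #   # w = PackWriter()
    #   # blob_id = w.new_blob('hello\n')
    #   # tree_id = w.new_tree([('100644', 'hello.txt', blob_id)])
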
    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, msg):
        """Create a commit object in the pack."""
        now = time.time()
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, now, userline, now,
                                  msg)
        return commit

558 """Remove the pack file from disk."""
563 os.unlink(self.filename + '.pack')
567 if not f: return None
571 # update object count
573 cp = struct.pack('!i', self.count)
577 # calculate the pack sha1sum
584 f.write(sum.digest())
588 p = subprocess.Popen(['git', 'index-pack', '-v',
590 self.filename + '.pack'],
591 preexec_fn = _gitenv,
592 stdout = subprocess.PIPE)
593 out = p.stdout.read().strip()
594 _git_wait('git index-pack', p)
596 raise GitError('git index-pack produced no output')
597 nameprefix = repo('objects/pack/%s' % out)
598 if os.path.exists(self.filename + '.map'):
599 os.unlink(self.filename + '.map')
600 os.rename(self.filename + '.pack', nameprefix + '.pack')
601 os.rename(self.filename + '.idx', nameprefix + '.idx')
605 """Close the pack file and move it to its definitive path."""
def _git_date(date):
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))


def _gitenv():
    os.environ['GIT_DIR'] = os.path.abspath(repo())

def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    if refname:
        argv += [refname]
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))

def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    else:
        return None

def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date,hash).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    assert(not ref.startswith('-'))
    opts = []
    if count:
        opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    commit = None
    for row in p.stdout:
        s = row.strip()
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        else:
            date = int(s)
            yield (date, commit)
    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)

def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    for (date, commit) in rev_list(ref, count=1):
        return date
    raise GitError('no such commit %r' % ref)

def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    if not oldval:
        oldval = ''
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)

def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = os.environ.get('BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser('~/.bup')

def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)

def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    guess_repo(path)
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            init_repo()
        else:
            log('error: %r is not a bup/git repository\n' % repo())
            sys.exit(15)

739 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
741 while ofs < len(buf):
742 z = buf[ofs:].find('\0')
744 spl = buf[ofs:ofs+z].split(' ', 1)
745 assert(len(spl) == 2)
746 sha = buf[ofs+z+1:ofs+z+1+20]
748 yield (spl[0], spl[1], sha)
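# 'buf' is the raw (uncompressed) content of a git tree object.  A minimal
# sketch (not part of the original module):
#
#   buf = '100644 hello.txt\0' + '\x01'*20
#   [(mode, name) for (mode, name, sha) in treeparse(buf)]
#       => [('100644', 'hello.txt')]
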
753 """Get Git's version and ensure a usable version is installed.
755 The returned version is formatted as an ordered tuple with each position
756 representing a digit in the version tag. For example, the following tuple
757 would represent version 1.6.6.9:
763 p = subprocess.Popen(['git', '--version'],
764 stdout=subprocess.PIPE)
765 gvs = p.stdout.read()
766 _git_wait('git --version', p)
767 m = re.match(r'git version (\S+.\S+)', gvs)
769 raise GitError('git --version weird output: %r' % gvs)
770 _ver = tuple(m.group(1).split('.'))
771 needed = ('1','5', '3', '1')
773 raise GitError('git version %s or higher is required; you have %s'
774 % ('.'.join(needed), '.'.join(_ver)))
def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%s returned %d' % (cmd, rv))


def _git_capture(argv):
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r

class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def next(self):
        try:
            return self.it.next()
        except StopIteration, e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    def abort(self):
        """Abort iteration and call the abort callback (onabort), if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()

823 """Link to 'git cat-file' that is used to retrieve blob data."""
826 wanted = ('1','5','6')
829 log('warning: git version < %s; bup will be slow.\n'
832 self.get = self._slow_get
834 self.p = self.inprogress = None
835 self.get = self._fast_get
839 self.p.stdout.close()
842 self.inprogress = None
846 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
847 stdin=subprocess.PIPE,
848 stdout=subprocess.PIPE,
850 preexec_fn = _gitenv)
852 def _fast_get(self, id):
853 if not self.p or self.p.poll() != None:
856 assert(self.p.poll() == None)
858 log('_fast_get: opening %r while %r is open'
859 % (id, self.inprogress))
860 assert(not self.inprogress)
861 assert(id.find('\n') < 0)
862 assert(id.find('\r') < 0)
865 self.p.stdin.write('%s\n' % id)
866 hdr = self.p.stdout.readline()
867 if hdr.endswith(' missing\n'):
868 raise KeyError('blob %r is missing' % id)
870 if len(spl) != 3 or len(spl[0]) != 40:
871 raise GitError('expected blob, got %r' % spl)
872 (hex, type, size) = spl
874 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
875 onabort = self._abort)
880 assert(self.p.stdout.readline() == '\n')
881 self.inprogress = None
    def _slow_get(self, id):
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        yield type

        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        type = it.next()
        if type == 'blob':
            for blob in it:
                yield blob
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif type == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % type)

920 """Generate a list of the content of all blobs that can be reached
921 from an object. The hash given in 'id' must point to a blob, a tree
922 or a commit. The content of all blobs that can be seen from trees or
923 commits will be added to the list.
926 for d in self._join(self.get(id)):
928 except StopIteration:
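
# CatPipe.join() streams the content of every blob reachable from the given
# object: a blob yields its own data, while a tree or commit is walked
# recursively via treeparse().  A minimal sketch (not part of the original
# module):
#
#   # cp = CatPipe()
#   # data = ''.join(cp.join(some_hex_object_id))   # hypothetical hex sha1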