1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, zlib, time, subprocess, struct, stat, re, tempfile
7 from bup.helpers import *
8 from bup import _helpers
verbose = 0
ignore_midx = 0
home_repodir = os.path.expanduser('~/.bup')
repodir = None

_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

_total_searches = 0
_total_steps = 0

class GitError(Exception):
    pass

27 """Get the path to the git repository or one of its subdirectories."""
30 raise GitError('You should call check_repo_or_die()')
32 # If there's a .git subdirectory, then the actual repo is in there.
33 gd = os.path.join(repodir, '.git')
34 if os.path.exists(gd):
37 return os.path.join(repodir, sub)
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
    else:
        return name

(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is one
    of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name().
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)

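# Round-trip sketch (illustrative values only):
#
#   mangle_name('foo', 0100644, 040000)       # => 'foo.bup' (stored chunked)
#   mangle_name('foo.bup', 0100644, 0100644)  # => 'foo.bupl'
#   demangle_name('foo.bup')   # => ('foo', BUP_CHUNKED)
#   demangle_name('foo.bupl')  # => ('foo', BUP_NORMAL)
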
def _encode_packobj(type, content):
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    z = zlib.compressobj(1)
    yield z.compress(content)
    yield z.flush()

def _encode_looseobj(type, content):
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()

def _decode_looseobj(buf):
    s = zlib.decompress(buf)
    i = s.find('\0')
    l = s[:i].split(' ')
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)

def _decode_packobj(buf):
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))

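# Worked example (illustrative): a 100-byte blob gets the two header bytes
# 0xb4 0x06 -- 0xb4 = continuation flag (0x80) | type 'blob' (3 << 4) | low
# size nibble (100 & 0x0f == 4), and 0x06 carries the remaining size bits
# (100 >> 4 == 6).  So, assuming the matching size-varint loop in
# _encode_packobj above:
#
#   _decode_packobj('\xb4\x06' + zlib.compress('x' * 100))
#   # => ('blob', 'x' * 100)
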
131 """Object representation of a Git pack index file."""
132 def __init__(self, filename):
134 self.map = mmap_read(open(filename))
135 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
136 self.fanout = list(struct.unpack('!256I',
137 str(buffer(self.map, 8, 256*4))))
138 self.fanout.append(0) # entry "-1"
139 nsha = self.fanout[255]
140 self.ofstable = buffer(self.map,
141 8 + 256*4 + nsha*20 + nsha*4,
143 self.ofs64table = buffer(self.map,
144 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            # 64-bit entries are 8 bytes, so they need '!Q', not '!I'
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        return ofs

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        b1 = ord(hash[0])
        start = self.fanout[b1-1]  # range -1..254
        end = self.fanout[b1]  # range 0..255
        buf = buffer(self.map, 8 + 256*4, end*20)
        _total_steps += 1  # lookup table is a step
        while start < end:
            mid = start + (end-start)/2
            v = str(buf[mid*20:(mid+1)*20])

    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx != None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash):
        """Return nonempty if the object exists in this index."""
        return hash and (self._idx_from_hash(hash) != None) and True or None

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)

    def __len__(self):
        return int(self.fanout[255])

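# The v2 .idx layout mapped above is: an 8-byte header ('\377tOc' + version),
# a 256-entry fanout table (4 bytes each), nsha sorted 20-byte shas, nsha
# crc32s (4 bytes each), nsha 32-bit offsets, then 8-byte entries for offsets
# too large for 32 bits.  Usage sketch (hypothetical path and sha):
#
#   ix = PackIdx(repo('objects/pack/pack-1234.idx'))
#   if ix.exists(sha):
#       print ix.find_offset(sha)
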
extract_bits = _helpers.extract_bits

199 """Wrapper which contains data from multiple index files.
200 Multiple index (.midx) files constitute a wrapper around index (.idx) files
201 and make it possible for bup to expand Git's indexing capabilities to vast
    def __init__(self, filename):
        self.name = filename
        assert(filename.endswith('.midx'))
        self.map = mmap_read(open(filename))
        if str(self.map[0:8]) == 'MIDX\0\0\0\1':
            log('Warning: ignoring old-style midx %r\n' % filename)
            self.fanout = buffer('\0\0\0\0')
            self.shalist = buffer('\0'*20)
            self.idxnames = []
            return
        assert(str(self.map[0:8]) == 'MIDX\0\0\0\2')
        self.bits = _helpers.firstword(self.map[8:12])
        self.entries = 2**self.bits
        self.fanout = buffer(self.map, 12, self.entries*4)
        shaofs = 12 + self.entries*4
        nsha = self._fanget(self.entries-1)
        self.shalist = buffer(self.map, shaofs, nsha*20)
        self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')

    def _fanget(self, i):
        start = i*4
        s = self.fanout[start:start+4]
        return _helpers.firstword(s)

    def _get(self, i):
        return str(self.shalist[i*20:(i+1)*20])

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches, _total_steps
        _total_searches += 1
        want = str(hash)
        el = extract_bits(want, self.bits)
        if el:
            start = self._fanget(el-1)
            startv = el << (32-self.bits)
        else:
            start = 0
            startv = 0
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1  # lookup table is a step
        hashv = _helpers.firstword(hash)
        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
        while start < end:
            _total_steps += 1
            #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
            mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
            #print '  %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
            v = self._get(mid)
            #print '    %08x' % self._num(v)
            if v < want:
                start = mid+1
                startv = _helpers.firstword(v)
            elif v > want:
                end = mid
                endv = _helpers.firstword(v)
            else: # got it!
                return True
        return None

    def __iter__(self):
        for i in xrange(self._fanget(self.entries-1)):
            yield buffer(self.shalist, i*20, 20)

    def __len__(self):
        return int(self._fanget(self.entries-1))

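# Note: PackMidx.exists() above is an interpolation search rather than a
# plain binary search.  Because shas are uniformly distributed, the probe
# point
#   mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
# lands close to the target, converging in roughly O(log log n) steps on
# average instead of O(log n).
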
class PackIdxList:
    def __init__(self, dir):
        assert(_mpi_count == 0)  # these things suck tons of VM; don't waste it

    def __del__(self):
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            if p.exists(hash):
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return p.name
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method checks whether any .midx files were superseded (e.g. all
        of their contents are in another, bigger .midx file) and removes the
        superseded files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        skip_midx = skip_midx or ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                for ix in self.packs:
                    if isinstance(ix, PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for f in os.listdir(self.dir):
                    full = os.path.join(self.dir, f)
                    if f.endswith('.midx') and not d.get(full):
                        mx = PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = 0
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     '  used by %s\n') % (n, mxf))
                                broken += 1
                        if not broken:
                            midxl.append(mx)
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
                for ix in midxl:
                    any = 0
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            d[ix.name] = ix
                            for name in ix.idxnames:
                                d[os.path.join(self.dir, name)] = ix
                            any += 1
                            break
                    if not any:
                        log('midx: removing redundant: %s\n'
                            % os.path.basename(ix.name))
                        unlink(ix.name)
            for f in os.listdir(self.dir):
                full = os.path.join(self.dir, f)
                if f.endswith('.idx') and not d.get(full):
                    ix = PackIdx(full)
                    d[full] = ix
            self.packs = list(set(d.values()))
        log('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

373 """Insert an additional object in the list."""
377 """Remove all additional objects from the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = '%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()

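# Example (illustrative): a git object id is the sha1 of the header plus the
# content, so the 6-byte blob 'hello\n' hashes to the same id git itself
# would compute:
#
#   calc_hash('blob', 'hello\n').encode('hex')
#   # => 'ce013625030ba8dba906f756967f9e9ca394464a'
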
def _shalist_sort_key(ent):
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):
        return name + '/'
    else:
        return name

def idxmerge(idxlist):
    """Generate a list of all the objects reachable in a PackIdxList."""
    total = sum(len(i) for i in idxlist)
    iters = (iter(i) for i in idxlist)
    heap = [(next(it), it) for it in iters]
    heapq.heapify(heap)
    count = 0
    last = None
    while heap:
        if (count % 10024) == 0:
            progress('Reading indexes: %.2f%% (%d/%d)\r'
                     % (count*100.0/total, count, total))
        (e, it) = heap[0]
        if e != last:
            yield e
            last = e
        count += 1
        e = next(it)
        if e:
            heapq.heapreplace(heap, (e, it))
        else:
            heapq.heappop(heap)
    log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))

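# idxmerge() is a k-way heap merge: the heap holds one (next_sha, iterator)
# pair per index, each step pops the globally smallest sha and advances only
# that iterator, and the 'last' check drops shas that appear in more than one
# index.  Sketch (hypothetical .idx paths):
#
#   for sha in idxmerge([PackIdx('a.idx'), PackIdx('b.idx')]):
#       ...
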
423 """Writes Git objects insid a pack file."""
424 def __init__(self, objcache_maker=None):
429 self.objcache_maker = objcache_maker
435 def _make_objcache(self):
436 if self.objcache == None:
437 if self.objcache_maker:
438 self.objcache = self.objcache_maker()
440 self.objcache = PackIdxList(repo('objects/pack'))
444 self._make_objcache()
445 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
446 self.file = os.fdopen(fd, 'w+b')
447 assert(name.endswith('.pack'))
448 self.filename = name[:-5]
449 self.file.write('PACK\0\0\0\2\0\0\0\0')
    def _raw_write(self, datalist):
        self._open()
        f = self.file
        # in case we get interrupted (e.g. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        f.write(oneblob)
        self.outbytes += len(oneblob)
        self.count += 1

    def _write(self, bin, type, content):
        if verbose:
            log('>')
        self._raw_write(_encode_packobj(type, content))
        return bin

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def write(self, type, content):
        """Write an object in this pack file."""
        return self._write(calc_hash(type, content), type, content)

    def exists(self, id):
        """Return non-empty if an object is found in the object cache."""
        if not self.objcache:
            self._make_objcache()
        return self.objcache.exists(id)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        bin = calc_hash(type, content)
        if not self.exists(bin):
            self._write(bin, type, content)
            self.objcache.add(bin)
        return bin

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        shalist = sorted(shalist, key = _shalist_sort_key)
        l = []
        for (mode,name,bin) in shalist:
            assert(mode[0] != '0')
            assert(len(bin) == 20)
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))

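    # Tree encoding example (illustrative): a file 'hello' with octal mode
    # string '100644' and a 20-byte binary sha becomes the raw entry
    # '100644 hello\0' + sha; _shalist_sort_key() above reproduces git's rule
    # of sorting directory names as if they ended with '/'.
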
    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, msg):
        """Create a commit object in the pack."""
        now = time.time()
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, now, userline, now,
                                  msg)
        return commit

531 """Remove the pack file from disk."""
536 os.unlink(self.filename + '.pack')
    def _end(self):
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        for b in chunkyreader(f):
            sum.update(b)
        f.write(sum.digest())
        f.close()

        p = subprocess.Popen(['git', 'index-pack', '-v',
                              self.filename + '.pack'],
                             preexec_fn = _gitenv,
                             stdout = subprocess.PIPE)
        out = p.stdout.read().strip()
        _git_wait('git index-pack', p)
        if not out:
            raise GitError('git index-pack produced no output')
        nameprefix = repo('objects/pack/%s' % out)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        return nameprefix

578 """Close the pack file and move it to its definitive path."""
def _git_date(date):
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))

def _gitenv():
    os.environ['GIT_DIR'] = os.path.abspath(repo())

def list_refs(refname = None):
    """Generate a list of tuples in the form (refname, hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    if refname:
        argv += [refname]
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))

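# Usage sketch (hypothetical ref):
#
#   for (name, sha) in list_refs('refs/heads/master'):
#       print name, sha.encode('hex')
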
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    else:
        return None

def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date, hash).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    assert(not ref.startswith('-'))
    opts = []
    if count:
        opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    commit = None
    for row in p.stdout:
        s = row.strip()
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        else:
            date = int(s)
            yield (date, commit)
    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)

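# Usage sketch (hypothetical ref): walk the ten newest commits on a branch:
#
#   for (date, commit) in rev_list('refs/heads/master', count=10):
#       print time.ctime(date), commit.encode('hex')
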
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    for (date, commit) in rev_list(ref, count=1):
        return date
    raise GitError('no such commit %r' % ref)

def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)

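# Usage sketch (hypothetical values; the branch must already exist, since
# both ids are passed as binary shas):
#
#   old = read_ref('refs/heads/master')
#   update_ref('refs/heads/master', new_commit_sha, old)
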
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but does not fail if
    a repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = os.environ.get('BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser('~/.bup')

def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)

def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    guess_repo(path)
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            init_repo()
        else:
            log('error: %r is not a bup/git repository\n' % repo())
            sys.exit(15)

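# Typical startup sequence (sketch): most callers only need
# check_repo_or_die(), which chains guess_repo() and, for the default
# ~/.bup location only, init_repo():
#
#   os.environ['BUP_DIR'] = '/path/to/repo'   # hypothetical
#   check_repo_or_die()
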
712 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
714 while ofs < len(buf):
715 z = buf[ofs:].find('\0')
717 spl = buf[ofs:ofs+z].split(' ', 1)
718 assert(len(spl) == 2)
719 sha = buf[ofs+z+1:ofs+z+1+20]
721 yield (spl[0], spl[1], sha)
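# Example (illustrative): treeparse() is the inverse of
# PackWriter.new_tree(), so for the raw buffer '100644 hello\0' + sha it
# yields ('100644', 'hello', sha).
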
726 """Get Git's version and ensure a usable version is installed.
728 The returned version is formatted as an ordered tuple with each position
729 representing a digit in the version tag. For example, the following tuple
730 would represent version 1.6.6.9:
736 p = subprocess.Popen(['git', '--version'],
737 stdout=subprocess.PIPE)
738 gvs = p.stdout.read()
739 _git_wait('git --version', p)
740 m = re.match(r'git version (\S+.\S+)', gvs)
742 raise GitError('git --version weird output: %r' % gvs)
743 _ver = tuple(m.group(1).split('.'))
744 needed = ('1','5', '3', '1')
746 raise GitError('git version %s or higher is required; you have %s'
747 % ('.'.join(needed), '.'.join(_ver)))
def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%s returned %d' % (cmd, rv))

def _git_capture(argv):
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r

class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort

    def next(self):
        try:
            return self.it.next()
        except StopIteration, e:
            self.done = True
            raise

    def abort(self):
        """Abort iteration and call the onabort callback, if needed."""

796 """Link to 'git cat-file' that is used to retrieve blob data."""
799 wanted = ('1','5','6')
802 log('warning: git version < %s; bup will be slow.\n'
805 self.get = self._slow_get
807 self.p = self.inprogress = None
808 self.get = self._fast_get
812 self.p.stdout.close()
815 self.inprogress = None
819 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
820 stdin=subprocess.PIPE,
821 stdout=subprocess.PIPE,
823 preexec_fn = _gitenv)
    def _fast_get(self, id):
        if not self.p or self.p.poll() != None:
            self._restart()
        assert(self.p)
        assert(self.p.poll() == None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open'
                % (id, self.inprogress))
        assert(not self.inprogress)
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        try:
            yield type
            for blob in it:
                yield blob
            assert(self.p.stdout.readline() == '\n')
            self.inprogress = None
        except Exception, e:
            it.abort()
            raise

    def _slow_get(self, id):
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        yield type

        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        type = it.next()
        if type == 'blob':
            for blob in it:
                yield blob
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif type == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % type)

893 """Generate a list of the content of all blobs that can be reached
894 from an object. The hash given in 'id' must point to a blob, a tree
895 or a commit. The content of all blobs that can be seen from trees or
896 commits will be added to the list.
899 for d in self._join(self.get(id)):
901 except StopIteration:
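# Usage sketch (hypothetical id): join() flattens a blob, tree, or commit
# down to its blob contents, so dumping a stored file is just:
#
#   cp = CatPipe()
#   for blob in cp.join('ce013625030ba8dba906f756967f9e9ca394464a'):
#       sys.stdout.write(blob)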