1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
import os, zlib, time, subprocess, struct, stat, re, tempfile
import sys, heapq

from bup.helpers import *
# Default bup repository location (used when no explicit path is given).
home_repodir = os.path.expanduser('~/.bup')

# Numeric object-type codes used in git packfile headers, and the reverse map.
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
class GitError(Exception):
    """Generic error raised by this git interaction layer."""
def repo(sub = ''):
    """Get the path to the git repository or one of its subdirectories.

    Raises GitError if the module-global 'repodir' has not been set yet
    (i.e. check_repo_or_die() was never called).
    """
    # NOTE(review): the function header and the repodir guard were elided
    # in the reviewed listing and have been restored here.
    global repodir
    if not repodir:
        raise GitError('You should call check_repo_or_die()')
    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):
        repodir = gd
    return os.path.join(repodir, sub)
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.

    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # The file is a regular file but is stored as something else
        # (i.e. chunked into a subtree): mark it so readers reassemble it.
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        # The plain name would be ambiguous with a mangled one (it ends in
        # '.bup', or '.bup' plus one character), so tag it as literal.
        return name + '.bupl'
    else:
        return name
(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is
    one of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        # Explicitly marked "literal": strip the marker, read as-is.
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        # Marked as segmented: the underlying object is a tree of chunks.
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
def _encode_packobj(type, content):
    # First header byte: low 4 bits of the object size in the low nibble,
    # the packfile type code (_typemap) in bits 4-6.
    # NOTE(review): the size-variable setup and the varint continuation
    # loop around this line are elided in this listing -- confirm upstream.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    # Compress the payload with zlib level 1 (speed over ratio).
    z = zlib.compressobj(1)
    yield z.compress(content)
def _encode_looseobj(type, content):
    """Yield the zlib-compressed chunks of a git loose object.

    A loose object is '<type> <size>\\0' followed by the raw content, all
    inside one zlib stream.
    """
    z = zlib.compressobj(1)  # level 1: favour speed over ratio
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    # Flush the compressor; without this the zlib stream is truncated and
    # the object cannot be fully decompressed.
    yield z.flush()
def _decode_looseobj(buf):
    # A loose object is one zlib stream: '<type> <size>\0' + content.
    s = zlib.decompress(buf)
    # NOTE(review): the lines splitting s into type, sz and content are
    # elided in this listing.
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)

def _decode_packobj(buf):
    # High nibble of the first header byte carries the packfile type code.
    # NOTE(review): the varint header loop (c, i, sz, shift) is elided here.
    type = _typermap[(c & 0x70) >> 4]
    sz |= (c & 0x7f) << shift
    # Everything after the header bytes is the zlib-compressed payload.
    return (type, zlib.decompress(buf[i+1:]))
    """Object representation of a Git pack index file."""

    def __init__(self, filename):
        # Map the whole .idx file; all lookups read straight from the map.
        self.map = mmap_read(open(filename))
        # Magic for a version-2 pack index: '\377tOc' followed by version 2.
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        # 256-entry fanout table: fanout[b] = count of shas whose first
        # byte is <= b; an extra 0 is appended so fanout[-1] works.
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]  # total number of objects in this index
        # 4-byte offset table starts after header, fanout, shas and CRCs.
        # NOTE(review): the length argument of this buffer() call appears
        # to be elided in this listing -- confirm upstream.
        self.ofstable = buffer(self.map,
                               8 + 256*4 + nsha*20 + nsha*4,
        # 8-byte offsets (for packs > 2GB) follow the 4-byte offset table.
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
145 def _ofs_from_idx(self, idx):
146 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
148 idx64 = ofs & 0x7fffffff
149 ofs = struct.unpack('!I',
150 str(buffer(self.ofs64table, idx64*8, 8)))[0]
    def _idx_from_hash(self, hash):
        # Binary search for a 20-byte binary sha in the sorted sha table,
        # bounded by the fanout entries for the sha's first byte.
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        # NOTE(review): the assignment of b1 (presumably the first byte of
        # hash) and the surrounding search loop are elided in this listing.
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        buf = buffer(self.map, 8 + 256*4, end*20)
        _total_steps += 1 # lookup table is a step
        mid = start + (end-start)/2
        v = str(buf[mid*20:(mid+1)*20])
175 def find_offset(self, hash):
176 """Get the offset of an object inside the index file."""
177 idx = self._idx_from_hash(hash)
179 return self._ofs_from_idx(idx)
182 def exists(self, hash):
183 """Return nonempty if the object exists in this index."""
184 return hash and (self._idx_from_hash(hash) != None) and True or None
187 for i in xrange(self.fanout[255]):
188 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
191 return int(self.fanout[255])
def extract_bits(buf, nbits):
    """Take the first 'nbits' bits from 'buf' and return them as an integer."""
    mask = (1<<nbits) - 1
    # Read the first 4 bytes big-endian and keep only the top nbits.
    v = struct.unpack('!I', buf[0:4])[0]
    v = (v >> (32-nbits)) & mask
    # BUGFIX: the computed value was never returned in the reviewed
    # listing (the function fell off the end, returning None).
    return v
    """Wrapper which contains data from multiple index files.
    Multiple index (.midx) files constitute a wrapper around index (.idx) files
    and make it possible for bup to expand Git's indexing capabilities to vast

    def __init__(self, filename):
        assert(filename.endswith('.midx'))
        self.map = mmap_read(open(filename))
        # Old-style (version 1) midx files are ignored, not parsed.
        if str(self.map[0:8]) == 'MIDX\0\0\0\1':
            log('Warning: ignoring old-style midx %r\n' % filename)
            # NOTE(review): degenerate-setup lines (bits/entries and the
            # early return) around here are elided in this listing.
            self.fanout = buffer('\0\0\0\0')
            self.shalist = buffer('\0'*20)
        # Version-2 midx layout: magic, bit count, fanout, shas, idx names.
        assert(str(self.map[0:8]) == 'MIDX\0\0\0\2')
        self.bits = struct.unpack('!I', self.map[8:12])[0]
        self.entries = 2**self.bits  # fanout has 2**bits entries
        self.fanout = buffer(self.map, 12, self.entries*4)
        shaofs = 12 + self.entries*4
        nsha = self._fanget(self.entries-1)  # total number of shas
        self.shalist = buffer(self.map, shaofs, nsha*20)
        # Names of the .idx files this midx covers, NUL-separated trailer.
        self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
229 def _fanget(self, i):
231 s = self.fanout[start:start+4]
232 return struct.unpack('!I', s)[0]
235 return str(self.shalist[i*20:(i+1)*20])
237 def _num(self, hash):
238 return struct.unpack('!I', hash[:4])[0]
    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        # Interpolation search: the top self.bits bits of the sha pick a
        # fanout bucket, then the probe position is estimated linearly
        # between the bucket's numeric bounds.
        global _total_searches, _total_steps
        # NOTE(review): the setup of 'want', the search loop, and the
        # final comparisons are elided in this listing -- confirm upstream.
        el = extract_bits(want, self.bits)
        start = self._fanget(el-1)
        startv = el << (32-self.bits)
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1 # lookup table is a step
        hashv = self._num(hash)
        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
        #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
        mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
        #print '  %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
        #print '    %08x' % self._num(v)
        startv = self._num(v)
275 for i in xrange(self._fanget(self.entries-1)):
276 yield buffer(self.shalist, i*20, 20)
279 return int(self._fanget(self.entries-1))
    def __init__(self, dir):
        # NOTE(review): most of the constructor is elided in this listing;
        # _mpi_count is presumably a module global tracking live instances.
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it

    # NOTE(review): apparently the tail of __del__ -- header elided.
        assert(_mpi_count == 0)

    # NOTE(review): apparently the body of __iter__ -- header elided.
        return iter(idxmerge(self.packs))

    # NOTE(review): apparently the body of __len__ -- header elided.
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        # NOTE(review): the search-counter increment, the per-pack
        # membership test and the returns are elided in this listing.
        if hash in self.also:
        for i in range(len(self.packs)):
                _total_searches -= 1 # will be incremented by sub-pack
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        # NOTE(review): several lines of this method are elided in this
        # listing (midx loading, the midxl accumulation, pack loading).
        skip_midx = skip_midx or ignore_midx
        # Start from the packs we already have, keyed by file name; drop
        # midx entries entirely when midx handling is disabled.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
                # Also map every .idx name a known midx covers to that midx.
                for ix in self.packs:
                    if isinstance(ix, PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                # Load any .midx file on disk we don't already know about.
                for f in os.listdir(self.dir):
                    full = os.path.join(self.dir, f)
                    if f.endswith('.midx') and not d.get(full):
                        (mxd, mxf) = os.path.split(mx.name)
                        # Warn if a midx refers to an .idx that is gone.
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     '  used by %s\n') % (n, mxf))
                # Prefer bigger midx files (they cover more indexes).
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            for name in ix.idxnames:
                                d[os.path.join(self.dir, name)] = ix
                            log('midx: removing redundant: %s\n'
                                % os.path.basename(ix.name))
                # Finally pick up any plain .idx not covered by a midx.
                for f in os.listdir(self.dir):
                    full = os.path.join(self.dir, f)
                    if f.endswith('.idx') and not d.get(full):
        # Deduplicate (several names may map to the same midx object).
        self.packs = list(set(d.values()))
        log('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
    # NOTE(review): only the docstrings of two methods (apparently add()
    # and zap_also(), managing self.also) survive in this listing.
        """Insert an additional object in the list."""

        """Remove all additional objects from the list."""


def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes '<type> <size>\0' followed by the raw content.
    header = '%s %d\0' % (type, len(content))
    # NOTE(review): the actual hashing of header+content and the return
    # are elided in this listing.
396 def _shalist_sort_key(ent):
397 (mode, name, id) = ent
398 if stat.S_ISDIR(int(mode, 8)):
def idxmerge(idxlist):
    """Generate a list of all the objects reachable in a PackIdxList."""
    # k-way merge of the (sorted) per-index iterators via a heap.
    total = sum(len(i) for i in idxlist)
    iters = (iter(i) for i in idxlist)
    heap = [(next(it), it) for it in iters]
    # NOTE(review): the heapify call, the merge loop and the yield are
    # elided in this listing; only progress reporting survives.
    if (count % 10024) == 0:
        progress('Reading indexes: %.2f%% (%d/%d)\r'
                 % (count*100.0/total, count, total))
    heapq.heapreplace(heap, (e, it))
    log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
    """Writes Git objects inside a pack file."""

    def __init__(self, objcache_maker=None):
        # NOTE(review): initialization of the counters and file handle is
        # elided in this listing.
        # Callable used to build the object-existence cache lazily (see
        # _make_objcache); None selects the default PackIdxList.
        self.objcache_maker = objcache_maker
442 def _make_objcache(self):
443 if self.objcache == None:
444 if self.objcache_maker:
445 self.objcache = self.objcache_maker()
447 self.objcache = PackIdxList(repo('objects/pack'))
        # NOTE(review): apparently the body of _open() -- header and the
        # 'if not self.file' guard are elided in this listing.
        self._make_objcache()
        # Create the pack as a temp file next to the objects directory so
        # the final rename is atomic.
        (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
        self.file = os.fdopen(fd, 'w+b')
        assert(name.endswith('.pack'))
        self.filename = name[:-5]
        # Pack header: 'PACK', version 2, object count 0 (patched later).
        self.file.write('PACK\0\0\0\2\0\0\0\0')

    def _raw_write(self, datalist):
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        # NOTE(review): the actual file write is elided in this listing.
        self.outbytes += len(oneblob)

    def _write(self, bin, type, content):
        # NOTE(review): verbose logging / counter lines are elided here.
        self._raw_write(_encode_packobj(type, content))

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        # NOTE(review): the _end()/reopen sequence is elided in this listing.
        self.outbytes = self.count = 0
483 def write(self, type, content):
484 """Write an object in this pack file."""
485 return self._write(calc_hash(type, content), type, content)
487 def exists(self, id):
488 """Return non-empty if an object is found in the object cache."""
489 if not self.objcache:
490 self._make_objcache()
491 return self.objcache.exists(id)
493 def maybe_write(self, type, content):
494 """Write an object to the pack file if not present and return its id."""
495 bin = calc_hash(type, content)
496 if not self.exists(bin):
497 self._write(bin, type, content)
498 self.objcache.add(bin)
    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        # Blobs have no internal structure; store the raw bytes directly.
        return self.maybe_write('blob', blob)
    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        # Entries must be in git's tree order (see _shalist_sort_key).
        shalist = sorted(shalist, key = _shalist_sort_key)
        # NOTE(review): the accumulator initialization and some sanity
        # asserts are elided in this listing.
        for (mode,name,bin) in shalist:
            assert(mode[0] != '0')
            assert(len(bin) == 20)
            # Tree entry format: '<mode> <name>\0' + 20-byte binary sha.
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        # Assemble the commit object line by line; every field is optional.
        # NOTE(review): the accumulator initialization and the trailing
        # blank-line/message appends are elided in this listing.
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, msg):
        """Create a commit object in the pack."""
        # Current user acts as both author and committer.
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, now, userline, now,

    # NOTE(review): apparently abort() -- header and cleanup guards elided.
        """Remove the pack file from disk."""
        os.unlink(self.filename + '.pack')

    # NOTE(review): apparently the body of _end() -- header elided, as are
    # the flush/seek lines between the surviving statements below.
        if not f: return None

        # update object count
        cp = struct.pack('!i', self.count)

        # calculate the pack sha1sum
        f.write(sum.digest())

        # Let git build the .idx and give the pack its final hash name.
        p = subprocess.Popen(['git', 'index-pack', '-v',
                              self.filename + '.pack'],
                             preexec_fn = _gitenv,
                             stdout = subprocess.PIPE)
        out = p.stdout.read().strip()
        _git_wait('git index-pack', p)
            raise GitError('git index-pack produced no output')
        nameprefix = repo('objects/pack/%s' % out)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

    # NOTE(review): close() docstring; header and body elided.
        """Close the pack file and move it to its definitive path."""

# NOTE(review): apparently _git_date(date) -- formats a unix timestamp in
# git's 'seconds tz' form; header elided.
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))

# NOTE(review): apparently _gitenv() -- points git at the bup repository;
# header elided.
    os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    # NOTE(review): the optional refname argument handling is elided here.
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    # NOTE(review): the rv/out guards are elided in this listing.
    # show-ref prints '<sha> <refname>' per line; yield binary shas.
    for d in out.split('\n'):
        (sha, name) = d.split(' ', 1)
        yield (name, sha.decode('hex'))


def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    # NOTE(review): the selection of the first result / None fallback is
    # elided in this listing.


def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form

    If count is a non-zero integer, limit the number of commits to "count"
    """
    # Refuse refs that would be parsed as git options.
    assert(not ref.startswith('-'))
    # NOTE(review): opts initialization and the output-parsing loop are
    # partially elided in this listing.
        opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
    rv = p.wait() # not fatal
        raise GitError, 'git rev-list returned error %d' % rv
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref.

    Raises GitError if the ref has no commits.
    """
    # BUGFIX: the loop body ('return date') was missing in the reviewed
    # listing; rev_list(count=1) yields at most one (date, commit) pair.
    for (date, commit) in rev_list(ref, count=1):
        return date
    # Call form instead of the legacy 'raise X, y' statement (same
    # semantics, and also valid Python 3 syntax).
    raise GitError('no such commit %r' % ref)
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    # NOTE(review): the oldval normalization lines are elided here.
    # Only branch heads may be updated through this helper.
    assert(refname.startswith('refs/heads/'))
    # git update-ref checks the old value atomically before writing.
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)


def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    """
    # NOTE(review): the explicit-path branch and the repodir guard are
    # elided in this listing; fallback order is BUP_DIR, then ~/.bup.
        repodir = os.environ.get('BUP_DIR')
            repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the target path exists but is not a directory.
    """
    # NOTE(review): the two setup lines below were elided in the reviewed
    # listing and have been restored (set repodir, then resolve it).
    guess_repo(path)
    d = repo()
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # BUGFIX: the format character was '%d' (integer), which raises
        # TypeError when formatting the string path; '%s' is intended.
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    # NOTE(review): the guess_repo call and the error-exit paths are
    # elided in this listing.
    if not os.path.isdir(repo('objects/pack/.')):
        # The default repository can be created on demand; anything else
        # is a hard error.
        if repodir == home_repodir:
            log('error: %r is not a bup/git repository\n' % repo())
def treeparse(buf):
    """Generate a list of (mode, name, hash) tuples of objects from 'buf'.

    'buf' is the raw content of a git tree object: a sequence of
    '<mode> <name>\\0' headers each followed by a 20-byte binary sha.
    """
    # BUGFIX: in the reviewed listing 'ofs' was never initialized or
    # advanced, making the loop unusable; both lines are restored.
    ofs = 0
    while ofs < len(buf):
        z = buf[ofs:].find('\0')
        assert(z > 0)
        spl = buf[ofs:ofs+z].split(' ', 1)
        assert(len(spl) == 2)
        sha = buf[ofs+z+1:ofs+z+1+20]
        ofs += z+1+20
        yield (spl[0], spl[1], sha)
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:
    """
    # NOTE(review): the function header (and the _ver caching guard, if
    # any) are elided in this listing.
    p = subprocess.Popen(['git', '--version'],
                         stdout=subprocess.PIPE)
    gvs = p.stdout.read()
    _git_wait('git --version', p)
    # NOTE(review): the '.' in this pattern is unescaped, so it matches
    # any character -- probably harmless here, but confirm intent.
    m = re.match(r'git version (\S+.\S+)', gvs)
        raise GitError('git --version weird output: %r' % gvs)
    # NOTE(review): these are tuples of *strings*, so the comparison below
    # (elided) is lexicographic per component; e.g. '10' < '5'. Confirm
    # this is acceptable for the version range being checked.
    _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
758 def _git_wait(cmd, p):
761 raise GitError('%s returned %d' % (cmd, rv))
def _git_capture(argv):
    # Run a git command inside the repo and capture its stdout.
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    # NOTE(review): the stdout read and the return of the captured output
    # are elided in this listing.
    _git_wait(repr(argv), p)
class _AbortableIter:
    # Wraps an iterator and invokes 'onabort' when iteration is abandoned
    # early, so the underlying stream can be cleaned up.
    def __init__(self, it, onabort = None):
        # NOTE(review): storage of 'it' and other init lines are elided.
        self.onabort = onabort

    # NOTE(review): apparently the body of next() -- header, the
    # exception re-dispatch and the abort-on-error handling are elided.
        return self.it.next()
    except StopIteration, e:

        """Abort iteration and call the abortion callback, if needed."""
    """Link to 'git cat-file' that is used to retrieve blob data."""
    # NOTE(review): the class header and __init__ header are elided in
    # this listing. Old git versions lack 'cat-file --batch', so a slower
    # one-process-per-object path is selected instead.
    wanted = ('1','5','6')
        log('warning: git version < %s; bup will be slow.\n'
        self.get = self._slow_get

        self.p = self.inprogress = None
        self.get = self._fast_get

    # NOTE(review): apparently _abort()/cleanup -- headers elided.
        self.p.stdout.close()
        self.inprogress = None

    # NOTE(review): apparently _restart() -- spawns the long-lived
    # 'cat-file --batch' process used by _fast_get; header elided.
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  preexec_fn = _gitenv)
    def _fast_get(self, id):
        # Restart the batch process if it died (or was never started).
        if not self.p or self.p.poll() != None:
        assert(self.p.poll() == None)
        # Only one object may be streamed from the batch pipe at a time.
            log('_fast_get: opening %r while %r is open'
                % (id, self.inprogress))
        assert(not self.inprogress)
        # Newlines/CRs in the id would corrupt the batch protocol.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        # NOTE(review): the inprogress bookkeeping and the header split
        # into 'spl' are elided in this listing.
        self.p.stdin.write('%s\n' % id)
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            raise KeyError('blob %r is missing' % id)
        # Batch header is '<sha> <type> <size>'.
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        # Stream exactly 'size' bytes; abort cleanly if abandoned early.
        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        # The batch protocol terminates each object with a newline.
        assert(self.p.stdout.readline() == '\n')
        self.inprogress = None
    def _slow_get(self, id):
        # One 'git cat-file' process per object: correct but slow; used
        # when git is too old for 'cat-file --batch'.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        # NOTE(review): the type yield and validation lines are elided.
        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
        _git_wait('git cat-file', p)

    # NOTE(review): apparently _join(it) -- recursively expands trees and
    # commits into their constituent blobs; header and the blob/type
    # dispatch lines are elided in this listing.
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
        elif type == 'commit':
            # A commit's first line is 'tree <hex sha>'; descend into it.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
        raise GitError('invalid object type %r: expected blob/tree/commit'

        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        # NOTE(review): the join() header and its error handling are
        # elided in this listing.
        for d in self._join(self.get(id)):
        except StopIteration: