1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, zlib, time, subprocess, struct, stat, re, tempfile
7 from bup.helpers import *
8 from bup import _helpers
# Default bup repository location, used when BUP_DIR is not set.
12 home_repodir = os.path.expanduser('~/.bup')
# Git pack object-type codes <-> names (per the pack format:
# 1=commit, 2=tree, 3=blob, 4=tag).
15 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
16 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
# Base exception for all errors raised by this module.  (Class body is
# elided in this view; presumably just 'pass'.)
22 class GitError(Exception):
# NOTE(review): the 'def repo(sub=...)' line and parts of the body are not
# visible in this view; comments describe only the visible lines.
27 """Get the path to the git repository or one of its subdirectories."""
# Fail loudly if nothing has located/validated a repository yet (repodir
# is set by check_repo_or_die()/guess_repo()).
30 raise GitError('You should call check_repo_or_die()')
32 # If there's a .git subdirectory, then the actual repo is in there.
33 gd = os.path.join(repodir, '.git')
34 if os.path.exists(gd):
37 return os.path.join(repodir, sub)
40 def mangle_name(name, mode, gitmode):
41     """Mangle a file name to present an abstract name for segmented files.
42     Mangled file names will have the ".bup" extension added to them. If a
43     file's name already ends with ".bup", a ".bupl" extension is added to
44     disambiguate normal files from segmented ones.
# A regular file stored as a non-regular git object means it was chunked;
# the (elided) branch presumably appends '.bup'.
46     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
# Names already ending in '.bup' (or '.bup' plus one trailing char) get
# '.bupl' so they cannot be confused with chunked files on restore.
48     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
# Result codes for demangle_name() below.
54 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
55 def demangle_name(name):
56     """Remove name mangling from a file name, if necessary.
58     The return value is a tuple (demangled_filename,mode), where mode is one of
61     * BUP_NORMAL : files that should be read as-is from the repository
62     * BUP_CHUNKED : files that were chunked and need to be assembled
64     For more information on the name mangling algorithm, see mangle_name()
# '.bupl' marks a normal file whose name collided with the mangling scheme.
66     if name.endswith('.bupl'):
67         return (name[:-5], BUP_NORMAL)
# '.bup' marks a chunked (segmented) file that must be reassembled.
68     elif name.endswith('.bup'):
69         return (name[:-4], BUP_CHUNKED)
# Anything else was stored as-is.
71         return (name, BUP_NORMAL)
# Encode 'content' as a git pack object of the given type, yielding
# compressed byte chunks.  (The variable-length size-encoding lines are
# elided in this view.)
74 def _encode_packobj(type, content):
# First byte: low 4 bits of the size plus the 3-bit type code, per the
# git pack format.
77     szbits = (sz & 0x0f) | (_typemap[type]<<4)
# Compression level 1: bup favors speed over ratio for pack data.
86     z = zlib.compressobj(1)
88     yield z.compress(content)
# Encode 'content' as a git loose object: zlib-compressed
# '<type> <size>\0' header followed by the raw content.  (The final
# z.flush() presumably follows in elided lines.)
92 def _encode_looseobj(type, content):
93     z = zlib.compressobj(1)
94     yield z.compress('%s %d\0' % (type, len(content)))
95     yield z.compress(content)
# Decode a git loose object, returning (type, content).  The header
# parsing lines (splitting '<type> <size>\0') are elided in this view.
99 def _decode_looseobj(buf):
101     s = zlib.decompress(buf)
# Sanity checks: known object type, and the declared size matches the
# actual payload length.
108     assert(type in _typemap)
109     assert(sz == len(content))
110     return (type, content)
# Decode a pack-encoded object, returning (type, content).  The
# size-decoding loop is partially elided in this view.
113 def _decode_packobj(buf):
# Bits 4..6 of the first byte hold the object type code.
116     type = _typermap[(c & 0x70) >> 4]
# Each continuation byte contributes 7 more size bits.
123     sz |= (c & 0x7f) << shift
# The compressed payload starts right after the size bytes.
127     return (type, zlib.decompress(buf[i+1:]))
# NOTE(review): the 'class PackIdx' header line and several method bodies
# are elided in this view; comments describe only visible lines.
131     """Object representation of a Git pack index file."""
132     def __init__(self, filename):
# mmap the .idx file; all lookups below read directly from the map.
134         self.map = mmap_read(open(filename))
# Verify the pack-index v2 magic ('\377tOc') and version.
135         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
136         self.fanout = list(struct.unpack('!256I',
137 str(buffer(self.map, 8, 256*4))))
138         self.fanout.append(0) # entry "-1"
# fanout[255] is the total object count in this index.
139         nsha = self.fanout[255]
# 32-bit offset table follows the fanout, sha and crc tables.
140         self.ofstable = buffer(self.map,
141 8 + 256*4 + nsha*20 + nsha*4,
# 64-bit offset table, used for packs larger than 2GB.
143         self.ofs64table = buffer(self.map,
144 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
146     def _ofs_from_idx(self, idx):
147         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
# High bit set means the low 31 bits index into the 64-bit table.
149             idx64 = ofs & 0x7fffffff
# NOTE(review): unpacking 8 bytes with '!I' looks suspect ('!Q' would be
# expected for a 64-bit offset) — confirm against the real file.
150             ofs = struct.unpack('!I',
151 str(buffer(self.ofs64table, idx64*8, 8)))[0]
154     def _idx_from_hash(self, hash):
155         global _total_searches, _total_steps
157         assert(len(hash) == 20)
# Narrow the binary search using the first-byte fanout table.
159         start = self.fanout[b1-1] # range -1..254
160         end = self.fanout[b1] # range 0..255
161         buf = buffer(self.map, 8 + 256*4, end*20)
163         _total_steps += 1 # lookup table is a step
# Binary search over the sorted sha table (loop header elided).
166             mid = start + (end-start)/2
167             v = str(buf[mid*20:(mid+1)*20])
176     def find_offset(self, hash):
177         """Get the offset of an object inside the index file."""
178         idx = self._idx_from_hash(hash)
180             return self._ofs_from_idx(idx)
183     def exists(self, hash):
184         """Return nonempty if the object exists in this index."""
185         return hash and (self._idx_from_hash(hash) != None) and True or None
# Iterate all 20-byte shas in index order (method header elided).
188         for i in xrange(self.fanout[255]):
189             yield buffer(self.map, 8 + 256*4, 20*i, 20)
# __len__: total object count (method header elided).
192         return int(self.fanout[255])
# (The final 'return v' line is elided in this view.)
195 def extract_bits(buf, nbits):
196     """Take the first 'nbits' bits from 'buf' and return them as an integer."""
197     mask = (1<<nbits) - 1
# firstword() reads the first 4 bytes of buf as a 32-bit integer.
198     v = _helpers.firstword(buf)
# Keep only the top nbits bits.
199     v = (v >> (32-nbits)) & mask
# NOTE(review): the 'class PackMidx' header line and several lines of the
# bodies below are elided in this view.
204     """Wrapper which contains data from multiple index files.
205     Multiple index (.midx) files constitute a wrapper around index (.idx) files
206     and make it possible for bup to expand Git's indexing capabilities to vast
209     def __init__(self, filename):
211         assert(filename.endswith('.midx'))
212         self.map = mmap_read(open(filename))
# Old-style (v1) midx files are ignored: warn and install empty dummy
# tables so every lookup misses and the file is effectively skipped.
213         if str(self.map[0:8]) == 'MIDX\0\0\0\1':
214             log('Warning: ignoring old-style midx %r\n' % filename)
217             self.fanout = buffer('\0\0\0\0')
218             self.shalist = buffer('\0'*20)
# v2 layout: 8-byte magic, 4-byte bit count, fanout, sha list, idx names.
221         assert(str(self.map[0:8]) == 'MIDX\0\0\0\2')
222         self.bits = _helpers.firstword(self.map[8:12])
223         self.entries = 2**self.bits
224         self.fanout = buffer(self.map, 12, self.entries*4)
225         shaofs = 12 + self.entries*4
226         nsha = self._fanget(self.entries-1)
227         self.shalist = buffer(self.map, shaofs, nsha*20)
# Trailing NUL-separated list of the .idx files this midx covers.
228         self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
# Read fanout entry i as a big-endian 32-bit int.
230     def _fanget(self, i):
232         s = self.fanout[start:start+4]
233         return _helpers.firstword(s)
# Return sha #i as a 20-byte string (method header elided).
236         return str(self.shalist[i*20:(i+1)*20])
238     def exists(self, hash):
239         """Return nonempty if the object exists in the index files."""
240         global _total_searches, _total_steps
# Use the top self.bits bits of the wanted hash to pick a fanout bucket.
243         el = extract_bits(want, self.bits)
245             start = self._fanget(el-1)
246             startv = el << (32-self.bits)
250         end = self._fanget(el)
251         endv = (el+1) << (32-self.bits)
252         _total_steps += 1 # lookup table is a step
253         hashv = _helpers.firstword(hash)
254         #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
257             #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
# Interpolation search: guess the position proportionally between the
# bucket's value bounds instead of plain bisection.
258             mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
259             #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
261             #print ' %08x' % self._num(v)
# Narrow the search window around the probed entry (comparison elided).
264                 startv = _helpers.firstword(v)
267                 endv = _helpers.firstword(v)
# Iterate all shas in order (method header elided).
273         for i in xrange(self._fanget(self.entries-1)):
274             yield buffer(self.shalist, i*20, 20)
# __len__: total object count (method header elided).
277         return int(self._fanget(self.entries-1))
# NOTE(review): the 'class PackIdxList' header and many body lines are
# elided in this view; comments describe only visible lines.
282     def __init__(self, dir):
284         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
294         assert(_mpi_count == 0)
# Iterate all objects across all packs, merged (method header elided).
297         return iter(idxmerge(self.packs))
# __len__: total across all loaded indexes (method header elided).
300         return sum(len(pack) for pack in self.packs)
302     def exists(self, hash):
303         """Return nonempty if the object exists in the index files."""
304         global _total_searches
# 'also' holds extra in-memory hashes added via add()/the elided helper.
306         if hash in self.also:
308         for i in range(len(self.packs)):
310             _total_searches -= 1 # will be incremented by sub-pack
# Move-to-front: reorder so most recently used packs are searched first.
312                 # reorder so most recently used packs are searched first
313                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
317     def refresh(self, skip_midx = False):
318         """Refresh the index list.
319         This method verifies if .midx files were superseded (e.g. all of its
320         contents are in another, bigger .midx file) and removes the superseded
323         If skip_midx is True, all work on .midx files will be skipped and .midx
324         files will be removed from the list.
326         The module-global variable 'ignore_midx' can force this function to
327         always act as if skip_midx was True.
329         skip_midx = skip_midx or ignore_midx
# Start from the currently-loaded packs, keyed by filename.
330         d = dict((p.name, p) for p in self.packs
331                  if not skip_midx or not isinstance(p, PackMidx))
332         if os.path.exists(self.dir):
# Map each .idx name covered by an already-loaded midx to that midx.
335                 for ix in self.packs:
336                     if isinstance(ix, PackMidx):
337                         for name in ix.idxnames:
338                             d[os.path.join(self.dir, name)] = ix
# Load any .midx files on disk we haven't seen yet.
339                 for f in os.listdir(self.dir):
340                     full = os.path.join(self.dir, f)
341                     if f.endswith('.midx') and not d.get(full):
343                         (mxd, mxf) = os.path.split(mx.name)
345                         for n in mx.idxnames:
346                             if not os.path.exists(os.path.join(mxd, n)):
347                                 log(('warning: index %s missing\n' +
348                                      '  used by %s\n') % (n, mxf))
# Prefer bigger midx files: sort descending by entry count.
352                 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
355                     for sub in ix.idxnames:
356                         found = d.get(os.path.join(self.dir, sub))
357                         if not found or isinstance(found, PackIdx):
358                             # doesn't exist, or exists but not in a midx
360                             for name in ix.idxnames:
361                                 d[os.path.join(self.dir, name)] = ix
# A midx whose every .idx is already covered elsewhere is redundant.
365                         log('midx: removing redundant: %s\n'
366                             % os.path.basename(ix.name))
# Finally load any plain .idx files not covered by a midx.
368                 for f in os.listdir(self.dir):
369                     full = os.path.join(self.dir, f)
370                     if f.endswith('.idx') and not d.get(full):
# De-duplicate: a midx may appear under several .idx keys.
373         self.packs = list(set(d.values()))
374         log('PackIdxList: using %d index%s.\n'
375             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
# add(): insert an extra hash into the in-memory set (header elided).
378         """Insert an additional object in the list."""
# zap_also(): clear the extra in-memory hashes (header elided).
382         """Remove all additional objects from the list."""
386 def calc_hash(type, content):
387     """Calculate some content's hash in the Git fashion."""
# Git object ids hash '<type> <size>\0' + content; the hashing lines are
# elided in this view (presumably SHA-1).
388     header = '%s %d\0' % (type, len(content))
# Sort key for tree entries: git sorts directory names as if they had a
# trailing '/'.  (The return lines are elided in this view.)
394 def _shalist_sort_key(ent):
395     (mode, name, id) = ent
# Tree modes are octal strings here, hence int(mode, 8).
396     if stat.S_ISDIR(int(mode, 8)):
402 def idxmerge(idxlist):
403     """Generate a list of all the objects reachable in a PackIdxList."""
# Heap-merge the (sorted) per-index iterators; the lines that pop and
# advance the heap are partially elided in this view.
404     total = sum(len(i) for i in idxlist)
405     iters = (iter(i) for i in idxlist)
406     heap = [(next(it), it) for it in iters]
# Progress output roughly every 10k objects.
411         if (count % 10024) == 0:
412             progress('Reading indexes: %.2f%% (%d/%d)\r'
413                      % (count*100.0/total, count, total))
421             heapq.heapreplace(heap, (e, it))
424     log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
# NOTE(review): the 'class PackWriter' header and many body lines are
# elided in this view; comments describe only visible lines.
428     """Writes Git objects inside a pack file."""
429     def __init__(self, objcache_maker=None):
# objcache_maker lets callers supply a custom existence-check index;
# by default _make_objcache() builds a PackIdxList lazily.
434         self.objcache_maker = objcache_maker
440     def _make_objcache(self):
441         if self.objcache == None:
442             if self.objcache_maker:
443                 self.objcache = self.objcache_maker()
445                 self.objcache = PackIdxList(repo('objects/pack'))
# _open()-style setup (header elided): create the temp pack file and
# write the 'PACK' v2 header with a zero object count (fixed up at end).
449         self._make_objcache()
450         (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
451         self.file = os.fdopen(fd, 'w+b')
452         assert(name.endswith('.pack'))
453         self.filename = name[:-5]
454         self.file.write('PACK\0\0\0\2\0\0\0\0')
456     def _raw_write(self, datalist):
459         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
460         # the file never has a *partial* blob. So let's make sure it's
461         # all-or-nothing. (The blob shouldn't be very big anyway, thanks
462         # to our hashsplit algorithm.) f.write() does its own buffering,
463         # but that's okay because we'll flush it in _end().
464         oneblob = ''.join(datalist)
466         self.outbytes += len(oneblob)
# Encode and append one object; 'bin' is its binary sha.
469     def _write(self, bin, type, content):
472         self._raw_write(_encode_packobj(type, content))
475     def breakpoint(self):
476         """Clear byte and object counts and return the last processed id."""
478         self.outbytes = self.count = 0
481     def write(self, type, content):
482         """Write an object in this pack file."""
483         return self._write(calc_hash(type, content), type, content)
485     def exists(self, id):
486         """Return non-empty if an object is found in the object cache."""
487         if not self.objcache:
488             self._make_objcache()
489         return self.objcache.exists(id)
491     def maybe_write(self, type, content):
492         """Write an object to the pack file if not present and return its id."""
493         bin = calc_hash(type, content)
494         if not self.exists(bin):
495             self._write(bin, type, content)
496             self.objcache.add(bin)
499     def new_blob(self, blob):
500         """Create a blob object in the pack with the supplied content."""
501         return self.maybe_write('blob', blob)
503     def new_tree(self, shalist):
504         """Create a tree object in the pack."""
# Git requires tree entries in its special sorted order.
505         shalist = sorted(shalist, key = _shalist_sort_key)
507         for (mode,name,bin) in shalist:
# Git never writes a leading zero on tree-entry modes.
510             assert(mode[0] != '0')
512             assert(len(bin) == 20)
513             l.append('%s %s\0%s' % (mode,name,bin))
514         return self.maybe_write('tree', ''.join(l))
516     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
518         if tree: l.append('tree %s' % tree.encode('hex'))
519         if parent: l.append('parent %s' % parent.encode('hex'))
520         if author: l.append('author %s %s' % (author, _git_date(adate)))
521         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
524         return self.maybe_write('commit', '\n'.join(l))
526     def new_commit(self, parent, tree, msg):
527         """Create a commit object in the pack."""
# Author and committer are both the current user.
529         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
530         commit = self._new_commit(tree, parent,
531                                   userline, now, userline, now,
# abort() (header elided): delete the partial temp pack file.
536         """Remove the pack file from disk."""
541             os.unlink(self.filename + '.pack')
# _end() (header elided): finalize the pack and return its name prefix.
545         if not f: return None
549         # update object count
# Patch the real object count into the header written earlier.
551         cp = struct.pack('!i', self.count)
555         # calculate the pack sha1sum
# The pack's trailing checksum covers everything written so far.
562         f.write(sum.digest())
# Let git build the .idx for our finished pack.
566         p = subprocess.Popen(['git', 'index-pack', '-v',
568                                self.filename + '.pack'],
569                              preexec_fn = _gitenv,
570                              stdout = subprocess.PIPE)
571         out = p.stdout.read().strip()
572         _git_wait('git index-pack', p)
574             raise GitError('git index-pack produced no output')
575         nameprefix = repo('objects/pack/%s' % out)
576         if os.path.exists(self.filename + '.map'):
577             os.unlink(self.filename + '.map')
578         os.rename(self.filename + '.pack', nameprefix + '.pack')
579         os.rename(self.filename + '.idx', nameprefix + '.idx')
# close() (header elided): public wrapper around _end().
583         """Close the pack file and move it to its definitive path."""
# Body of _git_date() (def line elided): format a timestamp as git's
# '<epoch> <timezone>' string.
588     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
# Body of _gitenv() (def line elided): point git at the bup repository;
# used as a subprocess preexec_fn throughout this module.
592     os.environ['GIT_DIR'] = os.path.abspath(repo())
595 def list_refs(refname = None):
596     """Generate a list of tuples in the form (refname,hash).
597     If a ref name is specified, list only this particular ref.
# '--' terminates options; the (elided) line presumably appends refname.
599     argv = ['git', 'show-ref', '--']
602     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
603     out = p.stdout.read().strip()
# show-ref exits nonzero when there are no matching refs; not an error.
604     rv = p.wait()  # not fatal
608         for d in out.split('\n'):
# Each line is '<sha> <refname>'; yield the sha as 20 binary bytes.
609             (sha, name) = d.split(' ', 1)
610             yield (name, sha.decode('hex'))
613 def read_ref(refname):
614     """Get the commit id of the most recent commit made on a given ref."""
# The (elided) lines presumably return l[0][1] or None when no ref matched.
615     l = list(list_refs(refname))
623 def rev_list(ref, count=None):
624     """Generate a list of reachable commits in reverse chronological order.
626     This generator walks through commits, from child to parent, that are
627     reachable via the specified ref and yields a series of tuples of the form
630     If count is a non-zero integer, limit the number of commits to "count"
# Guard against a ref that would be parsed as a git option.
633     assert(not ref.startswith('-'))
636         opts += ['-n', str(atoi(count))]
# '--pretty=format:%ct' makes each commit print its commit timestamp.
637     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
638     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
# Output alternates 'commit <sha>' lines and timestamp lines (loop elided).
642         if s.startswith('commit '):
643             commit = s[7:].decode('hex')
647     rv = p.wait()  # not fatal
# NOTE(review): Python-2-only raise syntax; would need parentheses on py3.
649         raise GitError, 'git rev-list returned error %d' % rv
652 def rev_get_date(ref):
653     """Get the date of the latest commit on the specified ref."""
# count=1: we only need the newest commit; the (elided) line presumably
# returns 'date'.
654     for (date, commit) in rev_list(ref, count=1):
# NOTE(review): Python-2-only raise syntax; would need parentheses on py3.
656     raise GitError, 'no such commit %r' % ref
659 def update_ref(refname, newval, oldval):
660     """Change the commit pointed to by a branch."""
# Only branch refs may be updated through this helper.
663     assert(refname.startswith('refs/heads/'))
# Passing oldval makes git verify the ref hasn't moved (compare-and-swap).
664     p = subprocess.Popen(['git', 'update-ref', refname,
665                           newval.encode('hex'), oldval.encode('hex')],
666                          preexec_fn = _gitenv)
667     _git_wait('git update-ref', p)
670 def guess_repo(path=None):
671     """Set the path value in the global variable "repodir".
672     This makes bup look for an existing bup repository, but not fail if a
673     repository doesn't exist. Usually, if you are interacting with a bup
674     repository, you would not be calling this function but using
# Resolution order (some lines elided): explicit path, then $BUP_DIR,
# then the default ~/.bup.
681         repodir = os.environ.get('BUP_DIR')
683             repodir = os.path.expanduser('~/.bup')
686 def init_repo(path=None):
687     """Create the Git bare repository for bup in a given path."""
690     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
# FIXME(review): '%d' is the integer format specifier but 'd' is a path
# string — this raise would itself fail with TypeError; should be '%s'.
691         raise GitError('"%d" exists but is not a directory\n' % d)
692     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
693                          preexec_fn = _gitenv)
694     _git_wait('git init', p)
695     # Force the index version configuration in order to ensure bup works
696     # regardless of the version of the installed Git binary.
697     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
698                          stdout=sys.stderr, preexec_fn = _gitenv)
699     _git_wait('git config', p)
702 def check_repo_or_die(path=None):
703     """Make sure a bup repository exists, and abort if not.
704     If the path to a particular repository was not specified, this function
705     initializes the default repository automatically.
# A valid repo must contain objects/pack; otherwise complain and exit.
708     if not os.path.isdir(repo('objects/pack/.')):
# The default home repo may be auto-created (lines elided).
709         if repodir == home_repodir:
712             log('error: %r is not a bup/git repository\n' % repo())
# NOTE(review): the 'def treeparse(buf)' line is elided in this view.
717     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
719     while ofs < len(buf):
# Each raw tree entry is '<mode> <name>\0<20-byte sha>'.
720         z = buf[ofs:].find('\0')
722         spl = buf[ofs:ofs+z].split(' ', 1)
723         assert(len(spl) == 2)
724         sha = buf[ofs+z+1:ofs+z+1+20]
726         yield (spl[0], spl[1], sha)
# NOTE(review): the function header line is elided in this view; the
# result is presumably cached in the module-global _ver.
731     """Get Git's version and ensure a usable version is installed.
733     The returned version is formatted as an ordered tuple with each position
734     representing a digit in the version tag. For example, the following tuple
735     would represent version 1.6.6.9:
741         p = subprocess.Popen(['git', '--version'],
742                              stdout=subprocess.PIPE)
743         gvs = p.stdout.read()
744         _git_wait('git --version', p)
# NOTE(review): the '.' between the \S+ groups is unescaped, so it
# matches any character; harmless here but probably meant r'\.'.
745         m = re.match(r'git version (\S+.\S+)', gvs)
747             raise GitError('git --version weird output: %r' % gvs)
748         _ver = tuple(m.group(1).split('.'))
# Minimum supported git version; compared component-wise as strings.
749     needed = ('1','5', '3', '1')
751         raise GitError('git version %s or higher is required; you have %s'
752                        % ('.'.join(needed), '.'.join(_ver)))
# Wait for subprocess 'p' and raise GitError on a nonzero exit status.
# (The p.wait() line is elided in this view.)
756 def _git_wait(cmd, p):
759         raise GitError('%s returned %d' % (cmd, rv))
# Run a git command in the bup repo and return its stdout; raises
# GitError (via _git_wait) on failure.  (The read line is elided.)
762 def _git_capture(argv):
763     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
765     _git_wait(repr(argv), p)
# Iterator wrapper that can be aborted mid-stream, invoking an optional
# cleanup callback.  Used by CatPipe so a partially-read 'git cat-file
# --batch' stream can be torn down safely.  (Several lines are elided.)
769 class _AbortableIter:
770     def __init__(self, it, onabort = None):
772         self.onabort = onabort
# next(): delegate to the wrapped iterator (method header elided).
780             return self.it.next()
781         except StopIteration, e:
# abort() (header elided):
789         """Abort iteration and call the abortion callback, if needed."""
# NOTE(review): the 'class CatPipe' header and many body lines are elided
# in this view; comments describe only visible lines.
801     """Link to 'git cat-file' that is used to retrieve blob data."""
# git >= 1.5.6 supports 'cat-file --batch'; older versions fall back to
# one subprocess per object, which is much slower.
804         wanted = ('1','5','6')
807             log('warning: git version < %s; bup will be slow.\n'
810             self.get = self._slow_get
812             self.p = self.inprogress = None
813             self.get = self._fast_get
# _abort()/restart cleanup (headers elided): close the old pipe.
817             self.p.stdout.close()
820         self.inprogress = None
# Spawn the long-lived 'cat-file --batch' subprocess.
824         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
825                                   stdin=subprocess.PIPE,
826                                   stdout=subprocess.PIPE,
828                                   preexec_fn = _gitenv)
830     def _fast_get(self, id):
# Restart the subprocess if it died or was never started.
831         if not self.p or self.p.poll() != None:
834         assert(self.p.poll() == None)
# Only one object may be streamed from the pipe at a time.
836             log('_fast_get: opening %r while %r is open'
837                 % (id, self.inprogress))
838         assert(not self.inprogress)
# Newlines in 'id' would desynchronize the batch protocol.
839         assert(id.find('\n') < 0)
840         assert(id.find('\r') < 0)
843         self.p.stdin.write('%s\n' % id)
# Response header: '<sha> <type> <size>' or '<id> missing'.
844         hdr = self.p.stdout.readline()
845         if hdr.endswith(' missing\n'):
846             raise KeyError('blob %r is missing' % id)
848         if len(spl) != 3 or len(spl[0]) != 40:
849             raise GitError('expected blob, got %r' % spl)
850         (hex, type, size) = spl
# Stream exactly 'size' bytes; abort tears down the pipe if the caller
# stops early.
852         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
853                             onabort = self._abort)
# The batch protocol terminates each object with a newline.
858             assert(self.p.stdout.readline() == '\n')
859             self.inprogress = None
864     def _slow_get(self, id):
865         assert(id.find('\n') < 0)
866         assert(id.find('\r') < 0)
# Two subprocesses per object: one for the type, one for the content.
868         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
871         p = subprocess.Popen(['git', 'cat-file', type, id],
872                              stdout=subprocess.PIPE,
873                              preexec_fn = _gitenv)
874         for blob in chunkyreader(p.stdout):
876         _git_wait('git cat-file', p)
# _join() (header elided): recursively yield all blob data reachable
# from an object — blobs directly, trees/commits by recursion.
884             treefile = ''.join(it)
885             for (mode, name, sha) in treeparse(treefile):
886                 for blob in self.join(sha.encode('hex')):
888         elif type == 'commit':
889             treeline = ''.join(it).split('\n')[0]
890             assert(treeline.startswith('tree '))
891             for blob in self.join(treeline[5:]):
894             raise GitError('invalid object type %r: expected blob/tree/commit'
# join() (header elided): public entry point for _join().
898         """Generate a list of the content of all blobs that can be reached
899         from an object. The hash given in 'id' must point to a blob, a tree
900         or a commit. The content of all blobs that can be seen from trees or
901         commits will be added to the list.
904             for d in self._join(self.get(id)):
906         except StopIteration: