1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, zlib, time, subprocess, struct, stat, re, tempfile
7 from bup.helpers import *
8 from bup import _helpers
# Default repository location used when BUP_DIR is not set (see guess_repo).
home_repodir = os.path.expanduser('~/.bup')

# Git pack object type codes <-> names, per the pack file format.
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
class GitError(Exception):
    """Raised for errors encountered while operating on the git repository."""

# Fragment of repo(sub=''): its 'def' line is elided in this listing.
    """Get the path to the git repository or one of its subdirectories."""
    # NOTE(review): guard condition elided; presumably fires when the global
    # 'repodir' is unset.
    raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):
        # (assignment elided in this listing)
    return os.path.join(repodir, sub)
def auto_midx(objdir):
    # Re-run bup itself ('bup midx --auto') to regenerate .midx files for objdir.
    main_exe = os.environ.get('BUP_MAIN_EXE') or sys.argv[0]
    args = [main_exe, 'midx', '--auto', '--dir', objdir]
    # Discard the subcommand's stdout; only its exit status matters here.
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # NOTE(review): the nonzero-exit guard line is elided in this listing.
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    # A regular file stored as a non-regular git object means it was chunked.
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # (return of mangled name elided in this listing)
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        # (return of disambiguated name elided in this listing)
# Modes returned by demangle_name() below.
(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    # Unmangled names pass through untouched.
    return (name, BUP_NORMAL)
def _encode_packobj(type, content):
    # Pack object header byte: low 4 size bits plus the type code in bits 4-6.
    # NOTE(review): 'sz' computation and size-continuation bytes are elided here.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    z = zlib.compressobj(1)  # level 1: favor speed over ratio
    yield z.compress(content)
def _encode_looseobj(type, content):
    """Yield zlib-compressed chunks of a loose object: '<type> <len>\\0' + body."""
    z = zlib.compressobj(1)  # level 1: favor speed over ratio
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    # NOTE(review): the trailing flush of the compressor is elided in this listing.
def _decode_looseobj(buf):
    """Decompress a loose object and split it into its (type, content) pair."""
    s = zlib.decompress(buf)
    # (header-parsing lines elided in this listing)
    assert(type in _typemap)
    assert(sz == len(content))  # declared size must match the payload
    return (type, content)
def _decode_packobj(buf):
    # First byte carries the type in bits 4-6 and the low size bits in 0-3.
    type = _typermap[(c & 0x70) >> 4]
    # Variable-length size: 7 bits per continuation byte, shifted into place.
    sz |= (c & 0x7f) << shift
    # Everything after the header bytes is the zlib-compressed payload.
    return (type, zlib.decompress(buf[i+1:]))
# NOTE(review): the 'class PackIdx' header line is elided in this listing.
"""Object representation of a Git pack index file."""
def __init__(self, filename):
    # NOTE(review): self.name assignment elided; idxnames mirrors PackMidx.
    self.idxnames = [self.name]
    self.map = mmap_read(open(filename))
    # .idx v2 magic ('\377tOc') plus version number 2.
    assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
    # 256-entry fanout table of cumulative object counts, network byte order.
    self.fanout = list(struct.unpack('!256I',
                                     str(buffer(self.map, 8, 256*4))))
    self.fanout.append(0) # entry "-1"
    nsha = self.fanout[255]  # total number of objects in the index
    # 32-bit offset table sits after the fanout, sha table and crc table.
    self.ofstable = buffer(self.map,
                           8 + 256*4 + nsha*20 + nsha*4,
    # 64-bit offset spill table for packs larger than 2 GiB.
    self.ofs64table = buffer(self.map,
                             8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
def _ofs_from_idx(self, idx):
    """Look up the pack-file offset for sha-table entry 'idx'."""
    ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
    # High bit set means the real offset lives in the 64-bit spill table.
    idx64 = ofs & 0x7fffffff
    # NOTE(review): unpacking an 8-byte slice with '!I' looks wrong — '!Q'
    # would be expected for a 64-bit offset; verify against the real source.
    ofs = struct.unpack('!I',
                        str(buffer(self.ofs64table, idx64*8, 8)))[0]
def _idx_from_hash(self, hash):
    global _total_searches, _total_steps
    assert(len(hash) == 20)  # binary sha1, not hex
    # NOTE(review): computation of first byte 'b1' elided; the fanout table
    # bounds the binary search range for shas starting with that byte.
    start = self.fanout[b1-1] # range -1..254
    end = self.fanout[b1] # range 0..255
    buf = buffer(self.map, 8 + 256*4, end*20)
    _total_steps += 1 # lookup table is a step
    # Binary search over the sorted sha table within [start, end).
    mid = start + (end-start)/2
    v = str(buf[mid*20:(mid+1)*20])
def find_offset(self, hash):
    """Get the offset of an object inside the index file."""
    idx = self._idx_from_hash(hash)
    # (not-found handling elided in this listing)
    return self._ofs_from_idx(idx)
def exists(self, hash):
    """Return True if 'hash' (a binary sha1) is present in this index,
    otherwise a falsy value (None)."""
    # Idiom fix: explicit conditional with 'is not None' instead of the
    # original '!= None' comparison chained through an and/or trick.
    if hash and self._idx_from_hash(hash) is not None:
        return True
    return None
# Fragments of PackIdx.__iter__ and __len__ ('def' lines elided in this listing).
# __iter__: yield each 20-byte sha as a zero-copy buffer over the mmap.
    for i in xrange(self.fanout[255]):
        yield buffer(self.map, 8 + 256*4 + 20*i, 20)

# __len__: last fanout entry is the total object count.
    return int(self.fanout[255])

# C-accelerated helper: take the top bits of a sha (used by midx lookups).
extract_bits = _helpers.extract_bits
# NOTE(review): the 'class PackMidx' header line is elided in this listing.
"""Wrapper which contains data from multiple index files.
Multiple index (.midx) files constitute a wrapper around index (.idx) files
and make it possible for bup to expand Git's indexing capabilities to vast
"""
def __init__(self, filename):
    # force_keep marks files that should never be auto-deleted by refresh().
    self.force_keep = False
    assert(filename.endswith('.midx'))
    self.map = mmap_read(open(filename))
    if str(self.map[0:4]) != 'MIDX':
        log('Warning: skipping: invalid MIDX header in %r\n' % filename)
        self.force_keep = True
        return self._init_failed()
    ver = struct.unpack('!I', self.map[4:8])[0]
    if ver < MIDX_VERSION:
        log('Warning: ignoring old-style (v%d) midx %r\n'
        self.force_keep = False # old stuff is boring
        return self._init_failed()
    if ver > MIDX_VERSION:
        log('Warning: ignoring too-new (v%d) midx %r\n'
        self.force_keep = True # new stuff is exciting
        return self._init_failed()

    # Header is valid: map out the fanout, sha list and source idx names.
    self.bits = _helpers.firstword(self.map[8:12])
    self.entries = 2**self.bits
    self.fanout = buffer(self.map, 12, self.entries*4)
    shaofs = 12 + self.entries*4
    nsha = self._fanget(self.entries-1)  # total object count
    self.shalist = buffer(self.map, shaofs, nsha*20)
    # Trailing NUL-separated list of the .idx files this midx covers.
    self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
def _init_failed(self):
    # Leave the object in a safe, empty state after rejecting a bad midx file.
    self.fanout = buffer('\0\0\0\0')
    self.shalist = buffer('\0'*20)

def _fanget(self, i):
    # NOTE(review): computation of 'start' (byte offset i*4) elided here.
    s = self.fanout[start:start+4]
    return _helpers.firstword(s)

# Fragment of _get(i): its 'def' line is elided in this listing.
# Returns the i'th 20-byte sha from the concatenated sha list.
    return str(self.shalist[i*20:(i+1)*20])
def exists(self, hash):
    """Return nonempty if the object exists in the index files."""
    global _total_searches, _total_steps
    # Use the top self.bits bits of the sha to pick a fanout bucket.
    el = extract_bits(want, self.bits)
    start = self._fanget(el-1)
    startv = el << (32-self.bits)
    end = self._fanget(el)
    endv = (el+1) << (32-self.bits)
    _total_steps += 1 # lookup table is a step
    hashv = _helpers.firstword(hash)
    #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
    # Interpolation search: guess a position proportional to where hashv
    # falls between startv and endv, then tighten the bounds.
    #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
    mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
    #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
    #print ' %08x' % self._num(v)
    startv = _helpers.firstword(v)
    endv = _helpers.firstword(v)
# Fragments of PackMidx.__iter__ and __len__ ('def' lines elided in this listing).
# __iter__: yield each 20-byte sha as a zero-copy buffer over the sha list.
    for i in xrange(self._fanget(self.entries-1)):
        yield buffer(self.shalist, i*20, 20)

# __len__: last fanout entry is the total object count.
    return int(self._fanget(self.entries-1))
# NOTE(review): the 'class PackIdxList' header line is elided in this listing.
def __init__(self, dir):
    # (global counter declaration elided in this listing)
    assert(_mpi_count == 0) # these things suck tons of VM; don't waste it

# Fragment of __del__ ('def' line elided): the singleton count drops back to 0.
    assert(_mpi_count == 0)

# Fragment of __iter__: merged iteration over all contained indexes.
    return iter(idxmerge(self.packs))

# Fragment of __len__: total objects across all contained indexes.
    return sum(len(pack) for pack in self.packs)
def exists(self, hash):
    """Return nonempty if the object exists in the index files."""
    global _total_searches
    # 'also' holds extra object ids injected via add() (see below).
    if hash in self.also:
        # (return elided in this listing)
    for i in range(len(self.packs)):
        # (per-pack lookup lines elided in this listing)
        _total_searches -= 1 # will be incremented by sub-pack
        # reorder so most recently used packs are searched first
        self.packs = [p] + self.packs[:i] + self.packs[i+1:]
def refresh(self, skip_midx = False):
    """Refresh the index list.
    This method verifies if .midx files were superseded (e.g. all of its
    contents are in another, bigger .midx file) and removes the superseded

    If skip_midx is True, all work on .midx files will be skipped and .midx
    files will be removed from the list.

    The module-global variable 'ignore_midx' can force this function to
    always act as if skip_midx was True.
    """
    skip_midx = skip_midx or ignore_midx
    # Start from the packs we already have, dropping midx files if skipping.
    d = dict((p.name, p) for p in self.packs
             if not skip_midx or not isinstance(p, PackMidx))
    if os.path.exists(self.dir):
        # Map every .idx already covered by a loaded midx to that midx.
        for ix in self.packs:
            if isinstance(ix, PackMidx):
                for name in ix.idxnames:
                    d[os.path.join(self.dir, name)] = ix
        # Load any .midx files on disk that we have not seen yet.
        for f in os.listdir(self.dir):
            full = os.path.join(self.dir, f)
            if f.endswith('.midx') and not d.get(full):
                # NOTE(review): construction of 'mx' elided in this listing.
                (mxd, mxf) = os.path.split(mx.name)
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n' +
                             ' used by %s\n') % (n, mxf))
        # Prefer bigger midx files: sort by descending entry count.
        midxl.sort(lambda x,y: -cmp(len(x),len(y)))
        for sub in ix.idxnames:
            found = d.get(os.path.join(self.dir, sub))
            if not found or isinstance(found, PackIdx):
                # doesn't exist, or exists but not in a midx
                for name in ix.idxnames:
                    d[os.path.join(self.dir, name)] = ix
        # A midx fully covered elsewhere is redundant unless force_keep is set.
        if not any and not ix.force_keep:
            debug1('midx: removing redundant: %s\n'
                   % os.path.basename(ix.name))
        # Finally, pick up any loose .idx files not covered by a midx.
        for f in os.listdir(self.dir):
            full = os.path.join(self.dir, f)
            if f.endswith('.idx') and not d.get(full):
                # (PackIdx load elided in this listing)
    self.packs = list(set(d.values()))
    debug1('PackIdxList: using %d index%s.\n'
           % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
# Fragment of add(hash) ('def' line elided): records an id in self.also.
    """Insert an additional object in the list."""

# Fragment of zap_also() ('def' line elided): clears self.also.
    """Remove all additional objects from the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git object ids are sha1('<type> <len>\0' + content).
    header = '%s %d\0' % (type, len(content))
    # (sha1 update/digest lines elided in this listing)
def _shalist_sort_key(ent):
    # Sort key for tree entries; git treats directories specially when sorting.
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):  # mode is an octal string
        # (directory-specific key elided in this listing)
def open_idx(filename):
    """Open a pack index, choosing PackIdx or PackMidx by file extension."""
    if filename.endswith('.idx'):
        return PackIdx(filename)
    elif filename.endswith('.midx'):
        return PackMidx(filename)
    # Anything else is not a recognized index file.
    raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    total = sum(len(i) for i in idxlist)
    iters = (iter(i) for i in idxlist)
    # Heap of (next_sha, iterator) pairs merges the already-sorted indexes.
    heap = [(next(it), it) for it in iters]
    # (heapify and main merge loop partially elided in this listing)
    if (count % 10024) == 0:  # periodic progress, roughly every 10k objects
        progress('Reading indexes: %.2f%% (%d/%d)\r'
                 % (count*100.0/total, count, total))
    heapq.heapreplace(heap, (e, it))
    # NOTE(review): presumably gated on final_progress; guard elided here.
    log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
# NOTE(review): the 'class PackWriter' header line is elided in this listing.
"""Writes Git objects inside a pack file."""
def __init__(self, objcache_maker=None):
    # (counter/file-state initialization elided in this listing)
    # Optional factory producing the dedup object cache (see _make_objcache).
    self.objcache_maker = objcache_maker
def _make_objcache(self):
    # Lazily build the object cache used to avoid writing duplicate objects.
    if self.objcache == None:
        if self.objcache_maker:
            self.objcache = self.objcache_maker()
            # NOTE(review): an 'else' line is elided here; the line below is
            # the default cache over the repository's own pack indexes.
            self.objcache = PackIdxList(repo('objects/pack'))
# Fragment of _open(): its 'def' line is elided in this listing.
    self._make_objcache()
    # Create the temporary pack file inside the repository's objects dir.
    (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
    self.file = os.fdopen(fd, 'w+b')
    assert(name.endswith('.pack'))
    self.filename = name[:-5]  # base name without the '.pack' suffix
    # Pack header: 'PACK' magic, version 2, object count 0 (patched in _end).
    self.file.write('PACK\0\0\0\2\0\0\0\0')
def _raw_write(self, datalist):
    # (file handle retrieval elided in this listing)
    # in case we get interrupted (eg. KeyboardInterrupt), it's best if
    # the file never has a *partial* blob. So let's make sure it's
    # all-or-nothing. (The blob shouldn't be very big anyway, thanks
    # to our hashsplit algorithm.) f.write() does its own buffering,
    # but that's okay because we'll flush it in _end().
    oneblob = ''.join(datalist)
    # (single f.write call elided in this listing)
    self.outbytes += len(oneblob)
def _write(self, bin, type, content):
    # (bookkeeping lines elided in this listing)
    # Encode the object in pack format and append it to the pack file.
    self._raw_write(_encode_packobj(type, content))

def breakpoint(self):
    """Clear byte and object counts and return the last processed id."""
    # (capture of the last id elided in this listing)
    self.outbytes = self.count = 0
def write(self, type, content):
    """Write an object in this pack file."""
    # Hash first, then delegate the actual packing to _write().
    sha = calc_hash(type, content)
    return self._write(sha, type, content)
def exists(self, id):
    """Return non-empty if an object is found in the object cache."""
    # Build the object cache lazily on first use, then consult it.
    if not self.objcache:
        self._make_objcache()
    cache = self.objcache
    return cache.exists(id)
def maybe_write(self, type, content):
    """Write an object to the pack file if not present and return its id."""
    bin = calc_hash(type, content)
    if not self.exists(bin):
        self._write(bin, type, content)
        # Remember the new object so later duplicates are skipped.
        self.objcache.add(bin)
    # NOTE(review): the 'return bin' line is elided in this listing.
def new_blob(self, blob):
    """Create a blob object in the pack with the supplied content."""
    # Delegate to maybe_write so a duplicate blob is stored only once.
    sha = self.maybe_write('blob', blob)
    return sha
def new_tree(self, shalist):
    """Create a tree object in the pack."""
    # Git requires tree entries in its particular sort order.
    shalist = sorted(shalist, key = _shalist_sort_key)
    # (accumulator initialization elided in this listing)
    for (mode,name,bin) in shalist:
        # Mode strings must not start with '0' (no zero padding).
        assert(mode[0] != '0')
        assert(len(bin) == 20)  # binary sha1, not hex
        l.append('%s %s\0%s' % (mode,name,bin))
    return self.maybe_write('tree', ''.join(l))
def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
    # Build the commit object line by line; each header field is optional.
    if tree: l.append('tree %s' % tree.encode('hex'))
    if parent: l.append('parent %s' % parent.encode('hex'))
    if author: l.append('author %s %s' % (author, _git_date(adate)))
    if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
    # (blank separator line and message lines elided in this listing)
    return self.maybe_write('commit', '\n'.join(l))
def new_commit(self, parent, tree, date, msg):
    """Create a commit object in the pack."""
    # Author and committer are both the current user with the same timestamp.
    userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
    commit = self._new_commit(tree, parent,
                              userline, date, userline, date,
    # NOTE(review): remaining arguments and the return are elided here.
# Fragment of abort() ('def' line elided): throw away the in-progress pack.
    """Remove the pack file from disk."""
    # (file close lines elided in this listing)
    os.unlink(self.filename + '.pack')

# Fragment of _end() ('def' line elided): finalize, index and install the pack.
    if not f: return None  # nothing was ever opened/written

    # update object count
    cp = struct.pack('!i', self.count)

    # calculate the pack sha1sum
    f.write(sum.digest())

    # Ask git to build the .idx; its stdout names the resulting pack.
    p = subprocess.Popen(['git', 'index-pack', '-v',
                          self.filename + '.pack'],
                         preexec_fn = _gitenv,
                         stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    _git_wait('git index-pack', p)
    # NOTE(review): the empty-output guard line is elided in this listing.
    raise GitError('git index-pack produced no output')
    nameprefix = repo('objects/pack/%s' % out)
    if os.path.exists(self.filename + '.map'):
        os.unlink(self.filename + '.map')
    # Move the finished pack and index to their permanent names.
    os.rename(self.filename + '.pack', nameprefix + '.pack')
    os.rename(self.filename + '.idx', nameprefix + '.idx')
    auto_midx(repo('objects/pack'))

# Fragment of close() ('def' line elided): docstring only; body elided.
    """Close the pack file and move it to its definitive path."""

# Fragment of _git_date(date) ('def' line elided): git's timestamp format.
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))

# Fragment of _gitenv() ('def' line elided): point git at our repository.
    os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    # (optional refname argument append elided in this listing)
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    # (error/empty-output checks elided in this listing)
    for d in out.split('\n'):
        # show-ref output lines are '<hex-sha> <refname>'.
        (sha, name) = d.split(' ', 1)
        yield (name, sha.decode('hex'))
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    # (selection of the single matching result elided in this listing)
def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form

    If count is a non-zero integer, limit the number of commits to "count"
    """
    # A leading '-' would let 'ref' be parsed as a rev-list option.
    assert(not ref.startswith('-'))
    opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    # (output-parsing loop partially elided in this listing)
    if s.startswith('commit '):
        commit = s[7:].decode('hex')
    rv = p.wait() # not fatal
    # NOTE(review): the nonzero-exit guard line is elided in this listing.
    raise GitError, 'git rev-list returned error %d' % rv
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    for (date, commit) in rev_list(ref, count=1):
        # (return of 'date' elided in this listing)
    # No commits reachable from 'ref' at all.
    raise GitError, 'no such commit %r' % ref
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    # (oldval defaulting elided in this listing)
    # Only branch heads may be updated by this helper.
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    """
    # Fall back to $BUP_DIR, then to ~/.bup, when no explicit path is given.
    repodir = os.environ.get('BUP_DIR')
    # NOTE(review): a guard line is elided between these two assignments.
    repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the target path exists but is not a directory.
    """
    # (repodir setup lines elided in this listing; 'd' is the repo path)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # Bug fix: 'd' is a path string, so the format must be %s, not %d.
        raise GitError('"%s" exists but is not a directory\n' % d)
    # Let git itself create the bare repository skeleton.
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    # A valid repo must contain an objects/pack directory.
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            # (automatic init of the default repository elided in this listing)
        log('error: %r is not a bup/git repository\n' % repo())
# Fragment of treeparse(buf): its 'def' line is elided in this listing.
    """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
    while ofs < len(buf):
        # Each tree entry is '<mode> <name>\0' followed by a 20-byte raw sha1.
        z = buf[ofs:].find('\0')
        spl = buf[ofs:ofs+z].split(' ', 1)
        assert(len(spl) == 2)
        sha = buf[ofs+z+1:ofs+z+1+20]
        # (advance of 'ofs' past this entry elided in this listing)
        yield (spl[0], spl[1], sha)
# Fragment of the git version check: its 'def' line is elided in this listing.
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:
    """
    p = subprocess.Popen(['git', '--version'],
                         stdout=subprocess.PIPE)
    gvs = p.stdout.read()
    _git_wait('git --version', p)
    # NOTE(review): the '.' in this pattern is unescaped and matches any
    # character; it works in practice but '\.' would be stricter.
    m = re.match(r'git version (\S+.\S+)', gvs)
    # NOTE(review): the match-failure guard line is elided in this listing.
    raise GitError('git --version weird output: %r' % gvs)
    _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    # NOTE(review): version-comparison guard elided; note that comparing
    # digit *strings* tuple-wise is lexicographic (e.g. '10' < '5').
    raise GitError('git version %s or higher is required; you have %s'
                   % ('.'.join(needed), '.'.join(_ver)))
def _git_wait(cmd, p):
    # Wait for subprocess 'p' to finish; raise if it exited nonzero.
    # (p.wait() call and guard elided in this listing)
    raise GitError('%s returned %d' % (cmd, rv))

def _git_capture(argv):
    # Run a git command with our repo environment and capture its stdout.
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    # (stdout read and return elided in this listing)
    _git_wait(repr(argv), p)
class _AbortableIter:
    """Iterator wrapper that can be aborted mid-iteration via a callback."""
    def __init__(self, it, onabort = None):
        # (remaining attribute initialization elided in this listing)
        self.onabort = onabort

    # Fragment of next() ('def' line elided): delegate to the wrapped iterator.
        return self.it.next()
    except StopIteration, e:

    # Fragment of abort() ('def' line elided): docstring only; body elided.
        """Abort iteration and call the abortion callback, if needed."""
# NOTE(review): the 'class CatPipe' header line is elided in this listing.
"""Link to 'git cat-file' that is used to retrieve blob data."""
# Fragment of __init__: choose fast (batch pipe) or slow mode by git version.
    wanted = ('1','5','6')  # 'git cat-file --batch' needs at least this
    # (version comparison elided in this listing)
    log('warning: git version < %s; bup will be slow.\n'
    self.get = self._slow_get
    # Modern git: keep a persistent batch subprocess around.
    self.p = self.inprogress = None
    self.get = self._fast_get

# Fragment of _abort() ('def' line elided): tear down the batch subprocess.
    self.p.stdout.close()
    self.inprogress = None

# Fragment of _restart() ('def' line elided): spawn 'git cat-file --batch'.
    self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              preexec_fn = _gitenv)
def _fast_get(self, id):
    # Restart the cat-file subprocess if it is missing or has died.
    if not self.p or self.p.poll() != None:
        # (restart call elided in this listing)
    assert(self.p.poll() == None)
    # Only one object may be streamed over the batch pipe at a time.
    log('_fast_get: opening %r while %r is open'
        % (id, self.inprogress))
    assert(not self.inprogress)
    assert(id.find('\n') < 0)  # ids travel on a single protocol line
    assert(id.find('\r') < 0)
    # Request the object; the reply header is '<sha> <type> <size>'.
    self.p.stdin.write('%s\n' % id)
    hdr = self.p.stdout.readline()
    if hdr.endswith(' missing\n'):
        raise KeyError('blob %r is missing' % id)
    if len(spl) != 3 or len(spl[0]) != 40:
        raise GitError('expected blob, got %r' % spl)
    (hex, type, size) = spl

    # Stream the payload; abort the pipe cleanly if the consumer bails out.
    it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                        onabort = self._abort)
    # The batch protocol terminates each object with a blank line.
    assert(self.p.stdout.readline() == '\n')
    self.inprogress = None
def _slow_get(self, id):
    assert(id.find('\n') < 0)  # ids are passed on the command line
    assert(id.find('\r') < 0)
    # One subprocess per request: first ask for the type, then stream content.
    type = _git_capture(['git', 'cat-file', '-t', id]).strip()
    p = subprocess.Popen(['git', 'cat-file', type, id],
                         stdout=subprocess.PIPE,
                         preexec_fn = _gitenv)
    for blob in chunkyreader(p.stdout):
        # (yield elided in this listing)
    _git_wait('git cat-file', p)
# Fragment of _join(it) ('def' line elided): recursively yield all blob
# contents reachable from the object whose data is streaming in 'it'.
    # Tree: parse entries and recurse into each child object.
    treefile = ''.join(it)
    for (mode, name, sha) in treeparse(treefile):
        for blob in self.join(sha.encode('hex')):
            # (yield elided in this listing)
    elif type == 'commit':
        # A commit's first line is 'tree <sha>'; recurse into that tree.
        treeline = ''.join(it).split('\n')[0]
        assert(treeline.startswith('tree '))
        for blob in self.join(treeline[5:]):
            # (yield elided in this listing)
    raise GitError('invalid object type %r: expected blob/tree/commit'

# Fragment of join(id) ('def' line elided):
    """Generate a list of the content of all blobs that can be reached
    from an object. The hash given in 'id' must point to a blob, a tree
    or a commit. The content of all blobs that can be seen from trees or
    commits will be added to the list.
    """
    for d in self._join(self.get(id)):
        # (yield elided in this listing)
    except StopIteration: