1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, zlib, time, subprocess, struct, stat, re, tempfile
7 from bup.helpers import *
8 from bup import _helpers

MIDX_VERSION = 2  # .midx format version parsed below (assumed constant value)

repodir = None  # default repository path; set by check_repo_or_die()
home_repodir = os.path.expanduser('~/.bup')

_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

_total_searches = 0
_total_steps = 0

ignore_midx = 0  # when true, refresh() always acts as if skip_midx were set


class GitError(Exception):
    pass


def repo(sub = ''):
    """Get the path to the git repository or one of its subdirectories."""
    global repodir
    if not repodir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):
        repodir = gd

    return os.path.join(repodir, sub)


def auto_midx(objdir):
    main_exe = os.environ.get('BUP_MAIN_EXE') or sys.argv[0]
    args = [main_exe, 'midx', '--auto', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    if rv:
        add_error('%r: returned %d' % (args, rv))


def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
    else:
        return name


(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of
    the following:

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
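
# Example of the convention above (informal): a regular file that bup stored
# chunked (i.e. as a git tree) gets '.bup' appended, and demangling undoes it:
#
#   >>> mangle_name('report.txt', 0100644, 040000)
#   'report.txt.bup'
#   >>> demangle_name('report.txt.bup')
#   ('report.txt', BUP_CHUNKED)
#   >>> demangle_name('notes.bupl')   # a real file that was named 'notes.bup'
#   ('notes.bup', BUP_NORMAL)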


def _encode_packobj(type, content):
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    szout = ''
    while 1:
        if sz: szbits |= 0x80   # high bit: more size bytes follow
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(1)
    yield szout
    yield z.compress(content)
    yield z.flush()


def _encode_looseobj(type, content):
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()


def _decode_looseobj(buf):
    s = zlib.decompress(buf)
    # header is '<type> <size>\0' followed by the content
    (hdr, content) = s.split('\0', 1)
    (type, sz) = hdr.split(' ', 1)
    sz = int(sz)
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
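
# Loose objects use the same framing git itself uses: 'type SP size NUL
# content', zlib-deflated.  Round trip (informal):
#
#   >>> _decode_looseobj(''.join(_encode_looseobj('blob', 'abcd')))
#   ('blob', 'abcd')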


def _decode_packobj(buf):
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz, shift, i = c & 0x0f, 4, 0
    while c & 0x80:                 # high bit: more size bytes follow
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
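
# Worked example of the header layout consumed above, for a 100000-byte blob:
#
#   byte 0: 0xb0 = 1 011 0000  -> continue bit, type 3 ('blob'), size bits 0..3
#   byte 1: 0xea = 1 1101010   -> continue bit, size bits 4..10
#   byte 2: 0x30 = 0 0110000   -> last byte,    size bits 11..17
#
#   (0x30 << 11) | (0x6a << 4) | 0x0 == 100000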
141 """Object representation of a Git pack index file."""
142 def __init__(self, filename):
144 self.idxnames = [self.name]
145 self.map = mmap_read(open(filename))
146 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
147 self.fanout = list(struct.unpack('!256I',
148 str(buffer(self.map, 8, 256*4))))
149 self.fanout.append(0) # entry "-1"
150 nsha = self.fanout[255]
151 self.ofstable = buffer(self.map,
152 8 + 256*4 + nsha*20 + nsha*4,
154 self.ofs64table = buffer(self.map,
155 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
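
    # Layout of a v2 .idx, as mapped above: 8-byte header, 256-entry fanout
    # table (fanout[b] = number of shas whose first byte is <= b), nsha
    # 20-byte shas, nsha 4-byte CRCs, nsha 4-byte offsets, then a table of
    # 8-byte offsets for entries that don't fit in 31 bits.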

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        if ofs & 0x80000000:
            # high bit set: this is an index into the 64-bit offset table
            idx64 = ofs & 0x7fffffff
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        return ofs

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = ord(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        buf = buffer(self.map, 8 + 256*4, end*20)
        want = str(hash)
        _total_steps += 1 # lookup table is a step
        # plain binary search over the sorted sha table
        while start < end:
            _total_steps += 1
            mid = start + (end-start)/2
            v = str(buf[mid*20:(mid+1)*20])
            if v < want:
                start = mid+1
            elif v > want:
                end = mid
            else: # got it!
                return mid
        return None

    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx is not None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash):
        """Return nonempty if the object exists in this index."""
        if hash and self._idx_from_hash(hash) is not None:
            return True
        return None

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)

    def __len__(self):
        return int(self.fanout[255])


extract_bits = _helpers.extract_bits
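
# extract_bits(buf, nbits) (from the C helper) returns the first nbits of
# buf, read big-endian, as an unsigned int; PackMidx below uses it to turn
# the top bits of a sha into a fanout-table slot.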
210 """Wrapper which contains data from multiple index files.
211 Multiple index (.midx) files constitute a wrapper around index (.idx) files
212 and make it possible for bup to expand Git's indexing capabilities to vast
    def __init__(self, filename):
        self.name = filename
        self.force_keep = False
        assert(filename.endswith('.midx'))
        self.map = mmap_read(open(filename))
        if str(self.map[0:4]) != 'MIDX':
            log('Warning: skipping: invalid MIDX header in %r\n' % filename)
            self.force_keep = True
            return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < MIDX_VERSION:
            log('Warning: ignoring old-style (v%d) midx %r\n'
                % (ver, filename))
            self.force_keep = False  # old stuff is boring
            return self._init_failed()
        if ver > MIDX_VERSION:
            log('Warning: ignoring too-new (v%d) midx %r\n'
                % (ver, filename))
            self.force_keep = True  # new stuff is exciting
            return self._init_failed()

        self.bits = _helpers.firstword(self.map[8:12])
        self.entries = 2**self.bits
        self.fanout = buffer(self.map, 12, self.entries*4)
        shaofs = 12 + self.entries*4
        nsha = self._fanget(self.entries-1)
        self.shalist = buffer(self.map, shaofs, nsha*20)
        self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
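
    # .midx layout, as mapped above: 'MIDX', a 4-byte version, a 4-byte bit
    # count, a fanout table of 2**bits cumulative counts, the sorted 20-byte
    # shas, then the NUL-separated names of the .idx files it covers.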

    def _init_failed(self):
        self.bits = 0
        self.entries = 1
        self.fanout = buffer('\0\0\0\0')
        self.shalist = buffer('\0'*20)
        self.idxnames = []

    def _fanget(self, i):
        start = i*4
        s = self.fanout[start:start+4]
        return _helpers.firstword(s)

    def _get(self, i):
        return str(self.shalist[i*20:(i+1)*20])

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches, _total_steps
        _total_searches += 1
        want = str(hash)
        el = extract_bits(want, self.bits)
        if el:
            start = self._fanget(el-1)
            startv = el << (32-self.bits)
        else:
            start = 0
            startv = 0
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1  # lookup table is a step
        hashv = _helpers.firstword(hash)
        # Interpolation search: guess a position proportional to where the
        # wanted hash falls between the current bounds, instead of bisecting.
        while start < end:
            _total_steps += 1
            mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
            v = self._get(mid)
            if v < want:
                start = mid+1
                startv = _helpers.firstword(v)
            elif v > want:
                end = mid
                endv = _helpers.firstword(v)
            else: # got it!
                return True
        return None

    def __iter__(self):
        for i in xrange(self._fanget(self.entries-1)):
            yield buffer(self.shalist, i*20, 20)

    def __len__(self):
        return int(self._fanget(self.entries-1))
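
# A worked example of PackMidx.exists()'s interpolation step: with bits=4 and
# a wanted hash whose first word is 0x6f000000, el is 6, so the scan starts
# between startv=0x60000000 and endv=0x70000000, and the first guess lands
# about (0x6f000000-0x60000000)/0x10000000 = 15/16 of the way through that
# fanout slot, rather than in the middle as binary search would.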


_mpi_count = 0

class PackIdxList:
    def __init__(self, dir):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = {}
        self.packs = []
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1 # will be incremented by sub-pack
            if p.exists(hash):
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return p.name
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of their
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        skip_midx = skip_midx or ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                for ix in self.packs:
                    if isinstance(ix, PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for f in os.listdir(self.dir):
                    full = os.path.join(self.dir, f)
                    if f.endswith('.midx') and not d.get(full):
                        mx = PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = 0
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     '  used by %s\n') % (n, mxf))
                                broken += 1
                        if not broken:
                            midxl.append(mx)
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))  # biggest first
                for ix in midxl:
                    any = 0
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            d[ix.name] = ix
                            for name in ix.idxnames:
                                d[os.path.join(self.dir, name)] = ix
                            any += 1
                            break
                    if not any and not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        unlink(ix.name)
            for f in os.listdir(self.dir):
                full = os.path.join(self.dir, f)
                if f.endswith('.idx') and not d.get(full):
                    d[full] = PackIdx(full)
        self.packs = list(set(d.values()))
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
399 """Insert an additional object in the list."""
403 """Remove all additional objects from the list."""


def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = '%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
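
# Informal sanity check: this matches git's own object ids.  For example,
# calc_hash('blob', 'foo\n').encode('hex') is
# '257cc5642cb1a054f08cc83f2d943e56fd3ebe99', the same id that
# `echo foo | git hash-object --stdin` prints.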


def _shalist_sort_key(ent):
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):
        # git sorts tree entries as if directory names ended with '/'
        return name + '/'
    else:
        return name


def open_idx(filename):
    if filename.endswith('.idx'):
        return PackIdx(filename)
    elif filename.endswith('.midx'):
        return PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')


def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    total = sum(len(i) for i in idxlist)
    iters = (iter(i) for i in idxlist)
    # next() here is bup.helpers.next(), which returns None when exhausted
    heap = [(next(it), it) for it in iters]
    heapq.heapify(heap)
    count = 0
    last = None
    while heap:
        if (count % 10024) == 0:
            progress('Reading indexes: %.2f%% (%d/%d)\r'
                     % (count*100.0/total, count, total))
        (e, it) = heap[0]
        if e != last:
            yield e
            last = e
        count += 1
        e = next(it)
        if e:
            heapq.heapreplace(heap, (e, it))
        else:
            heapq.heappop(heap)
    if final_progress:
        log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
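
# idxmerge() is a k-way merge: each .idx/.midx iterates its shas in sorted
# order, so a heap of (next_sha, iterator) pairs yields the union in sorted
# order, while the `e != last` check above skips duplicates that appear in
# more than one index.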
459 """Writes Git objects insid a pack file."""
    def __init__(self, objcache_maker=None):
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.file = None
        self.objcache_maker = objcache_maker
        self.objcache = None

    def __del__(self):
        self.close()

    def _make_objcache(self):
        if self.objcache is None:
            if self.objcache_maker:
                self.objcache = self.objcache_maker()
            else:
                self.objcache = PackIdxList(repo('objects/pack'))

    def _open(self):
        if not self.file:
            self._make_objcache()
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            # pack header: 'PACK', version 2, object count (patched in _end)
            self.file.write('PACK\0\0\0\2\0\0\0\0')

    def _raw_write(self, datalist):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        f.write(oneblob)
        self.outbytes += len(oneblob)
        self.count += 1

    def _write(self, bin, type, content):
        self._raw_write(_encode_packobj(type, content))
        return bin

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def write(self, type, content):
        """Write an object in this pack file."""
        return self._write(calc_hash(type, content), type, content)

    def exists(self, id):
        """Return non-empty if an object is found in the object cache."""
        if not self.objcache:
            self._make_objcache()
        return self.objcache.exists(id)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        bin = calc_hash(type, content)
        if not self.exists(bin):
            self._write(bin, type, content)
            self.objcache.add(bin)
        return bin

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        shalist = sorted(shalist, key = _shalist_sort_key)
        l = []
        for (mode,name,bin) in shalist:
            assert(mode[0] != '0')   # git insists modes are not zero-padded
            assert(len(bin) == 20)
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))
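
    # Tree objects built above use git's raw entry encoding:
    # '<octal mode> <name>\0<20-byte binary sha>' per entry, concatenated in
    # git's sort order, e.g. '100644 hello.txt\0' + sha for a regular file
    # and '40000 lib\0' + sha for a subdirectory (note: no leading zero).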

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, msg):
        """Create a commit object in the pack."""
        now = time.time()
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, now, userline, now,
                                  msg)
        return commit
567 """Remove the pack file from disk."""
572 os.unlink(self.filename + '.pack')

    def _end(self):
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        while 1:
            b = f.read(65536)
            sum.update(b)
            if not b: break
        f.write(sum.digest())
        f.close()

        p = subprocess.Popen(['git', 'index-pack', '-v',
                              '--index-version=2',
                              self.filename + '.pack'],
                             preexec_fn = _gitenv,
                             stdout = subprocess.PIPE)
        out = p.stdout.read().strip()
        _git_wait('git index-pack', p)
        if not out:
            raise GitError('git index-pack produced no output')
        nameprefix = repo('objects/pack/%s' % out)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        auto_midx(repo('objects/pack'))
        return nameprefix
616 """Close the pack file and move it to its definitive path."""


def _git_date(date):
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
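
# _git_date() produces git's "seconds-since-epoch SP utc-offset" date format,
# e.g. _git_date(1262304000) -> '1262304000 -0800' on a host in US/Pacific.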


def _gitenv():
    os.environ['GIT_DIR'] = os.path.abspath(repo())


def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    if refname:
        argv += [refname]
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))


def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    else:
        return None


def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date,commit).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    assert(not ref.startswith('-'))
    opts = []
    if count:
        opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    commit = None
    for row in p.stdout:
        s = row.strip()
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        else:
            date = int(s)
            yield (date, commit)
    rv = p.wait() # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)
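
# The stream parsed above alternates 'commit <40 hex digits>' header lines
# with bare %ct timestamp lines, e.g.:
#
#   commit <40 hex digits>
#   1260476304
#
# so each header line sets `commit` and the following date line yields the pair.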


def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    for (date, commit) in rev_list(ref, count=1):
        return date
    raise GitError('no such commit %r' % ref)


def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    if not oldval:
        oldval = ''
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)


def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = os.environ.get('BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser('~/.bup')


def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)


def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    guess_repo(path)
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            init_repo()
        else:
            log('error: %r is not a bup/git repository\n' % repo())
            sys.exit(15)
750 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
752 while ofs < len(buf):
753 z = buf[ofs:].find('\0')
755 spl = buf[ofs:ofs+z].split(' ', 1)
756 assert(len(spl) == 2)
757 sha = buf[ofs+z+1:ofs+z+1+20]
759 yield (spl[0], spl[1], sha)
764 """Get Git's version and ensure a usable version is installed.
766 The returned version is formatted as an ordered tuple with each position
767 representing a digit in the version tag. For example, the following tuple
768 would represent version 1.6.6.9:
774 p = subprocess.Popen(['git', '--version'],
775 stdout=subprocess.PIPE)
776 gvs = p.stdout.read()
777 _git_wait('git --version', p)
778 m = re.match(r'git version (\S+.\S+)', gvs)
780 raise GitError('git --version weird output: %r' % gvs)
781 _ver = tuple(m.group(1).split('.'))
782 needed = ('1','5', '3', '1')
784 raise GitError('git version %s or higher is required; you have %s'
785 % ('.'.join(needed), '.'.join(_ver)))


def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%s returned %d' % (cmd, rv))


def _git_capture(argv):
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r


class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def next(self):
        try:
            return self.it.next()
        except StopIteration:
            self.done = True
            raise
        except:
            self.abort()
            raise

    def abort(self):
        """Abort iteration and call the on-abort callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()
834 """Link to 'git cat-file' that is used to retrieve blob data."""
837 wanted = ('1','5','6')
840 log('warning: git version < %s; bup will be slow.\n'
843 self.get = self._slow_get
845 self.p = self.inprogress = None
846 self.get = self._fast_get
850 self.p.stdout.close()
853 self.inprogress = None
857 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
858 stdin=subprocess.PIPE,
859 stdout=subprocess.PIPE,
861 preexec_fn = _gitenv)
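
    # The 'git cat-file --batch' protocol consumed by _fast_get() below:
    # write one object name per line to stdin; for each, stdout answers
    # '<sha> <type> <size>\n', then <size> bytes of content, then '\n'
    # (or '<name> missing\n' if the object doesn't exist).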

    def _fast_get(self, id):
        if not self.p or self.p.poll() is not None:
            self._restart()
        assert(self.p)
        assert(self.p.poll() is None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open'
                % (id, self.inprogress))
        assert(not self.inprogress)
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        try:
            yield type
            for blob in it:
                yield blob
            assert(self.p.stdout.readline() == '\n')
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _slow_get(self, id):
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        yield type

        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        type = it.next()
        if type == 'blob':
            for blob in it:
                yield blob
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif type == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % type)
931 """Generate a list of the content of all blobs that can be reached
932 from an object. The hash given in 'id' must point to a blob, a tree
933 or a commit. The content of all blobs that can be seen from trees or
934 commits will be added to the list.
937 for d in self._join(self.get(id)):
939 except StopIteration: