1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, zlib, time, subprocess, struct, stat, re, tempfile
7 from bup.helpers import *
8 from bup import _helpers
14 home_repodir = os.path.expanduser('~/.bup')
17 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
18 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
24 class GitError(Exception):
29 """Get the path to the git repository or one of its subdirectories."""
32 raise GitError('You should call check_repo_or_die()')
34 # If there's a .git subdirectory, then the actual repo is in there.
35 gd = os.path.join(repodir, '.git')
36 if os.path.exists(gd):
39 return os.path.join(repodir, sub)
def auto_midx(objdir):
    """Run 'bup midx --auto' against objdir to regenerate .midx files.

    Failures are reported via add_error() rather than raised, since midx
    regeneration is an optimization, not a correctness requirement.
    """
    main_exe = os.environ.get('BUP_MAIN_EXE') or sys.argv[0]
    args = [main_exe, 'midx', '--auto', '--dir', objdir]
    # Use os.devnull (portable) and close the handle when done; the
    # original opened '/dev/null' and leaked the file object.
    devnull = open(os.devnull, 'w')
    try:
        rv = subprocess.call(args, stdout=devnull)
    finally:
        devnull.close()
    if rv:
        add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    # A regular file stored under a non-regular git mode is a segmented
    # (chunked) file; mark it with ".bup".
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        return name + '.bup'
    # Names that already look mangled get ".bupl" so demangle_name() can
    # tell real files apart from segmented ones.
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
    else:
        return name
(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of
    the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        # ".bupl" marks a real file whose name happened to end in ".bup".
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
84 def _encode_packobj(type, content):
87 szbits = (sz & 0x0f) | (_typemap[type]<<4)
96 z = zlib.compressobj(1)
98 yield z.compress(content)
def _encode_looseobj(type, content):
    """Yield the zlib-compressed git loose-object encoding of the content.

    A loose object is the header '<type> <size>\\0' followed by the raw
    content, deflated as a single zlib stream.
    """
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    # Flush the compressor; without this the zlib stream is truncated and
    # the trailing data (and checksum) never gets emitted.
    yield z.flush()
109 def _decode_looseobj(buf):
111 s = zlib.decompress(buf)
118 assert(type in _typemap)
119 assert(sz == len(content))
120 return (type, content)
123 def _decode_packobj(buf):
126 type = _typermap[(c & 0x70) >> 4]
133 sz |= (c & 0x7f) << shift
137 return (type, zlib.decompress(buf[i+1:]))
144 def find_offset(self, hash):
145 """Get the offset of an object inside the index file."""
146 idx = self._idx_from_hash(hash)
148 return self._ofs_from_idx(idx)
151 def exists(self, hash):
152 """Return nonempty if the object exists in this index."""
153 return hash and (self._idx_from_hash(hash) != None) and True or None
156 return int(self.fanout[255])
158 def _idx_from_hash(self, hash):
159 global _total_searches, _total_steps
161 assert(len(hash) == 20)
163 start = self.fanout[b1-1] # range -1..254
164 end = self.fanout[b1] # range 0..255
166 _total_steps += 1 # lookup table is a step
169 mid = start + (end-start)/2
170 v = self._idx_to_hash(mid)
180 class PackIdxV1(PackIdx):
181 """Object representation of a Git pack index (version 1) file."""
182 def __init__(self, filename, f):
184 self.idxnames = [self.name]
185 self.map = mmap_read(f)
186 self.fanout = list(struct.unpack('!256I',
187 str(buffer(self.map, 0, 256*4))))
188 self.fanout.append(0) # entry "-1"
189 nsha = self.fanout[255]
190 self.shatable = buffer(self.map, 256*4, nsha*24)
192 def _ofs_from_idx(self, idx):
193 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
195 def _idx_to_hash(self, idx):
196 return str(self.shatable[idx*24+4 : idx*24+24])
199 for i in xrange(self.fanout[255]):
200 yield buffer(self.map, 256*4 + 24*i + 4, 20)
203 class PackIdxV2(PackIdx):
204 """Object representation of a Git pack index (version 2) file."""
205 def __init__(self, filename, f):
207 self.idxnames = [self.name]
208 self.map = mmap_read(f)
209 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
210 self.fanout = list(struct.unpack('!256I',
211 str(buffer(self.map, 8, 256*4))))
212 self.fanout.append(0) # entry "-1"
213 nsha = self.fanout[255]
214 self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
215 self.ofstable = buffer(self.map,
216 8 + 256*4 + nsha*20 + nsha*4,
218 self.ofs64table = buffer(self.map,
219 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
221 def _ofs_from_idx(self, idx):
222 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
224 idx64 = ofs & 0x7fffffff
225 ofs = struct.unpack('!I',
226 str(buffer(self.ofs64table, idx64*8, 8)))[0]
229 def _idx_to_hash(self, idx):
230 return str(self.shatable[idx*20:(idx+1)*20])
233 for i in xrange(self.fanout[255]):
234 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
237 extract_bits = _helpers.extract_bits
241 """Wrapper which contains data from multiple index files.
242 Multiple index (.midx) files constitute a wrapper around index (.idx) files
243 and make it possible for bup to expand Git's indexing capabilities to vast
246 def __init__(self, filename):
248 self.force_keep = False
249 assert(filename.endswith('.midx'))
250 self.map = mmap_read(open(filename))
251 if str(self.map[0:4]) != 'MIDX':
252 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
253 self.force_keep = True
254 return self._init_failed()
255 ver = struct.unpack('!I', self.map[4:8])[0]
256 if ver < MIDX_VERSION:
257 log('Warning: ignoring old-style (v%d) midx %r\n'
259 self.force_keep = False # old stuff is boring
260 return self._init_failed()
261 if ver > MIDX_VERSION:
262 log('Warning: ignoring too-new (v%d) midx %r\n'
264 self.force_keep = True # new stuff is exciting
265 return self._init_failed()
267 self.bits = _helpers.firstword(self.map[8:12])
268 self.entries = 2**self.bits
269 self.fanout = buffer(self.map, 12, self.entries*4)
270 shaofs = 12 + self.entries*4
271 nsha = self._fanget(self.entries-1)
272 self.shalist = buffer(self.map, shaofs, nsha*20)
273 self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
275 def _init_failed(self):
278 self.fanout = buffer('\0\0\0\0')
279 self.shalist = buffer('\0'*20)
282 def _fanget(self, i):
284 s = self.fanout[start:start+4]
285 return _helpers.firstword(s)
288 return str(self.shalist[i*20:(i+1)*20])
290 def exists(self, hash):
291 """Return nonempty if the object exists in the index files."""
292 global _total_searches, _total_steps
295 el = extract_bits(want, self.bits)
297 start = self._fanget(el-1)
298 startv = el << (32-self.bits)
302 end = self._fanget(el)
303 endv = (el+1) << (32-self.bits)
304 _total_steps += 1 # lookup table is a step
305 hashv = _helpers.firstword(hash)
306 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
309 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
310 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
311 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
313 #print ' %08x' % self._num(v)
316 startv = _helpers.firstword(v)
319 endv = _helpers.firstword(v)
325 for i in xrange(self._fanget(self.entries-1)):
326 yield buffer(self.shalist, i*20, 20)
329 return int(self._fanget(self.entries-1))
334 def __init__(self, dir):
336 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
346 assert(_mpi_count == 0)
349 return iter(idxmerge(self.packs))
352 return sum(len(pack) for pack in self.packs)
354 def exists(self, hash):
355 """Return nonempty if the object exists in the index files."""
356 global _total_searches
358 if hash in self.also:
360 for i in range(len(self.packs)):
362 _total_searches -= 1 # will be incremented by sub-pack
364 # reorder so most recently used packs are searched first
365 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
369 def refresh(self, skip_midx = False):
370 """Refresh the index list.
371 This method verifies if .midx files were superseded (e.g. all of its
372 contents are in another, bigger .midx file) and removes the superseded
375 If skip_midx is True, all work on .midx files will be skipped and .midx
376 files will be removed from the list.
378 The module-global variable 'ignore_midx' can force this function to
379 always act as if skip_midx was True.
381 skip_midx = skip_midx or ignore_midx
382 d = dict((p.name, p) for p in self.packs
383 if not skip_midx or not isinstance(p, PackMidx))
384 if os.path.exists(self.dir):
387 for ix in self.packs:
388 if isinstance(ix, PackMidx):
389 for name in ix.idxnames:
390 d[os.path.join(self.dir, name)] = ix
391 for f in os.listdir(self.dir):
392 full = os.path.join(self.dir, f)
393 if f.endswith('.midx') and not d.get(full):
395 (mxd, mxf) = os.path.split(mx.name)
397 for n in mx.idxnames:
398 if not os.path.exists(os.path.join(mxd, n)):
399 log(('warning: index %s missing\n' +
400 ' used by %s\n') % (n, mxf))
404 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
407 for sub in ix.idxnames:
408 found = d.get(os.path.join(self.dir, sub))
409 if not found or isinstance(found, PackIdx):
410 # doesn't exist, or exists but not in a midx
412 for name in ix.idxnames:
413 d[os.path.join(self.dir, name)] = ix
416 if not any and not ix.force_keep:
417 debug1('midx: removing redundant: %s\n'
418 % os.path.basename(ix.name))
420 for f in os.listdir(self.dir):
421 full = os.path.join(self.dir, f)
422 if f.endswith('.idx') and not d.get(full):
425 self.packs = list(set(d.values()))
426 debug1('PackIdxList: using %d index%s.\n'
427 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
430 """Insert an additional object in the list."""
434 """Remove all additional objects from the list."""
438 def calc_hash(type, content):
439 """Calculate some content's hash in the Git fashion."""
440 header = '%s %d\0' % (type, len(content))
446 def _shalist_sort_key(ent):
447 (mode, name, id) = ent
448 if stat.S_ISDIR(int(mode, 8)):
454 def open_idx(filename):
455 if filename.endswith('.idx'):
456 f = open(filename, 'rb')
458 if header[0:4] == '\377tOc':
459 version = struct.unpack('!I', header[4:8])[0]
461 return PackIdxV2(filename, f)
463 raise GitError('%s: expected idx file version 2, got %d'
464 % (filename, version))
466 return PackIdxV1(filename, f)
467 elif filename.endswith('.midx'):
468 return PackMidx(filename)
470 raise GitError('idx filenames must end with .idx or .midx')
473 def idxmerge(idxlist, final_progress=True):
474 """Generate a list of all the objects reachable in a PackIdxList."""
475 total = sum(len(i) for i in idxlist)
476 iters = (iter(i) for i in idxlist)
477 heap = [(next(it), it) for it in iters]
482 if (count % 10024) == 0:
483 progress('Reading indexes: %.2f%% (%d/%d)\r'
484 % (count*100.0/total, count, total))
492 heapq.heapreplace(heap, (e, it))
496 log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
500 """Writes Git objects insid a pack file."""
501 def __init__(self, objcache_maker=None):
506 self.objcache_maker = objcache_maker
512 def _make_objcache(self):
513 if self.objcache == None:
514 if self.objcache_maker:
515 self.objcache = self.objcache_maker()
517 self.objcache = PackIdxList(repo('objects/pack'))
521 self._make_objcache()
522 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
523 self.file = os.fdopen(fd, 'w+b')
524 assert(name.endswith('.pack'))
525 self.filename = name[:-5]
526 self.file.write('PACK\0\0\0\2\0\0\0\0')
528 def _raw_write(self, datalist):
531 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
532 # the file never has a *partial* blob. So let's make sure it's
533 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
534 # to our hashsplit algorithm.) f.write() does its own buffering,
535 # but that's okay because we'll flush it in _end().
536 oneblob = ''.join(datalist)
538 self.outbytes += len(oneblob)
541 def _write(self, bin, type, content):
544 self._raw_write(_encode_packobj(type, content))
547 def breakpoint(self):
548 """Clear byte and object counts and return the last processed id."""
550 self.outbytes = self.count = 0
553 def write(self, type, content):
554 """Write an object in this pack file."""
555 return self._write(calc_hash(type, content), type, content)
557 def exists(self, id):
558 """Return non-empty if an object is found in the object cache."""
559 if not self.objcache:
560 self._make_objcache()
561 return self.objcache.exists(id)
563 def maybe_write(self, type, content):
564 """Write an object to the pack file if not present and return its id."""
565 bin = calc_hash(type, content)
566 if not self.exists(bin):
567 self._write(bin, type, content)
568 self.objcache.add(bin)
571 def new_blob(self, blob):
572 """Create a blob object in the pack with the supplied content."""
573 return self.maybe_write('blob', blob)
575 def new_tree(self, shalist):
576 """Create a tree object in the pack."""
577 shalist = sorted(shalist, key = _shalist_sort_key)
579 for (mode,name,bin) in shalist:
582 assert(mode[0] != '0')
584 assert(len(bin) == 20)
585 l.append('%s %s\0%s' % (mode,name,bin))
586 return self.maybe_write('tree', ''.join(l))
588 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
590 if tree: l.append('tree %s' % tree.encode('hex'))
591 if parent: l.append('parent %s' % parent.encode('hex'))
592 if author: l.append('author %s %s' % (author, _git_date(adate)))
593 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
596 return self.maybe_write('commit', '\n'.join(l))
598 def new_commit(self, parent, tree, date, msg):
599 """Create a commit object in the pack."""
600 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
601 commit = self._new_commit(tree, parent,
602 userline, date, userline, date,
607 """Remove the pack file from disk."""
612 os.unlink(self.filename + '.pack')
616 if not f: return None
620 # update object count
622 cp = struct.pack('!i', self.count)
626 # calculate the pack sha1sum
633 f.write(sum.digest())
637 p = subprocess.Popen(['git', 'index-pack', '-v',
639 self.filename + '.pack'],
640 preexec_fn = _gitenv,
641 stdout = subprocess.PIPE)
642 out = p.stdout.read().strip()
643 _git_wait('git index-pack', p)
645 raise GitError('git index-pack produced no output')
646 nameprefix = repo('objects/pack/%s' % out)
647 if os.path.exists(self.filename + '.map'):
648 os.unlink(self.filename + '.map')
649 os.rename(self.filename + '.pack', nameprefix + '.pack')
650 os.rename(self.filename + '.idx', nameprefix + '.idx')
652 auto_midx(repo('objects/pack'))
656 """Close the pack file and move it to its definitive path."""
661 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
665 os.environ['GIT_DIR'] = os.path.abspath(repo())
668 def list_refs(refname = None):
669 """Generate a list of tuples in the form (refname,hash).
670 If a ref name is specified, list only this particular ref.
672 argv = ['git', 'show-ref', '--']
675 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
676 out = p.stdout.read().strip()
677 rv = p.wait() # not fatal
681 for d in out.split('\n'):
682 (sha, name) = d.split(' ', 1)
683 yield (name, sha.decode('hex'))
686 def read_ref(refname):
687 """Get the commit id of the most recent commit made on a given ref."""
688 l = list(list_refs(refname))
696 def rev_list(ref, count=None):
697 """Generate a list of reachable commits in reverse chronological order.
699 This generator walks through commits, from child to parent, that are
700 reachable via the specified ref and yields a series of tuples of the form
703 If count is a non-zero integer, limit the number of commits to "count"
706 assert(not ref.startswith('-'))
709 opts += ['-n', str(atoi(count))]
710 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
711 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
715 if s.startswith('commit '):
716 commit = s[7:].decode('hex')
720 rv = p.wait() # not fatal
722 raise GitError, 'git rev-list returned error %d' % rv
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref.

    Raises GitError if the ref resolves to no commits at all.
    """
    for (date, commit) in rev_list(ref, count=1):
        return date
    # Use the call form of raise (valid in both py2 and py3) instead of
    # the legacy 'raise Class, arg' statement syntax.
    raise GitError('no such commit %r' % ref)
732 def update_ref(refname, newval, oldval):
733 """Change the commit pointed to by a branch."""
736 assert(refname.startswith('refs/heads/'))
737 p = subprocess.Popen(['git', 'update-ref', refname,
738 newval.encode('hex'), oldval.encode('hex')],
739 preexec_fn = _gitenv)
740 _git_wait('git update-ref', p)
743 def guess_repo(path=None):
744 """Set the path value in the global variable "repodir".
745 This makes bup look for an existing bup repository, but not fail if a
746 repository doesn't exist. Usually, if you are interacting with a bup
747 repository, you would not be calling this function but using
754 repodir = os.environ.get('BUP_DIR')
756 repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # Fixed format specifier: d is a path string, so %s, not %d.
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
775 def check_repo_or_die(path=None):
776 """Make sure a bup repository exists, and abort if not.
777 If the path to a particular repository was not specified, this function
778 initializes the default repository automatically.
781 if not os.path.isdir(repo('objects/pack/.')):
782 if repodir == home_repodir:
785 log('error: %r is not a bup/git repository\n' % repo())
790 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
792 while ofs < len(buf):
793 z = buf[ofs:].find('\0')
795 spl = buf[ofs:ofs+z].split(' ', 1)
796 assert(len(spl) == 2)
797 sha = buf[ofs+z+1:ofs+z+1+20]
799 yield (spl[0], spl[1], sha)
804 """Get Git's version and ensure a usable version is installed.
806 The returned version is formatted as an ordered tuple with each position
807 representing a digit in the version tag. For example, the following tuple
808 would represent version 1.6.6.9:
814 p = subprocess.Popen(['git', '--version'],
815 stdout=subprocess.PIPE)
816 gvs = p.stdout.read()
817 _git_wait('git --version', p)
818 m = re.match(r'git version (\S+.\S+)', gvs)
820 raise GitError('git --version weird output: %r' % gvs)
821 _ver = tuple(m.group(1).split('.'))
822 needed = ('1','5', '3', '1')
824 raise GitError('git version %s or higher is required; you have %s'
825 % ('.'.join(needed), '.'.join(_ver)))
829 def _git_wait(cmd, p):
832 raise GitError('%s returned %d' % (cmd, rv))
835 def _git_capture(argv):
836 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
838 _git_wait(repr(argv), p)
842 class _AbortableIter:
843 def __init__(self, it, onabort = None):
845 self.onabort = onabort
853 return self.it.next()
854 except StopIteration, e:
862 """Abort iteration and call the abortion callback, if needed."""
874 """Link to 'git cat-file' that is used to retrieve blob data."""
877 wanted = ('1','5','6')
880 log('warning: git version < %s; bup will be slow.\n'
883 self.get = self._slow_get
885 self.p = self.inprogress = None
886 self.get = self._fast_get
890 self.p.stdout.close()
893 self.inprogress = None
897 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
898 stdin=subprocess.PIPE,
899 stdout=subprocess.PIPE,
901 preexec_fn = _gitenv)
903 def _fast_get(self, id):
904 if not self.p or self.p.poll() != None:
907 assert(self.p.poll() == None)
909 log('_fast_get: opening %r while %r is open'
910 % (id, self.inprogress))
911 assert(not self.inprogress)
912 assert(id.find('\n') < 0)
913 assert(id.find('\r') < 0)
914 assert(not id.startswith('-'))
916 self.p.stdin.write('%s\n' % id)
917 hdr = self.p.stdout.readline()
918 if hdr.endswith(' missing\n'):
919 self.inprogress = None
920 raise KeyError('blob %r is missing' % id)
922 if len(spl) != 3 or len(spl[0]) != 40:
923 raise GitError('expected blob, got %r' % spl)
924 (hex, type, size) = spl
926 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
927 onabort = self._abort)
932 assert(self.p.stdout.readline() == '\n')
933 self.inprogress = None
938 def _slow_get(self, id):
939 assert(id.find('\n') < 0)
940 assert(id.find('\r') < 0)
942 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
945 p = subprocess.Popen(['git', 'cat-file', type, id],
946 stdout=subprocess.PIPE,
947 preexec_fn = _gitenv)
948 for blob in chunkyreader(p.stdout):
950 _git_wait('git cat-file', p)
958 treefile = ''.join(it)
959 for (mode, name, sha) in treeparse(treefile):
960 for blob in self.join(sha.encode('hex')):
962 elif type == 'commit':
963 treeline = ''.join(it).split('\n')[0]
964 assert(treeline.startswith('tree '))
965 for blob in self.join(treeline[5:]):
968 raise GitError('invalid object type %r: expected blob/tree/commit'
972 """Generate a list of the content of all blobs that can be reached
973 from an object. The hash given in 'id' must point to a blob, a tree
974 or a commit. The content of all blobs that can be seen from trees or
975 commits will be added to the list.
978 for d in self._join(self.get(id)):
980 except StopIteration: