1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, zlib, time, subprocess, struct, stat, re, tempfile
7 from bup.helpers import *
13 home_repodir = os.path.expanduser('~/.bup')
16 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
17 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
class GitError(Exception):
    """Base exception for all errors raised by this git interaction library."""
28 """Get the path to the git repository or one of its subdirectories."""
31 raise GitError('You should call check_repo_or_die()')
33 # If there's a .git subdirectory, then the actual repo is in there.
34 gd = os.path.join(repodir, '.git')
35 if os.path.exists(gd):
38 return os.path.join(repodir, sub)
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    # A real regular file stored as a non-regular git object was chunked.
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # (return of name + '.bup' elided in this view)
    # A name that already looks mangled gets '.bupl' to disambiguate.
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
(BUP_NORMAL, BUP_CHUNKED) = (0,1)  # result modes for demangle_name()

def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    # anything else was never mangled
    return (name, BUP_NORMAL)
def _encode_packobj(type, content):
    # Yield the pack-format encoding of one object: a varint size/type
    # header followed by the zlib-compressed payload.
    # (size computation and header-emission loop elided in this view)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    # compression level 1: favor speed over ratio
    z = zlib.compressobj(1)
    yield z.compress(content)
    # NOTE(review): a trailing `yield z.flush()` appears to be elided here.
def _encode_looseobj(type, content):
    # Loose objects are "<type> <len>\0<payload>", zlib-compressed.
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    # NOTE(review): a final `yield z.flush()` is elided in this view; without
    # it the zlib stream would be truncated.
def _decode_looseobj(buf):
    """Decompress a loose object and return its (type, content) pair."""
    s = zlib.decompress(buf)
    # (parsing of the "<type> <size>\0" header into type/sz/content is
    # elided in this view)
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    # Decode a pack-format object: bits 4-6 of the first byte give the type,
    # the low bits plus continuation bytes give the size (little-endian varint).
    type = _typermap[(c & 0x70) >> 4]
    # (varint size-decoding loop partially elided in this view; c, i and
    # shift come from the elided lines)
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
132 """Object representation of a Git pack index file."""
133 def __init__(self, filename):
135 self.map = mmap_read(open(filename))
136 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
137 self.fanout = list(struct.unpack('!256I',
138 str(buffer(self.map, 8, 256*4))))
139 self.fanout.append(0) # entry "-1"
140 nsha = self.fanout[255]
141 self.ofstable = buffer(self.map,
142 8 + 256*4 + nsha*20 + nsha*4,
144 self.ofs64table = buffer(self.map,
145 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
def _ofs_from_idx(self, idx):
    """Return the pack-file offset of the object at table index *idx*."""
    ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
    # (the high-bit test selecting the 64-bit path is elided in this view)
    idx64 = ofs & 0x7fffffff
    # BUG?: '!I' unpacks only 4 bytes but the slice is 8 bytes wide; a
    # 64-bit offset entry needs '!Q' here, otherwise struct.unpack raises.
    ofs = struct.unpack('!I',
                        str(buffer(self.ofs64table, idx64*8, 8)))[0]
    # (final `return ofs` elided in this view)
def _idx_from_hash(self, hash):
    """Binary-search the sha table; return the entry index, or None."""
    global _total_searches, _total_steps
    # (search-counter increment elided in this view)
    assert(len(hash) == 20)
    # b1 (first byte of the hash, computed on an elided line) picks the
    # fanout bucket that brackets the candidate range
    start = self.fanout[b1-1] # range -1..254
    end = self.fanout[b1] # range 0..255
    buf = buffer(self.map, 8 + 256*4, end*20)
    _total_steps += 1 # lookup table is a step
    # binary search within [start, end)
    mid = start + (end-start)/2
    v = str(buf[mid*20:(mid+1)*20])
    # (comparison, range narrowing and return are elided in this view)
def find_offset(self, hash):
    """Get the offset of an object inside the index file."""
    idx = self._idx_from_hash(hash)
    # (the not-found check is elided in this view)
    return self._ofs_from_idx(idx)
def exists(self, hash):
    """Return nonempty (True) if the object exists in this index, else None."""
    # A falsy hash can never match; preserve the caller-visible None result.
    if not hash:
        return None
    # Found entries report True; misses report None (never False).
    if self._idx_from_hash(hash) != None:
        return True
    return None
# Body fragments of PackIdx.__iter__ and __len__ (def lines elided).
# __iter__: yield each 20-byte sha straight out of the mmap'd table.
for i in xrange(self.fanout[255]):
    yield buffer(self.map, 8 + 256*4 + 20*i, 20)
# __len__: the last fanout entry is the total object count.
return int(self.fanout[255])
def extract_bits(buf, nbits):
    """Take the first 'nbits' bits from 'buf' and return them as an integer."""
    mask = (1<<nbits) - 1
    # read the first 4 bytes big-endian, then keep only the top nbits
    v = struct.unpack('!I', buf[0:4])[0]
    v = (v >> (32-nbits)) & mask
    # (final `return v` elided in this view)
205 """Wrapper which contains data from multiple index files.
206 Multiple index (.midx) files constitute a wrapper around index (.idx) files
207 and make it possible for bup to expand Git's indexing capabilities to vast
210 def __init__(self, filename):
212 self.force_keep = False
213 assert(filename.endswith('.midx'))
214 self.map = mmap_read(open(filename))
215 if str(self.map[0:4]) != 'MIDX':
216 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
217 self.force_keep = True
218 return self._init_failed()
219 ver = struct.unpack('!I', self.map[4:8])[0]
220 if ver < MIDX_VERSION:
221 log('Warning: ignoring old-style (v%d) midx %r\n'
223 self.force_keep = False # old stuff is boring
224 return self._init_failed()
225 if ver > MIDX_VERSION:
226 log('Warning: ignoring too-new (v%d) midx %r\n'
228 self.force_keep = True # new stuff is exciting
229 return self._init_failed()
231 self.bits = struct.unpack('!I', self.map[8:12])[0]
232 self.entries = 2**self.bits
233 self.fanout = buffer(self.map, 12, self.entries*4)
234 shaofs = 12 + self.entries*4
235 nsha = self._fanget(self.entries-1)
236 self.shalist = buffer(self.map, shaofs, nsha*20)
237 self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
def _init_failed(self):
    # Reset to a harmless empty state after a bad/old/too-new header.
    # (bits/entries resets elided in this view)
    self.fanout = buffer('\0\0\0\0')
    self.shalist = buffer('\0'*20)
    # (idxnames reset elided in this view)

def _fanget(self, i):
    """Return fanout entry *i* as an integer."""
    # (computation of `start` from i, presumably i*4, is elided)
    s = self.fanout[start:start+4]
    return struct.unpack('!I', s)[0]
def exists(self, hash):
    """Return nonempty if the object exists in the index files."""
    global _total_searches, _total_steps
    # (search-counter increment and `want` setup elided in this view)
    el = extract_bits(want, self.bits)
    # the fanout bucket brackets the candidate range in the sha table
    start = self._fanget(el-1)
    # (el == 0 edge handling elided)
    end = self._fanget(el)
    _total_steps += 1 # lookup table is a step
    # binary search within [start, end)
    mid = start + (end-start)/2
    v = str(self.shalist[mid*20:(mid+1)*20])
    # (comparison, range narrowing and return elided in this view)

# Body fragments of PackMidx.__iter__ and __len__ (def lines elided).
for i in xrange(self._fanget(self.entries-1)):
    yield buffer(self.shalist, i*20, 20)
# __len__: total sha count from the last fanout entry.
return int(self._fanget(self.entries-1))
def __init__(self, dir):
    # (packs/also initialization and refresh() call elided in this view)
    assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
# __del__ fragment (def line elided):
assert(_mpi_count == 0)
# __iter__ fragment: merge-iterate every contained index in sha order.
return iter(idxmerge(self.packs))
# __len__ fragment: total object count across all packs.
return sum(len(pack) for pack in self.packs)
def exists(self, hash):
    """Return nonempty if the object exists in the index files."""
    global _total_searches
    # (search-counter increment elided in this view)
    if hash in self.also:
        # (return elided)
    for i in range(len(self.packs)):
        # (p = self.packs[i] elided)
        _total_searches -= 1 # will be incremented by sub-pack
        # (per-pack hit test elided)
        # reorder so most recently used packs are searched first
        self.packs = [p] + self.packs[:i] + self.packs[i+1:]
        # (hit return / miss fall-through elided in this view)
def refresh(self, skip_midx = False):
    """Refresh the index list.
    This method verifies if .midx files were superseded (e.g. all of its
    contents are in another, bigger .midx file) and removes the superseded
    files.

    If skip_midx is True, all work on .midx files will be skipped and .midx
    files will be removed from the list.

    The module-global variable 'ignore_midx' can force this function to
    always act as if skip_midx was True.
    """
    skip_midx = skip_midx or ignore_midx
    # keep already-loaded indexes, except midx files when skipping them
    d = dict((p.name, p) for p in self.packs
             if not skip_midx or not isinstance(p, PackMidx))
    if os.path.exists(self.dir):
        # (midx scan setup, e.g. midxl = [], elided in this view)
        # remember which .idx files are already covered by a loaded midx
        for ix in self.packs:
            if isinstance(ix, PackMidx):
                for name in ix.idxnames:
                    d[os.path.join(self.dir, name)] = ix
        for f in os.listdir(self.dir):
            full = os.path.join(self.dir, f)
            if f.endswith('.midx') and not d.get(full):
                # (mx = PackMidx(full) elided)
                (mxd, mxf) = os.path.split(mx.name)
                # (broken-midx bookkeeping elided)
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n' +
                             ' used by %s\n') % (n, mxf))
                # (collection of usable midx files elided)
        # prefer midx files covering the most objects
        midxl.sort(lambda x,y: -cmp(len(x),len(y)))
        # (per-midx adoption loop header elided)
        for sub in ix.idxnames:
            found = d.get(os.path.join(self.dir, sub))
            if not found or isinstance(found, PackIdx):
                # doesn't exist, or exists but not in a midx
                # (adoption bookkeeping elided)
        for name in ix.idxnames:
            d[os.path.join(self.dir, name)] = ix
        # NOTE(review): `any` shadows the builtin; set by an elided line above
        if not any and not ix.force_keep:
            log('midx: removing redundant: %s\n'
                % os.path.basename(ix.name))
            # (os.unlink of the redundant midx elided)
        # finally pick up plain .idx files not covered by any midx
        for f in os.listdir(self.dir):
            full = os.path.join(self.dir, f)
            if f.endswith('.idx') and not d.get(full):
                # (ix = PackIdx(full); d[full] = ix elided)
    self.packs = list(set(d.values()))
    log('PackIdxList: using %d index%s.\n'
        % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
381 """Insert an additional object in the list."""
385 """Remove all additional objects from the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # git hashes "<type> <len>\0" followed by the content
    header = '%s %d\0' % (type, len(content))
    # (sha1 update over header+content and digest return elided in this view)
def _shalist_sort_key(ent):
    # Sort key for tree entries: git sorts as if directories had a
    # trailing '/'. *ent* is a (mode, name, id) tuple; mode is octal text.
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):
        # (directory key, presumably name + '/', elided in this view)
def idxmerge(idxlist):
    """Generate a list of all the objects reachable in a PackIdxList."""
    total = sum(len(i) for i in idxlist)
    iters = (iter(i) for i in idxlist)
    # seed a heap with the first sha from each index for an ordered merge
    heap = [(next(it), it) for it in iters]
    # (heapify call and the merge-loop header are elided in this view)
    if (count % 10024) == 0:
        progress('Reading indexes: %.2f%% (%d/%d)\r'
                 % (count*100.0/total, count, total))
    # (dedup/yield of the smallest sha and iterator advance elided)
    heapq.heapreplace(heap, (e, it))
    # (exhausted-iterator handling elided)
    log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
431 """Writes Git objects insid a pack file."""
432 def __init__(self, objcache_maker=None):
437 self.objcache_maker = objcache_maker
def _make_objcache(self):
    # Lazily build the existence cache consulted by exists()/maybe_write().
    if self.objcache == None:
        if self.objcache_maker:
            self.objcache = self.objcache_maker()
        # (an `else:` line is elided here; the default cache follows)
            self.objcache = PackIdxList(repo('objects/pack'))
    # Body fragment of PackWriter._open() (def line elided in this view).
    self._make_objcache()
    # create the pack as a temp file inside the repository's objects dir
    (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
    self.file = os.fdopen(fd, 'w+b')
    assert(name.endswith('.pack'))
    self.filename = name[:-5]
    # pack header: magic, version 2, zero object count (patched in _end())
    self.file.write('PACK\0\0\0\2\0\0\0\0')
def _raw_write(self, datalist):
    # (self._open() call and local file handle binding elided in this view)
    # in case we get interrupted (eg. KeyboardInterrupt), it's best if
    # the file never has a *partial* blob. So let's make sure it's
    # all-or-nothing. (The blob shouldn't be very big anyway, thanks
    # to our hashsplit algorithm.) f.write() does its own buffering,
    # but that's okay because we'll flush it in _end().
    oneblob = ''.join(datalist)
    # (the single f.write(oneblob) call is elided)
    self.outbytes += len(oneblob)
    # (object-count increment elided)
def _write(self, bin, type, content):
    # (verbosity logging elided in this view)
    self._raw_write(_encode_packobj(type, content))
    # (presumably `return bin` is elided)

def breakpoint(self):
    """Clear byte and object counts and return the last processed id."""
    # (pack finalization / reopen elided in this view)
    self.outbytes = self.count = 0
    # (return of the last id elided)
def write(self, type, content):
    """Hash *content* as a git object of *type* and write it to the pack."""
    sha = calc_hash(type, content)
    return self._write(sha, type, content)
def exists(self, id):
    """Return non-empty if *id* is already known to the object cache."""
    cache = self.objcache
    if not cache:
        # No usable cache yet -- build the default one before looking up.
        self._make_objcache()
        cache = self.objcache
    return cache.exists(id)
def maybe_write(self, type, content):
    """Write an object to the pack file if not present and return its id."""
    bin = calc_hash(type, content)
    if not self.exists(bin):
        self._write(bin, type, content)
        # remember it so a second maybe_write() of the same content skips
        self.objcache.add(bin)
    # (final `return bin` elided in this view)
def new_blob(self, blob):
    """Add *blob* to the pack (if not already present); return its id."""
    obj_type = 'blob'
    return self.maybe_write(obj_type, blob)
def new_tree(self, shalist):
    """Create a tree object in the pack."""
    # git requires tree entries in its own sort order
    shalist = sorted(shalist, key = _shalist_sort_key)
    # (accumulator list creation elided in this view)
    for (mode,name,bin) in shalist:
        # (mode normalization elided)
        # git tree modes must not be zero-padded
        assert(mode[0] != '0')
        # (name validation elided)
        assert(len(bin) == 20)
        # entry layout: "<mode> <name>\0<20-byte sha>"
        l.append('%s %s\0%s' % (mode,name,bin))
    return self.maybe_write('tree', ''.join(l))
def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
    # Build the commit object text line by line; each field is optional.
    # (accumulator list creation elided in this view)
    if tree: l.append('tree %s' % tree.encode('hex'))
    if parent: l.append('parent %s' % parent.encode('hex'))
    if author: l.append('author %s %s' % (author, _git_date(adate)))
    if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
    # (blank separator line and message append elided)
    return self.maybe_write('commit', '\n'.join(l))
def new_commit(self, parent, tree, msg):
    """Create a commit object in the pack."""
    # (timestamp capture, presumably `now`, elided in this view)
    userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
    commit = self._new_commit(tree, parent,
                              userline, now, userline, now,
    # (message argument and final return elided in this view)
539 """Remove the pack file from disk."""
544 os.unlink(self.filename + '.pack')
548 if not f: return None
552 # update object count
554 cp = struct.pack('!i', self.count)
558 # calculate the pack sha1sum
565 f.write(sum.digest())
569 p = subprocess.Popen(['git', 'index-pack', '-v',
571 self.filename + '.pack'],
572 preexec_fn = _gitenv,
573 stdout = subprocess.PIPE)
574 out = p.stdout.read().strip()
575 _git_wait('git index-pack', p)
577 raise GitError('git index-pack produced no output')
578 nameprefix = repo('objects/pack/%s' % out)
579 if os.path.exists(self.filename + '.map'):
580 os.unlink(self.filename + '.map')
581 os.rename(self.filename + '.pack', nameprefix + '.pack')
582 os.rename(self.filename + '.idx', nameprefix + '.idx')
586 """Close the pack file and move it to its definitive path."""
# Body fragment of _git_date(date): "<epoch> <tz-offset>" as git expects.
return '%d %s' % (date, time.strftime('%z', time.localtime(date)))

# Body fragment of _gitenv(): point git at our repository before exec.
os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    # (optional refname argument append elided in this view)
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    # (sanity checks on rv/out elided)
    for d in out.split('\n'):
        # show-ref output is "<sha> <refname>" per line
        (sha, name) = d.split(' ', 1)
        yield (name, sha.decode('hex'))
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    # (match-count handling and hash extraction/return elided in this view)
def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date, commit).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    # the trailing '--' below relies on ref not looking like an option
    assert(not ref.startswith('-'))
    # (opts initialization and count check elided in this view)
    opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    # (output line loop header elided)
    if s.startswith('commit '):
        commit = s[7:].decode('hex')
        # (date parsing from the %ct line and the yield elided)
    rv = p.wait() # not fatal
    # (nonzero-exit check elided)
    raise GitError, 'git rev-list returned error %d' % rv
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    for (date, commit) in rev_list(ref, count=1):
        # (return of date elided in this view)
    raise GitError, 'no such commit %r' % ref
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    # (oldval sanity checking elided in this view)
    # only branch heads may be updated this way
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # (global declaration and explicit-path handling elided in this view)
    # fall back to $BUP_DIR, then the default ~/.bup
    repodir = os.environ.get('BUP_DIR')
    # (unset-variable check elided)
    repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the target path exists but is not a directory.
    """
    guess_repo(path)
    d = repo()  # repodir was just set by guess_repo()
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # fix: was '"%d"' -- %d can't format a string path and would raise
        # TypeError instead of the intended GitError message
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    # (guess_repo(path) call elided in this view)
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            # (auto-initialization of the default repository elided)
        # (else branch header elided:)
            log('error: %r is not a bup/git repository\n' % repo())
            # (process exit elided)
720 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
722 while ofs < len(buf):
723 z = buf[ofs:].find('\0')
725 spl = buf[ofs:ofs+z].split(' ', 1)
726 assert(len(spl) == 2)
727 sha = buf[ofs+z+1:ofs+z+1+20]
729 yield (spl[0], spl[1], sha)
734 """Get Git's version and ensure a usable version is installed.
736 The returned version is formatted as an ordered tuple with each position
737 representing a digit in the version tag. For example, the following tuple
738 would represent version 1.6.6.9:
744 p = subprocess.Popen(['git', '--version'],
745 stdout=subprocess.PIPE)
746 gvs = p.stdout.read()
747 _git_wait('git --version', p)
748 m = re.match(r'git version (\S+.\S+)', gvs)
750 raise GitError('git --version weird output: %r' % gvs)
751 _ver = tuple(m.group(1).split('.'))
752 needed = ('1','5', '3', '1')
754 raise GitError('git version %s or higher is required; you have %s'
755 % ('.'.join(needed), '.'.join(_ver)))
def _git_wait(cmd, p):
    # Wait for subprocess *p*; raise GitError if it exited nonzero.
    # (rv = p.wait() and the nonzero-exit check elided in this view)
    raise GitError('%s returned %d' % (cmd, rv))

def _git_capture(argv):
    # Run a git command and return its entire stdout as one string.
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    # (reading of p.stdout elided)
    _git_wait(repr(argv), p)
    # (return of the captured output elided)
class _AbortableIter:
    # Iterator wrapper that can be cleanly abandoned mid-stream via abort().
    def __init__(self, it, onabort = None):
        # (self.it assignment elided in this view)
        self.onabort = onabort
        # (done-flag initialization elided)

    # next() fragment (surrounding try/def lines elided):
        return self.it.next()
    except StopIteration, e:
        # (done bookkeeping and re-raise elided)

    # abort() docstring fragment:
    """Abort iteration and call the abortion callback, if needed."""
804 """Link to 'git cat-file' that is used to retrieve blob data."""
807 wanted = ('1','5','6')
810 log('warning: git version < %s; bup will be slow.\n'
813 self.get = self._slow_get
815 self.p = self.inprogress = None
816 self.get = self._fast_get
820 self.p.stdout.close()
823 self.inprogress = None
827 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
828 stdin=subprocess.PIPE,
829 stdout=subprocess.PIPE,
831 preexec_fn = _gitenv)
def _fast_get(self, id):
    # restart the cat-file helper if it has died or never started
    if not self.p or self.p.poll() != None:
        # (restart call elided in this view)
    assert(self.p.poll() == None)
    # (in-progress guard elided)
    log('_fast_get: opening %r while %r is open'
        % (id, self.inprogress))
    assert(not self.inprogress)
    # ids go to the helper on a single line; forbid embedded newlines
    assert(id.find('\n') < 0)
    assert(id.find('\r') < 0)
    # (inprogress bookkeeping elided)
    self.p.stdin.write('%s\n' % id)
    hdr = self.p.stdout.readline()
    if hdr.endswith(' missing\n'):
        raise KeyError('blob %r is missing' % id)
    # (splitting of the "<sha> <type> <size>" header into spl elided)
    if len(spl) != 3 or len(spl[0]) != 40:
        raise GitError('expected blob, got %r' % spl)
    (hex, type, size) = spl
    # (yield of the object type elided)
    it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                        onabort = self._abort)
    # (chunk-yielding loop elided)
    assert(self.p.stdout.readline() == '\n')  # eat the trailing blank line
    self.inprogress = None
    # (exception cleanup / abort path elided in this view)
def _slow_get(self, id):
    # One 'git cat-file' subprocess per object: correct but slow.
    assert(id.find('\n') < 0)
    assert(id.find('\r') < 0)
    # (additional id validation elided in this view)
    type = _git_capture(['git', 'cat-file', '-t', id]).strip()
    # (yield of the type elided)
    p = subprocess.Popen(['git', 'cat-file', type, id],
                         stdout=subprocess.PIPE,
                         preexec_fn = _gitenv)
    for blob in chunkyreader(p.stdout):
        # (yield of each blob chunk elided)
    _git_wait('git cat-file', p)
# Fragment of CatPipe._join(it) (def line and type-dispatch header elided).
# tree branch: recurse into every entry of the tree
treefile = ''.join(it)
for (mode, name, sha) in treeparse(treefile):
    for blob in self.join(sha.encode('hex')):
        # (yield of blob elided in this view)
elif type == 'commit':
    # a commit's first line names its tree; recurse into that
    treeline = ''.join(it).split('\n')[0]
    assert(treeline.startswith('tree '))
    for blob in self.join(treeline[5:]):
        # (yield of blob elided)
# (else branch header elided:)
raise GitError('invalid object type %r: expected blob/tree/commit'
# (format arguments elided)

# Fragment of CatPipe.join(id):
"""Generate a list of the content of all blobs that can be reached
from an object. The hash given in 'id' must point to a blob, a tree
or a commit. The content of all blobs that can be seen from trees or
commits will be added to the list.
"""
# (try header elided)
for d in self._join(self.get(id)):
    # (yield of d elided)
except StopIteration:
    # (cleanup/logging elided in this view)