1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, zlib, time, subprocess, struct, stat, re, tempfile
7 from bup.helpers import *
11 home_repodir = os.path.expanduser('~/.bup')
14 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
15 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
18 class GitError(Exception):
# Module-wide exception type: raised for any git-interaction failure in
# this library. (Class body is elided in this listing.)
23 """Get the path to the git repository or one of its subdirectories."""
26 raise GitError('You should call check_repo_or_die()')
28 # If there's a .git subdirectory, then the actual repo is in there.
29 gd = os.path.join(repodir, '.git')
30 if os.path.exists(gd):
33 return os.path.join(repodir, sub)
36 def mangle_name(name, mode, gitmode):
37 """Mangle a file name to present an abstract name for segmented files.
38 Mangled file names will have the ".bup" extension added to them. If a
39 file's name already ends with ".bup", a ".bupl" extension is added to
40 disambiguate normal files from segmented ones.
# First case: a regular file (by filesystem mode) stored as something other
# than a regular git blob, i.e. a file that was chunked by bup.
42 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
# Second case: the name collides with the mangling suffix itself
# (ends with '.bup', or would after the trailing char is dropped).
# Return statements for each branch are elided in this listing.
44 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
# Sentinel values describing how a stored file should be read back.
50 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
51 def demangle_name(name):
52 """Remove name mangling from a file name, if necessary.
54 The return value is a tuple (demangled_filename,mode), where mode is one of
57 * BUP_NORMAL : files that should be read as-is from the repository
58 * BUP_CHUNKED : files that were chunked and need to be assembled
60 For more information on the name mangling algorithm, see mangle_name()
# '.bupl' (5 chars) marks a normal file whose real name ended in '.bup';
# '.bup' (4 chars) marks a chunked file; anything else is untouched.
62 if name.endswith('.bupl'):
63 return (name[:-5], BUP_NORMAL)
64 elif name.endswith('.bup'):
65 return (name[:-4], BUP_CHUNKED)
67 return (name, BUP_NORMAL)
# Encode an object as a git *packfile* entry: variable-length size header
# (low 4 bits of size plus the 3-bit type code) followed by zlib data.
70 def _encode_packobj(type, content):
73 szbits = (sz & 0x0f) | (_typemap[type]<<4)
82 z = zlib.compressobj(1)
84 yield z.compress(content)
# Encode an object in git *loose object* format: zlib-compressed
# '<type> <size>\0' header followed by the raw content.
88 def _encode_looseobj(type, content):
89 z = zlib.compressobj(1)
90 yield z.compress('%s %d\0' % (type, len(content)))
91 yield z.compress(content)
# Inverse of _encode_looseobj: returns (type, content), asserting that the
# declared type is known and the declared size matches.
95 def _decode_looseobj(buf):
97 s = zlib.decompress(buf)
104 assert(type in _typemap)
105 assert(sz == len(content))
106 return (type, content)
# Inverse of _encode_packobj: parse the variable-length header (7 bits of
# size per continuation byte), then decompress the remainder.
109 def _decode_packobj(buf):
112 type = _typermap[(c & 0x70) >> 4]
119 sz |= (c & 0x7f) << shift
123 return (type, zlib.decompress(buf[i+1:]))
127 """Object representation of a Git pack index file."""
128 def __init__(self, filename):
130 self.map = mmap_read(open(filename))
131 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
132 self.fanout = list(struct.unpack('!256I',
133 str(buffer(self.map, 8, 256*4))))
134 self.fanout.append(0) # entry "-1"
135 nsha = self.fanout[255]
136 self.ofstable = buffer(self.map,
137 8 + 256*4 + nsha*20 + nsha*4,
139 self.ofs64table = buffer(self.map,
140 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
# Translate a position in the sha table into a pack-file byte offset.
# Offsets with the high bit set are indirections into the 64-bit table.
142 def _ofs_from_idx(self, idx):
143 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
145 idx64 = ofs & 0x7fffffff
# NOTE(review): this unpacks an 8-byte slice with '!I', which expects
# exactly 4 bytes and would raise struct.error; the 64-bit entry should
# be read with '!Q'. Verify against upstream before relying on >2GB packs.
146 ofs = struct.unpack('!I',
147 str(buffer(self.ofs64table, idx64*8, 8)))[0]
# Binary-search the sorted sha table for a 20-byte binary hash, using the
# fanout table to narrow the initial [start, end) range by first byte.
150 def _idx_from_hash(self, hash):
151 assert(len(hash) == 20)
153 start = self.fanout[b1-1] # range -1..254
154 end = self.fanout[b1] # range 0..255
155 buf = buffer(self.map, 8 + 256*4, end*20)
158 mid = start + (end-start)/2
159 v = str(buf[mid*20:(mid+1)*20])
168 def find_offset(self, hash):
169 """Get the offset of an object inside the index file."""
170 idx = self._idx_from_hash(hash)
172 return self._ofs_from_idx(idx)
175 def exists(self, hash):
176 """Return nonempty if the object exists in this index."""
# NOTE(review): 'x != None' should be 'x is not None'; the and/or chain
# predates conditional expressions (Python 2 idiom).
177 return hash and (self._idx_from_hash(hash) != None) and True or None
# __iter__/__len__ (headers elided): yield each 20-byte sha buffer in
# order; length is the total object count from the fanout table.
180 for i in xrange(self.fanout[255]):
181 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
184 return int(self.fanout[255])
187 def extract_bits(buf, nbits):
188 """Take the first 'nbits' bits from 'buf' and return them as an integer."""
# Read the first 4 bytes big-endian and keep only the top nbits;
# requires 0 < nbits <= 32 and len(buf) >= 4.
189 mask = (1<<nbits) - 1
190 v = struct.unpack('!I', buf[0:4])[0]
191 v = (v >> (32-nbits)) & mask
196 """Wrapper which contains data from multiple index files.
197 Multiple index (.midx) files constitute a wrapper around index (.idx) files
198 and make it possible for bup to expand Git's indexing capabilities to vast
201 def __init__(self, filename):
203 assert(filename.endswith('.midx'))
204 self.map = mmap_read(open(filename))
205 if str(self.map[0:8]) == 'MIDX\0\0\0\1':
206 log('Warning: ignoring old-style midx %r\n' % filename)
209 self.fanout = buffer('\0\0\0\0')
210 self.shalist = buffer('\0'*20)
213 assert(str(self.map[0:8]) == 'MIDX\0\0\0\2')
214 self.bits = struct.unpack('!I', self.map[8:12])[0]
215 self.entries = 2**self.bits
216 self.fanout = buffer(self.map, 12, self.entries*4)
217 shaofs = 12 + self.entries*4
218 nsha = self._fanget(self.entries-1)
219 self.shalist = buffer(self.map, shaofs, nsha*20)
220 self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
222 def _fanget(self, i):
224 s = self.fanout[start:start+4]
225 return struct.unpack('!I', s)[0]
227 def exists(self, hash):
228 """Return nonempty if the object exists in the index files."""
# Use the top self.bits bits of the hash to index the fanout, then
# binary-search the narrowed sha range (loop body partially elided).
230 el = extract_bits(want, self.bits)
232 start = self._fanget(el-1)
235 end = self._fanget(el)
237 mid = start + (end-start)/2
238 v = str(self.shalist[mid*20:(mid+1)*20])
# __iter__/__len__ (headers elided): iterate every 20-byte sha; length is
# the final fanout entry, i.e. the total object count.
248 for i in xrange(self._fanget(self.entries-1)):
249 yield buffer(self.shalist, i*20, 20)
252 return int(self._fanget(self.entries-1))
# PackIdxList aggregates every .idx/.midx in a directory behind a single
# exists() interface (class header elided in this listing).
257 def __init__(self, dir):
# _mpi_count appears to be a module-global guard enforcing at most one
# live PackIdxList -- TODO confirm; surrounding lines are elided.
259 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
269 assert(_mpi_count == 0)
# __iter__/__len__ (headers elided): merge-iterate all packs / sum counts.
272 return iter(idxmerge(self.packs))
275 return sum(len(pack) for pack in self.packs)
277 def exists(self, hash):
278 """Return nonempty if the object exists in the index files."""
# 'self.also' is a set of extra hashes registered via add() below.
279 if hash in self.also:
281 for i in range(len(self.packs)):
284 # reorder so most recently used packs are searched first
285 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
289 def refresh(self, skip_midx = False):
290 """Refresh the index list.
291 This method verifies if .midx files were superseded (e.g. all of its
292 contents are in another, bigger .midx file) and removes the superseded
295 If skip_midx is True, all work on .midx files will be skipped and .midx
296 files will be removed from the list.
298 The module-global variable 'ignore_midx' can force this function to
299 always act as if skip_midx was True.
301 skip_midx = skip_midx or ignore_midx
# Start from the already-loaded packs, keyed by filename, dropping any
# midx entries when midx handling is disabled.
302 d = dict((p.name, p) for p in self.packs
303 if not skip_midx or not isinstance(p, PackMidx))
304 if os.path.exists(self.dir):
# Map each .idx file that is already covered by a loaded midx to that
# midx, so it isn't opened a second time below.
307 for ix in self.packs:
308 if isinstance(ix, PackMidx):
309 for name in ix.idxnames:
310 d[os.path.join(self.dir, name)] = ix
# Load any .midx files not seen yet; warn when a midx references an
# .idx that no longer exists on disk.
311 for f in os.listdir(self.dir):
312 full = os.path.join(self.dir, f)
313 if f.endswith('.midx') and not d.get(full):
315 (mxd, mxf) = os.path.split(mx.name)
317 for n in mx.idxnames:
318 if not os.path.exists(os.path.join(mxd, n)):
319 log(('warning: index %s missing\n' +
320 ' used by %s\n') % (n, mxf))
# Biggest midx first, so smaller (likely redundant) ones lose below.
324 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
327 for sub in ix.idxnames:
328 found = d.get(os.path.join(self.dir, sub))
329 if not found or isinstance(found, PackIdx):
330 # doesn't exist, or exists but not in a midx
332 for name in ix.idxnames:
333 d[os.path.join(self.dir, name)] = ix
337 log('midx: removing redundant: %s\n'
338 % os.path.basename(ix.name))
# Finally pick up plain .idx files not already covered by a midx.
340 for f in os.listdir(self.dir):
341 full = os.path.join(self.dir, f)
342 if f.endswith('.idx') and not d.get(full):
345 self.packs = list(set(d.values()))
346 log('PackIdxList: using %d index%s.\n'
347 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
350 """Insert an additional object in the list."""
354 """Remove all additional objects from the list."""
358 def calc_hash(type, content):
359 """Calculate some content's hash in the Git fashion."""
# Git object ids are SHA-1 over '<type> <size>\0' + content; the hashing
# lines themselves are elided from this listing.
360 header = '%s %d\0' % (type, len(content))
# Sort key matching git's tree ordering: directories sort as if their
# name had a trailing '/' (the return lines are elided).
366 def _shalist_sort_key(ent):
367 (mode, name, id) = ent
368 if stat.S_ISDIR(int(mode, 8)):
374 def idxmerge(idxlist):
375 """Generate a list of all the objects reachable in a PackIdxList."""
# k-way merge over the (already sorted) per-index iterators via a heap;
# heap-pop/duplicate-skip lines are elided from this listing.
376 total = sum(len(i) for i in idxlist)
377 iters = (iter(i) for i in idxlist)
378 heap = [(next(it), it) for it in iters]
# Progress line roughly every 10k objects to keep logging cheap.
383 if (count % 10024) == 0:
384 progress('Reading indexes: %.2f%% (%d/%d)\r'
385 % (count*100.0/total, count, total))
393 heapq.heapreplace(heap, (e, it))
396 log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
400 """Writes Git objects insid a pack file."""
401 def __init__(self, objcache_maker=None):
406 self.objcache_maker = objcache_maker
412 def _make_objcache(self):
413 if self.objcache == None:
414 if self.objcache_maker:
415 self.objcache = self.objcache_maker()
417 self.objcache = PackIdxList(repo('objects/pack'))
421 self._make_objcache()
422 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
423 self.file = os.fdopen(fd, 'w+b')
424 assert(name.endswith('.pack'))
425 self.filename = name[:-5]
426 self.file.write('PACK\0\0\0\2\0\0\0\0')
428 def _raw_write(self, datalist):
431 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
432 # the file never has a *partial* blob. So let's make sure it's
433 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
434 # to our hashsplit algorithm.) f.write() does its own buffering,
435 # but that's okay because we'll flush it in _end().
436 oneblob = ''.join(datalist)
438 self.outbytes += len(oneblob)
# Write one object: pack-encode the content and append it (the object
# count bookkeeping around this call is elided from this listing).
441 def _write(self, bin, type, content):
444 self._raw_write(_encode_packobj(type, content))
447 def breakpoint(self):
448 """Clear byte and object counts and return the last processed id."""
450 self.outbytes = self.count = 0
453 def write(self, type, content):
454 """Write an object in this pack file."""
455 return self._write(calc_hash(type, content), type, content)
457 def exists(self, id):
458 """Return non-empty if an object is found in the object cache."""
# Lazily build the cache on first use so construction stays cheap.
459 if not self.objcache:
460 self._make_objcache()
461 return self.objcache.exists(id)
463 def maybe_write(self, type, content):
464 """Write an object to the pack file if not present and return its id."""
465 bin = calc_hash(type, content)
466 if not self.exists(bin):
467 self._write(bin, type, content)
# Record the new id so a repeat maybe_write() call dedups it.
468 self.objcache.add(bin)
471 def new_blob(self, blob):
472 """Create a blob object in the pack with the supplied content."""
473 return self.maybe_write('blob', blob)
475 def new_tree(self, shalist):
476 """Create a tree object in the pack."""
# Entries must follow git's tree ordering (see _shalist_sort_key) or the
# resulting tree sha would not match git's.
477 shalist = sorted(shalist, key = _shalist_sort_key)
479 for (mode,name,bin) in shalist:
# Git never stores a leading zero in the octal mode field.
482 assert(mode[0] != '0')
484 assert(len(bin) == 20)
# Tree entry wire format: '<octal mode> <name>\0' + 20-byte binary sha.
485 l.append('%s %s\0%s' % (mode,name,bin))
486 return self.maybe_write('tree', ''.join(l))
# Build a commit object line by line; every field is optional here so the
# caller controls exactly what appears (tree/parent are binary shas).
488 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
490 if tree: l.append('tree %s' % tree.encode('hex'))
491 if parent: l.append('parent %s' % parent.encode('hex'))
492 if author: l.append('author %s %s' % (author, _git_date(adate)))
493 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
496 return self.maybe_write('commit', '\n'.join(l))
498 def new_commit(self, parent, tree, msg):
499 """Create a commit object in the pack."""
# Identity is synthesized from the local user/host; the same value is
# used for both author and committer, with 'now' for both dates.
501 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
502 commit = self._new_commit(tree, parent,
503 userline, now, userline, now,
508 """Remove the pack file from disk."""
513 os.unlink(self.filename + '.pack')
517 if not f: return None
521 # update object count
523 cp = struct.pack('!i', self.count)
527 # calculate the pack sha1sum
534 f.write(sum.digest())
538 p = subprocess.Popen(['git', 'index-pack', '-v',
540 self.filename + '.pack'],
541 preexec_fn = _gitenv,
542 stdout = subprocess.PIPE)
543 out = p.stdout.read().strip()
544 _git_wait('git index-pack', p)
546 raise GitError('git index-pack produced no output')
547 nameprefix = repo('objects/pack/%s' % out)
548 if os.path.exists(self.filename + '.map'):
549 os.unlink(self.filename + '.map')
550 os.rename(self.filename + '.pack', nameprefix + '.pack')
551 os.rename(self.filename + '.idx', nameprefix + '.idx')
555 """Close the pack file and move it to its definitive path."""
# _git_date (header elided): render a timestamp in git's
# '<epoch-seconds> <utc-offset>' format using the local timezone.
560 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
# _gitenv (header elided): used as a subprocess preexec_fn so child git
# commands operate on the bup repository.
564 os.environ['GIT_DIR'] = os.path.abspath(repo())
567 def list_refs(refname = None):
568 """Generate a list of tuples in the form (refname,hash).
569 If a ref name is specified, list only this particular ref.
571 argv = ['git', 'show-ref', '--']
574 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
575 out = p.stdout.read().strip()
# A nonzero exit (e.g. no refs at all) is tolerated here on purpose.
576 rv = p.wait() # not fatal
# show-ref output lines are '<sha-hex> <refname>'.
580 for d in out.split('\n'):
581 (sha, name) = d.split(' ', 1)
582 yield (name, sha.decode('hex'))
585 def read_ref(refname):
586 """Get the commit id of the most recent commit made on a given ref."""
# At most one match is expected for a fully-qualified refname; the
# selection/return lines are elided from this listing.
587 l = list(list_refs(refname))
595 def rev_list(ref, count=None):
596 """Generate a list of reachable commits in reverse chronological order.
598 This generator walks through commits, from child to parent, that are
599 reachable via the specified ref and yields a series of tuples of the form
602 If count is a non-zero integer, limit the number of commits to "count"
# Refuse refs that look like options, so they can't be misparsed by git.
605 assert(not ref.startswith('-'))
608 opts += ['-n', str(atoi(count))]
# '--pretty=format:%ct' makes each commit emit its committer timestamp on
# the line after the 'commit <sha>' line.
609 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
610 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
614 if s.startswith('commit '):
615 commit = s[7:].decode('hex')
619 rv = p.wait() # not fatal
# Python 2 raise-with-args syntax; message carries git's exit status.
621 raise GitError, 'git rev-list returned error %d' % rv
624 def rev_get_date(ref):
625 """Get the date of the latest commit on the specified ref."""
# count=1 means the loop runs at most once; falling through to the raise
# below means the ref resolved to no commits at all.
626 for (date, commit) in rev_list(ref, count=1):
628 raise GitError, 'no such commit %r' % ref
631 def update_ref(refname, newval, oldval):
632 """Change the commit pointed to by a branch."""
# Only branch refs may be moved; newval/oldval are binary shas, passed to
# git as hex. Supplying oldval makes the update compare-and-swap safe.
635 assert(refname.startswith('refs/heads/'))
636 p = subprocess.Popen(['git', 'update-ref', refname,
637 newval.encode('hex'), oldval.encode('hex')],
638 preexec_fn = _gitenv)
639 _git_wait('git update-ref', p)
642 def guess_repo(path=None):
643 """Set the path value in the global variable "repodir".
644 This makes bup look for an existing bup repository, but not fail if a
645 repository doesn't exist. Usually, if you are interacting with a bup
646 repository, you would not be calling this function but using
# Fallback chain (some branches elided): explicit path, then $BUP_DIR,
# then the default ~/.bup.
653 repodir = os.environ.get('BUP_DIR')
655 repodir = os.path.expanduser('~/.bup')
658 def init_repo(path=None):
659 """Create the Git bare repository for bup in a given path."""
# The join with '.' distinguishes a real directory from e.g. a plain file.
662 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
# NOTE(review): '%d' is the wrong format code here -- 'd' is a path
# string, so this raise would itself fail with a TypeError; it should
# be '%s'. Flagged only, since this block is a comments-only update.
663 raise GitError('"%d" exists but is not a directory\n' % d)
664 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
665 preexec_fn = _gitenv)
666 _git_wait('git init', p)
667 # Force the index version configuration in order to ensure bup works
668 # regardless of the version of the installed Git binary.
669 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
670 stdout=sys.stderr, preexec_fn = _gitenv)
671 _git_wait('git config', p)
674 def check_repo_or_die(path=None):
675 """Make sure a bup repository exists, and abort if not.
676 If the path to a particular repository was not specified, this function
677 initializes the default repository automatically.
# A usable repo must have an objects/pack directory; the home-directory
# default repo is auto-created instead of failing (branch body elided).
680 if not os.path.isdir(repo('objects/pack/.')):
681 if repodir == home_repodir:
684 log('error: %r is not a bup/git repository\n' % repo())
689 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
691 while ofs < len(buf):
692 z = buf[ofs:].find('\0')
694 spl = buf[ofs:ofs+z].split(' ', 1)
695 assert(len(spl) == 2)
696 sha = buf[ofs+z+1:ofs+z+1+20]
698 yield (spl[0], spl[1], sha)
703 """Get Git's version and ensure a usable version is installed.
705 The returned version is formatted as an ordered tuple with each position
706 representing a digit in the version tag. For example, the following tuple
707 would represent version 1.6.6.9:
713 p = subprocess.Popen(['git', '--version'],
714 stdout=subprocess.PIPE)
715 gvs = p.stdout.read()
716 _git_wait('git --version', p)
717 m = re.match(r'git version (\S+.\S+)', gvs)
719 raise GitError('git --version weird output: %r' % gvs)
720 _ver = tuple(m.group(1).split('.'))
721 needed = ('1','5', '3', '1')
723 raise GitError('git version %s or higher is required; you have %s'
724 % ('.'.join(needed), '.'.join(_ver)))
# Wait for a git subprocess and raise GitError on a nonzero exit status.
728 def _git_wait(cmd, p):
731 raise GitError('%s returned %d' % (cmd, rv))
# Run a git command in the repo environment and return its stdout
# (the read line is elided from this listing).
734 def _git_capture(argv):
735 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
737 _git_wait(repr(argv), p)
# Iterator wrapper that lets the consumer bail out early while still
# notifying the producer (via onabort) so shared state can be cleaned up.
741 class _AbortableIter:
742 def __init__(self, it, onabort = None):
744 self.onabort = onabort
# next() (header elided): delegate to the wrapped iterator; StopIteration
# is intercepted (Python 2 'except X, e' syntax) -- handler body elided.
752 return self.it.next()
753 except StopIteration, e:
761 """Abort iteration and call the abortion callback, if needed."""
773 """Link to 'git cat-file' that is used to retrieve blob data."""
776 wanted = ('1','5','6')
779 log('warning: git version < %s; bup will be slow.\n'
782 self.get = self._slow_get
784 self.p = self.inprogress = None
785 self.get = self._fast_get
789 self.p.stdout.close()
792 self.inprogress = None
796 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
797 stdin=subprocess.PIPE,
798 stdout=subprocess.PIPE,
800 preexec_fn = _gitenv)
802 def _fast_get(self, id):
# Restart the worker if it was never started or has died.
803 if not self.p or self.p.poll() != None:
806 assert(self.p.poll() == None)
# Only one object may be streamed at a time: a previous caller that did
# not exhaust its iterator would leave the protocol mid-stream.
808 log('_fast_get: opening %r while %r is open'
809 % (id, self.inprogress))
810 assert(not self.inprogress)
# The id is written on a line of the batch protocol, so it must not
# contain line terminators.
811 assert(id.find('\n') < 0)
812 assert(id.find('\r') < 0)
815 self.p.stdin.write('%s\n' % id)
# Batch reply header: '<sha-hex> <type> <size>' or '<id> missing'.
816 hdr = self.p.stdout.readline()
817 if hdr.endswith(' missing\n'):
818 raise KeyError('blob %r is missing' % id)
820 if len(spl) != 3 or len(spl[0]) != 40:
821 raise GitError('expected blob, got %r' % spl)
822 (hex, type, size) = spl
# Stream exactly 'size' bytes; aborting mid-stream triggers _abort so
# the worker process gets restarted rather than left desynchronized.
824 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
825 onabort = self._abort)
# The batch protocol terminates each object with a single newline.
830 assert(self.p.stdout.readline() == '\n')
831 self.inprogress = None
836 def _slow_get(self, id):
# Same line-safety requirement as _fast_get: id goes on a command line.
837 assert(id.find('\n') < 0)
838 assert(id.find('\r') < 0)
# Two subprocesses per object (type probe, then content fetch) -- this is
# the compatibility path for old git versions.
840 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
843 p = subprocess.Popen(['git', 'cat-file', type, id],
844 stdout=subprocess.PIPE,
845 preexec_fn = _gitenv)
846 for blob in chunkyreader(p.stdout):
848 _git_wait('git cat-file', p)
# _join (header elided): recursively expand an object into its blob
# contents -- trees recurse into entries, commits recurse into their tree.
856 treefile = ''.join(it)
857 for (mode, name, sha) in treeparse(treefile):
858 for blob in self.join(sha.encode('hex')):
860 elif type == 'commit':
# The first line of a commit object is 'tree <sha-hex>'.
861 treeline = ''.join(it).split('\n')[0]
862 assert(treeline.startswith('tree '))
863 for blob in self.join(treeline[5:]):
866 raise GitError('invalid object type %r: expected blob/tree/commit'
870 """Generate a list of the content of all blobs that can be reached
871 from an object. The hash given in 'id' must point to a blob, a tree
872 or a commit. The content of all blobs that can be seen from trees or
873 commits will be added to the list.
876 for d in self._join(self.get(id)):
878 except StopIteration: