1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
import os, sys, zlib, time, subprocess, struct, stat, re, tempfile
import heapq
from bup.helpers import *

verbose = 0
ignore_midx = 0
home_repodir = os.path.expanduser('~/.bup')
repodir = None

_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

_total_searches = 0
_total_steps = 0


class GitError(Exception):
    pass


26 """Get the path to the git repository or one of its subdirectories."""
29 raise GitError('You should call check_repo_or_die()')
31 # If there's a .git subdirectory, then the actual repo is in there.
32 gd = os.path.join(repodir, '.git')
33 if os.path.exists(gd):
36 return os.path.join(repodir, sub)
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
    else:
        return name


(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one
    of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name().
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)


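# Example (an illustrative sketch; the _example_* helpers here and below
# are hypothetical and not used elsewhere in this module): a regular file
# (mode 0100644) stored as a git tree (gitmode 040000) must have been
# chunked, so its name gains '.bup'; demangle_name() reverses the mangling.
def _example_mangle_roundtrip():
    assert mangle_name('foo', 0100644, 040000) == 'foo.bup'
    assert demangle_name('foo.bup') == ('foo', BUP_CHUNKED)
    assert mangle_name('foo.bup', 0100644, 0100644) == 'foo.bup.bupl'
    assert demangle_name('foo.bup.bupl') == ('foo.bup', BUP_NORMAL)

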
def _encode_packobj(type, content):
    szout = ''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += chr(szbits)
        if not sz: break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(1)
    yield szout
    yield z.compress(content)
    yield z.flush()


def _encode_looseobj(type, content):
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()


def _decode_looseobj(buf):
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find('\0')
    assert(i > 0)
    l = s[:i].split(' ')
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)


def _decode_packobj(buf):
    assert(buf)
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))


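# Example (illustrative sketch): the pack-object helpers are inverses of
# each other. _encode_packobj() emits a header holding the type nibble
# plus the size as a varint (4 bits, then 7 bits per continuation byte),
# followed by the zlib-compressed body; _decode_packobj() undoes both.
def _example_packobj_roundtrip():
    data = 'x' * 1000
    raw = ''.join(_encode_packobj('blob', data))
    assert _decode_packobj(raw) == ('blob', data)

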
130 """Object representation of a Git pack index file."""
131 def __init__(self, filename):
133 self.map = mmap_read(open(filename))
134 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
135 self.fanout = list(struct.unpack('!256I',
136 str(buffer(self.map, 8, 256*4))))
137 self.fanout.append(0) # entry "-1"
138 nsha = self.fanout[255]
139 self.ofstable = buffer(self.map,
140 8 + 256*4 + nsha*20 + nsha*4,
142 self.ofs64table = buffer(self.map,
143 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        return ofs

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = ord(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        buf = buffer(self.map, 8 + 256*4, end*20)
        want = str(hash)
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end-start)/2
            v = str(buf[mid*20:(mid+1)*20])
            if v < want:
                start = mid+1
            elif v > want:
                end = mid
            else: # got it!
                return mid
        return None

    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx != None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash):
        """Return nonempty if the object exists in this index."""
        return hash and (self._idx_from_hash(hash) != None) and True or None

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)

    def __len__(self):
        return int(self.fanout[255])


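# Example (illustrative sketch, assuming 'idxpath' names an existing .idx
# file and 'binsha' is a 20-byte binary sha1): PackIdx answers membership
# queries with a fanout-table jump plus a binary search of the sorted shas.
def _example_packidx_lookup(idxpath, binsha):
    ix = PackIdx(idxpath)
    if ix.exists(binsha):
        return ix.find_offset(binsha)  # byte offset inside the .pack file
    return None

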
def extract_bits(buf, nbits):
    """Take the first 'nbits' bits from 'buf' and return them as an integer."""
    mask = (1<<nbits) - 1
    v = struct.unpack('!I', buf[0:4])[0]
    v = (v >> (32-nbits)) & mask
    return v


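# Example (illustrative sketch): extract_bits() is how PackMidx turns the
# leading bits of a sha into a fanout bucket; with nbits=4, the top four
# bits of the first byte select one of 2**4 buckets.
def _example_extract_bits():
    assert extract_bits('\xff\0\0\0', 4) == 0x0f
    assert extract_bits('\x80\0\0\0', 4) == 0x08

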
203 """Wrapper which contains data from multiple index files.
204 Multiple index (.midx) files constitute a wrapper around index (.idx) files
205 and make it possible for bup to expand Git's indexing capabilities to vast
208 def __init__(self, filename):
210 assert(filename.endswith('.midx'))
211 self.map = mmap_read(open(filename))
212 if str(self.map[0:8]) == 'MIDX\0\0\0\1':
213 log('Warning: ignoring old-style midx %r\n' % filename)
216 self.fanout = buffer('\0\0\0\0')
217 self.shalist = buffer('\0'*20)
220 assert(str(self.map[0:8]) == 'MIDX\0\0\0\2')
221 self.bits = struct.unpack('!I', self.map[8:12])[0]
222 self.entries = 2**self.bits
223 self.fanout = buffer(self.map, 12, self.entries*4)
224 shaofs = 12 + self.entries*4
225 nsha = self._fanget(self.entries-1)
226 self.shalist = buffer(self.map, shaofs, nsha*20)
227 self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
    def _fanget(self, i):
        start = i*4
        s = self.fanout[start:start+4]
        return struct.unpack('!I', s)[0]

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches, _total_steps
        _total_searches += 1
        want = str(hash)
        el = extract_bits(want, self.bits)
        if el:
            start = self._fanget(el-1)
        else:
            start = 0
        end = self._fanget(el)
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end-start)/2
            v = str(self.shalist[mid*20:(mid+1)*20])
            if v < want:
                start = mid+1
            elif v > want:
                end = mid
            else: # got it!
                return True
        return None

    def __iter__(self):
        for i in xrange(self._fanget(self.entries-1)):
            yield buffer(self.shalist, i*20, 20)

    def __len__(self):
        return int(self._fanget(self.entries-1))


_mpi_count = 0
class PackIdxList:
    def __init__(self, dir):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = {}
        self.packs = []
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            if p.exists(hash):
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return p.name
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of their
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        skip_midx = skip_midx or ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                for ix in self.packs:
                    if isinstance(ix, PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for f in os.listdir(self.dir):
                    full = os.path.join(self.dir, f)
                    if f.endswith('.midx') and not d.get(full):
                        mx = PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = 0
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     '  used by %s\n') % (n, mxf))
                                broken += 1
                        if not broken:
                            midxl.append(mx)
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
                for ix in midxl:
                    any = 0
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            d[ix.name] = ix
                            for name in ix.idxnames:
                                d[os.path.join(self.dir, name)] = ix
                            any += 1
                            break
                    if not any:
                        log('midx: removing redundant: %s\n'
                            % os.path.basename(ix.name))
                        unlink(ix.name)
            for f in os.listdir(self.dir):
                full = os.path.join(self.dir, f)
                if f.endswith('.idx') and not d.get(full):
                    ix = PackIdx(full)
                    d[full] = ix
            self.packs = list(set(d.values()))
        log('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

364 """Insert an additional object in the list."""
368 """Remove all additional objects from the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = '%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()


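# Example (illustrative sketch): calc_hash() matches git's own object ids,
# since git hashes '<type> <size>\0' plus the content. For an empty blob
# that is the well-known sha1 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.
def _example_calc_hash():
    assert calc_hash('blob', '').encode('hex') == \
           'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'

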
def _shalist_sort_key(ent):
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):
        return name + '/'
    else:
        return name


def idxmerge(idxlist):
    """Generate a list of all the objects reachable in a PackIdxList."""
    total = sum(len(i) for i in idxlist)
    iters = (iter(i) for i in idxlist)
    heap = [(next(it), it) for it in iters]
    heapq.heapify(heap)
    count = 0
    last = None
    while heap:
        if (count % 10024) == 0:
            progress('Reading indexes: %.2f%% (%d/%d)\r'
                     % (count*100.0/total, count, total))
        (e, it) = heap[0]
        if e != last:
            yield e
            last = e
        count += 1
        e = next(it)
        if e:
            heapq.heapreplace(heap, (e, it))
        else:
            heapq.heappop(heap)
    log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))


414 """Writes Git objects insid a pack file."""
415 def __init__(self, objcache_maker=None):
420 self.objcache_maker = objcache_maker
426 def _make_objcache(self):
427 if self.objcache == None:
428 if self.objcache_maker:
429 self.objcache = self.objcache_maker()
431 self.objcache = PackIdxList(repo('objects/pack'))
    def _open(self):
        if not self.file:
            self._make_objcache()
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            self.file.write('PACK\0\0\0\2\0\0\0\0')

    def _raw_write(self, datalist):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        f.write(oneblob)
        self.outbytes += len(oneblob)
        self.count += 1

    def _write(self, bin, type, content):
        if verbose:
            log('>')
        self._raw_write(_encode_packobj(type, content))
        return bin

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def write(self, type, content):
        """Write an object in this pack file."""
        return self._write(calc_hash(type, content), type, content)

    def exists(self, id):
        """Return non-empty if an object is found in the object cache."""
        if not self.objcache:
            self._make_objcache()
        return self.objcache.exists(id)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        bin = calc_hash(type, content)
        if not self.exists(bin):
            self._write(bin, type, content)
            self.objcache.add(bin)
        return bin

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        shalist = sorted(shalist, key = _shalist_sort_key)
        l = []
        for (mode,name,bin) in shalist:
            assert(mode)
            assert(mode[0] != '0')
            assert(name)
            assert(len(bin) == 20)
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, msg):
        """Create a commit object in the pack."""
        now = time.time()
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, now, userline, now,
                                  msg)
        return commit

522 """Remove the pack file from disk."""
527 os.unlink(self.filename + '.pack')
    def _end(self):
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        while 1:
            b = f.read(65536)
            sum.update(b)
            if not b: break
        f.write(sum.digest())

        f.close()

        p = subprocess.Popen(['git', 'index-pack', '-v',
                              '--index-version=2',
                              self.filename + '.pack'],
                             preexec_fn = _gitenv,
                             stdout = subprocess.PIPE)
        out = p.stdout.read().strip()
        _git_wait('git index-pack', p)
        if not out:
            raise GitError('git index-pack produced no output')
        nameprefix = repo('objects/pack/%s' % out)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        return nameprefix

569 """Close the pack file and move it to its definitive path."""
def _git_date(date):
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))


def _gitenv():
    os.environ['GIT_DIR'] = os.path.abspath(repo())


def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    if refname:
        argv += [refname]
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))


def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    else:
        return None


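# Example (illustrative sketch): read_ref() is list_refs() narrowed to a
# single name; the returned id is binary, so hex-encode it for display.
def _example_read_ref():
    sha = read_ref('refs/heads/master')
    if sha:
        print sha.encode('hex')

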
def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date,commit).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    assert(not ref.startswith('-'))
    opts = []
    if count:
        opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    commit = None
    for row in p.stdout:
        s = row.strip()
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        else:
            date = int(s)
            yield (date, commit)
    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)


def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    for (date, commit) in rev_list(ref, count=1):
        return date
    raise GitError('no such commit %r' % ref)


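# Example (illustrative sketch): walking a branch newest-first with
# rev_list(); each yielded tuple is (commit-timestamp, binary-sha).
def _example_rev_list():
    for (date, commit) in rev_list('refs/heads/master', count=10):
        print '%s %s' % (time.strftime('%Y-%m-%d', time.localtime(date)),
                         commit.encode('hex'))

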
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    if not oldval:
        oldval = ''
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)


def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = os.environ.get('BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser('~/.bup')


def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)


def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    guess_repo(path)
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            init_repo()
        else:
            log('error: %r is not a bup/git repository\n' % repo())
            sys.exit(15)


703 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
705 while ofs < len(buf):
706 z = buf[ofs:].find('\0')
708 spl = buf[ofs:ofs+z].split(' ', 1)
709 assert(len(spl) == 2)
710 sha = buf[ofs+z+1:ofs+z+1+20]
712 yield (spl[0], spl[1], sha)
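# Example (illustrative sketch): a git tree entry is '<mode> <name>\0'
# followed by a raw 20-byte sha, so treeparse() decodes exactly the
# format that PackWriter.new_tree() builds.
def _example_treeparse():
    buf = '100644 hello.txt\0' + '\xee'*20
    assert list(treeparse(buf)) == [('100644', 'hello.txt', '\xee'*20)]

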
717 """Get Git's version and ensure a usable version is installed.
719 The returned version is formatted as an ordered tuple with each position
720 representing a digit in the version tag. For example, the following tuple
721 would represent version 1.6.6.9:
727 p = subprocess.Popen(['git', '--version'],
728 stdout=subprocess.PIPE)
729 gvs = p.stdout.read()
730 _git_wait('git --version', p)
731 m = re.match(r'git version (\S+.\S+)', gvs)
733 raise GitError('git --version weird output: %r' % gvs)
734 _ver = tuple(m.group(1).split('.'))
735 needed = ('1','5', '3', '1')
737 raise GitError('git version %s or higher is required; you have %s'
738 % ('.'.join(needed), '.'.join(_ver)))
def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%s returned %d' % (cmd, rv))


def _git_capture(argv):
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r


class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def next(self):
        try:
            return self.it.next()
        except StopIteration, e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    def abort(self):
        """Abort iteration and call the abortion callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()


787 """Link to 'git cat-file' that is used to retrieve blob data."""
790 wanted = ('1','5','6')
793 log('warning: git version < %s; bup will be slow.\n'
796 self.get = self._slow_get
798 self.p = self.inprogress = None
799 self.get = self._fast_get
    def _abort(self):
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  preexec_fn = _gitenv)

    def _fast_get(self, id):
        if not self.p or self.p.poll() != None:
            self._restart()
        assert(self.p)
        assert(self.p.poll() == None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open'
                % (id, self.inprogress))
        assert(not self.inprogress)
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        try:
            yield type
            for blob in it:
                yield blob
            assert(self.p.stdout.readline() == '\n')
            self.inprogress = None
        except Exception, e:
            it.abort()
            raise

    def _slow_get(self, id):
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        yield type

        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        type = it.next()
        if type == 'blob':
            for blob in it:
                yield blob
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif type == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % type)

884 """Generate a list of the content of all blobs that can be reached
885 from an object. The hash given in 'id' must point to a blob, a tree
886 or a commit. The content of all blobs that can be seen from trees or
887 commits will be added to the list.
890 for d in self._join(self.get(id)):
892 except StopIteration:
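# Example (illustrative sketch): CatPipe.join() flattens any object down
# to its blob contents, so restoring a file that was stored as a chunked
# tree is just a concatenation of the yielded chunks.
def _example_catpipe_join(hexsha, outfile):
    cp = CatPipe()
    for blob in cp.join(hexsha):
        outfile.write(blob)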