1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, errno, zlib, time, subprocess, struct, stat, re, tempfile, heapq
7 from bup.helpers import *
11 home_repodir = os.path.expanduser('~/.bup')
14 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
15 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
18 class GitError(Exception):
23 """Get the path to the git repository or one of its subdirectories."""
26 raise GitError('You should call check_repo_or_die()')
28 # If there's a .git subdirectory, then the actual repo is in there.
29 gd = os.path.join(repodir, '.git')
30 if os.path.exists(gd):
33 return os.path.join(repodir, sub)
36 def mangle_name(name, mode, gitmode):
37 """Mangle a file name to present an abstract name for segmented files.
38 Mangled file names will have the ".bup" extension added to them. If a
39 file's name already ends with ".bup", a ".bupl" extension is added to
40 disambiguate normal files from segmented ones.
42 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
44 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
50 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
51 def demangle_name(name):
52 """Remove name mangling from a file name, if necessary.
54 The return value is a tuple (demangled_filename,mode), where mode is one of
57 * BUP_NORMAL : files that should be read as-is from the repository
58 * BUP_CHUNKED : files that were chunked and need to be assembled
60 For more information on the name mangling algorithm, see mangle_name()
62 if name.endswith('.bupl'):
63 return (name[:-5], BUP_NORMAL)
64 elif name.endswith('.bup'):
65 return (name[:-4], BUP_CHUNKED)
67 return (name, BUP_NORMAL)
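# Illustrative round trip through mangle_name()/demangle_name(). The demo
# function below is not part of the original module and the file names and
# modes are invented: a regular file that bup split into a chunked tree gains
# a '.bup' suffix, while a file whose real name already ends in '.bup' gains
# '.bupl' so it still demangles to a normal file.
def _mangle_name_example():
    n = mangle_name('kernel.img', 0100644, 040000)     # stored as a tree
    assert n == 'kernel.img.bup'
    assert demangle_name(n) == ('kernel.img', BUP_CHUNKED)
    n = mangle_name('notes.bup', 0100644, 0100644)      # stored as-is
    assert n == 'notes.bup.bupl'
    assert demangle_name(n) == ('notes.bup', BUP_NORMAL)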
70 def _encode_packobj(type, content):
73 szbits = (sz & 0x0f) | (_typemap[type]<<4)
82 z = zlib.compressobj(1)
84 yield z.compress(content)
88 def _encode_looseobj(type, content):
89 z = zlib.compressobj(1)
90 yield z.compress('%s %d\0' % (type, len(content)))
91 yield z.compress(content)
95 def _decode_looseobj(buf):
97 s = zlib.decompress(buf)
104 assert(type in _typemap)
105 assert(sz == len(content))
106 return (type, content)
109 def _decode_packobj(buf):
112 type = _typermap[(c & 0x70) >> 4]
119 sz |= (c & 0x7f) << shift
123 return (type, zlib.decompress(buf[i+1:]))
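# A small sanity check of the pack-object header encoding above: the first
# byte carries the type in bits 4-6 and the low 4 bits of the size, the high
# bit marks continuation bytes of 7 size bits each, and the payload is
# zlib-deflated. (Demo function name and content are made up.)
def _packobj_example():
    content = 'x' * 300                        # needs two size bytes
    buf = ''.join(_encode_packobj('blob', content))
    assert _decode_packobj(buf) == ('blob', content)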
127 """Object representation of a Git pack index file."""
128 def __init__(self, filename):
130 self.map = mmap_read(open(filename))
131 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
132 self.fanout = list(struct.unpack('!256I',
133 str(buffer(self.map, 8, 256*4))))
134 self.fanout.append(0) # entry "-1"
135 nsha = self.fanout[255]
136 self.ofstable = buffer(self.map,
137 8 + 256*4 + nsha*20 + nsha*4,
139 self.ofs64table = buffer(self.map,
140 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
142 def _ofs_from_idx(self, idx):
143 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
145 idx64 = ofs & 0x7fffffff
146 ofs = struct.unpack('!Q',
147 str(buffer(self.ofs64table, idx64*8, 8)))[0]
150 def _idx_from_hash(self, hash):
151 assert(len(hash) == 20)
153 start = self.fanout[b1-1] # range -1..254
154 end = self.fanout[b1] # range 0..255
155 buf = buffer(self.map, 8 + 256*4, end*20)
158 mid = start + (end-start)/2
159 v = str(buf[mid*20:(mid+1)*20])
168 def find_offset(self, hash):
169 """Get the offset of an object inside the index file."""
170 idx = self._idx_from_hash(hash)
172 return self._ofs_from_idx(idx)
175 def exists(self, hash):
176 """Return nonempty if the object exists in this index."""
177 return hash and (self._idx_from_hash(hash) != None) and True or None
180 for i in xrange(self.fanout[255]):
181 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
184 return int(self.fanout[255])
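# The fanout table used by _idx_from_hash() is cumulative: fanout[b] is the
# number of stored sha1s whose first byte is <= b, so the sha1s starting with
# byte b occupy rows fanout[b-1]..fanout[b]-1 of the sorted sha table, and
# fanout[255] is the total object count. A stand-alone illustration with
# three invented first bytes:
def _fanout_example():
    firstbytes = [0x00, 0x00, 0xfe]
    fanout = [0] * 256
    for b in firstbytes:
        for i in xrange(b, 256):
            fanout[i] += 1
    assert fanout[0x00] == 2                   # two sha1s start with 0x00
    assert (fanout[0xfd], fanout[0xfe]) == (2, 3)
    assert fanout[255] == len(firstbytes)      # == len(PackIdx)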
187 def extract_bits(buf, nbits):
188 """Take the first 'nbits' bits from 'buf' and return them as an integer."""
189 mask = (1<<nbits) - 1
190 v = struct.unpack('!I', buf[0:4])[0]
191 v = (v >> (32-nbits)) & mask
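# extract_bits() simply keeps the top 'nbits' of the first 32 bits of the
# buffer, read big-endian; a couple of invented values for illustration:
def _extract_bits_example():
    sha = '\xfa\xce\xb0\x0c' + '\0' * 16
    assert extract_bits(sha, 8) == 0xfa
    assert extract_bits(sha, 12) == 0xfac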
196 """Wrapper which contains data from multiple index files.
197 Multiple index (.midx) files constitute a wrapper around index (.idx) files
198 and make it possible for bup to expand Git's indexing capabilities to vast
201 def __init__(self, filename):
203 assert(filename.endswith('.midx'))
204 self.map = mmap_read(open(filename))
205 if str(self.map[0:8]) == 'MIDX\0\0\0\1':
206 log('Warning: ignoring old-style midx %r\n' % filename)
209 self.fanout = buffer('\0\0\0\0')
210 self.shalist = buffer('\0'*20)
213 assert(str(self.map[0:8]) == 'MIDX\0\0\0\2')
214 self.bits = struct.unpack('!I', self.map[8:12])[0]
215 self.entries = 2**self.bits
216 self.fanout = buffer(self.map, 12, self.entries*4)
217 shaofs = 12 + self.entries*4
218 nsha = self._fanget(self.entries-1)
219 self.shalist = buffer(self.map, shaofs, nsha*20)
220 self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
222 def _fanget(self, i):
224 s = self.fanout[start:start+4]
225 return struct.unpack('!I', s)[0]
227 def exists(self, hash):
228 """Return nonempty if the object exists in the index files."""
230 el = extract_bits(want, self.bits)
232 start = self._fanget(el-1)
235 end = self._fanget(el)
237 mid = start + (end-start)/2
238 v = str(self.shalist[mid*20:(mid+1)*20])
248 for i in xrange(self._fanget(self.entries-1)):
249 yield buffer(self.shalist, i*20, 20)
252 return int(self._fanget(self.entries-1))
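# Minimal header dump mirroring the MIDX2 parsing in PackMidx.__init__()
# above: 8-byte magic, then the number of fanout bits, then 2**bits fanout
# entries followed by the sorted sha1 list and the '\0'-separated .idx names.
# (The helper and its argument are hypothetical; it only reads the header.)
def _midx_header_example(path):
    f = open(path, 'rb')
    magic = f.read(8)                          # 'MIDX\0\0\0\2' for current files
    bits = struct.unpack('!I', f.read(4))[0]
    f.close()
    return (magic, bits, 2**bits)              # (magic, bits, fanout entries)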
257 def __init__(self, dir):
259 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
269 assert(_mpi_count == 0)
272 return iter(idxmerge(self.packs))
274 def exists(self, hash):
275 """Return nonempty if the object exists in the index files."""
276 if hash in self.also:
278 for i in range(len(self.packs)):
281 # reorder so most recently used packs are searched first
282 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
286 def refresh(self, skip_midx = False):
287 """Refresh the index list.
288 This method verifies whether any .midx files were superseded (e.g. all of
289 their contents are in another, bigger .midx file) and removes the superseded
292 If skip_midx is True, all work on .midx files will be skipped and .midx
293 files will be removed from the list.
295 The module-global variable 'ignore_midx' can force this function to
296 always act as if skip_midx was True.
298 skip_midx = skip_midx or ignore_midx
299 d = dict((p.name, p) for p in self.packs
300 if not skip_midx or not isinstance(p, PackMidx))
301 if os.path.exists(self.dir):
304 for ix in self.packs:
305 if isinstance(ix, PackMidx):
306 for name in ix.idxnames:
307 d[os.path.join(self.dir, name)] = ix
308 for f in os.listdir(self.dir):
309 full = os.path.join(self.dir, f)
310 if f.endswith('.midx') and not d.get(full):
312 (mxd, mxf) = os.path.split(mx.name)
314 for n in mx.idxnames:
315 if not os.path.exists(os.path.join(mxd, n)):
316 log(('warning: index %s missing\n' +
317 ' used by %s\n') % (n, mxf))
321 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
324 for sub in ix.idxnames:
325 found = d.get(os.path.join(self.dir, sub))
326 if not found or isinstance(found, PackIdx):
327 # doesn't exist, or exists but not in a midx
329 for name in ix.idxnames:
330 d[os.path.join(self.dir, name)] = ix
334 log('midx: removing redundant: %s\n'
335 % os.path.basename(ix.name))
337 for f in os.listdir(self.dir):
338 full = os.path.join(self.dir, f)
339 if f.endswith('.idx') and not d.get(full):
342 self.packs = list(set(d.values()))
343 log('PackIdxList: using %d index%s.\n'
344 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
347 """Insert an additional object in the list."""
351 """Remove all additional objects from the list."""
355 def calc_hash(type, content):
356 """Calculate some content's hash in the Git fashion."""
357 header = '%s %d\0' % (type, len(content))
363 def _shalist_sort_key(ent):
364 (mode, name, id) = ent
365 if stat.S_ISDIR(int(mode, 8)):
371 def idxmerge(idxlist):
372 """Generate a list of all the objects reachable in a PackIdxList."""
373 total = sum(len(i) for i in idxlist)
374 iters = (iter(i) for i in idxlist)
375 heap = [(next(it), it) for it in iters]
380 if (count % 10024) == 0:
381 progress('Reading indexes: %.2f%% (%d/%d)\r'
382 % (count*100.0/total, count, total))
390 heapq.heapreplace(heap, (e, it))
393 log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
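# The same merge-by-heap pattern as idxmerge(), shown on two plain sorted
# lists so the loop above is easier to follow (illustrative only; idxmerge
# additionally skips duplicate entries and reports progress):
def _idxmerge_example():
    iters = [iter(['apple', 'cherry']), iter(['banana', 'date'])]
    heap = [(it.next(), it) for it in iters]
    heapq.heapify(heap)
    out = []
    while heap:
        (e, it) = heap[0]
        out.append(e)
        try:
            heapq.heapreplace(heap, (it.next(), it))
        except StopIteration:
            heapq.heappop(heap)
    assert out == ['apple', 'banana', 'cherry', 'date']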
397 """Writes Git objects insid a pack file."""
398 def __init__(self, objcache_maker=None):
403 self.objcache_maker = objcache_maker
409 def _make_objcache(self):
410 if not self.objcache:
411 if self.objcache_maker:
412 self.objcache = self.objcache_maker()
414 self.objcache = PackIdxList(repo('objects/pack'))
418 self._make_objcache()
419 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
420 self.file = os.fdopen(fd, 'w+b')
421 assert(name.endswith('.pack'))
422 self.filename = name[:-5]
423 self.file.write('PACK\0\0\0\2\0\0\0\0')
425 def _raw_write(self, datalist):
428 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
429 # the file never has a *partial* blob. So let's make sure it's
430 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
431 # to our hashsplit algorithm.) f.write() does its own buffering,
432 # but that's okay because we'll flush it in _end().
433 oneblob = ''.join(datalist)
435 self.outbytes += len(oneblob)
438 def _write(self, bin, type, content):
441 self._raw_write(_encode_packobj(type, content))
444 def breakpoint(self):
445 """Clear byte and object counts and return the last processed id."""
447 self.outbytes = self.count = 0
450 def write(self, type, content):
451 """Write an object in this pack file."""
452 return self._write(calc_hash(type, content), type, content)
454 def exists(self, id):
455 """Return non-empty if an object is found in the object cache."""
456 if not self.objcache:
457 self._make_objcache()
458 return self.objcache.exists(id)
460 def maybe_write(self, type, content):
461 """Write an object to the pack file if not present and return its id."""
462 bin = calc_hash(type, content)
463 if not self.exists(bin):
464 self._write(bin, type, content)
465 self.objcache.add(bin)
468 def new_blob(self, blob):
469 """Create a blob object in the pack with the supplied content."""
470 return self.maybe_write('blob', blob)
472 def new_tree(self, shalist):
473 """Create a tree object in the pack."""
474 shalist = sorted(shalist, key = _shalist_sort_key)
476 for (mode,name,bin) in shalist:
479 assert(mode[0] != '0')
481 assert(len(bin) == 20)
482 l.append('%s %s\0%s' % (mode,name,bin))
483 return self.maybe_write('tree', ''.join(l))
485 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
487 if tree: l.append('tree %s' % tree.encode('hex'))
488 if parent: l.append('parent %s' % parent.encode('hex'))
489 if author: l.append('author %s %s' % (author, _git_date(adate)))
490 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
493 return self.maybe_write('commit', '\n'.join(l))
495 def new_commit(self, parent, tree, msg):
496 """Create a commit object in the pack."""
498 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
499 commit = self._new_commit(tree, parent,
500 userline, now, userline, now,
505 """Remove the pack file from disk."""
510 os.unlink(self.filename + '.pack')
514 if not f: return None
518 # update object count
520 cp = struct.pack('!i', self.count)
524 # calculate the pack sha1sum
531 f.write(sum.digest())
535 p = subprocess.Popen(['git', 'index-pack', '-v',
537 self.filename + '.pack'],
538 preexec_fn = _gitenv,
539 stdout = subprocess.PIPE)
540 out = p.stdout.read().strip()
541 _git_wait('git index-pack', p)
543 raise GitError('git index-pack produced no output')
544 nameprefix = repo('objects/pack/%s' % out)
545 if os.path.exists(self.filename + '.map'):
546 os.unlink(self.filename + '.map')
547 os.rename(self.filename + '.pack', nameprefix + '.pack')
548 os.rename(self.filename + '.idx', nameprefix + '.idx')
552 """Close the pack file and move it to its definitive path."""
557 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
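# Rough usage sketch for the PackWriter class above (all values invented; a
# real caller needs a repository first, see check_repo_or_die() below):
def _packwriter_example():
    w = PackWriter()
    try:
        blob = w.new_blob('hello world\n')                  # 20-byte sha1
        tree = w.new_tree([('100644', 'hello.txt', blob)])
        assert len(blob) == len(tree) == 20
    finally:
        w.close()        # or w.abort() to discard the half-written pack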
561 os.environ['GIT_DIR'] = os.path.abspath(repo())
564 def list_refs(refname = None):
565 """Generate a list of tuples in the form (refname,hash).
566 If a ref name is specified, list only this particular ref.
568 argv = ['git', 'show-ref', '--']
571 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
572 out = p.stdout.read().strip()
573 rv = p.wait() # not fatal
577 for d in out.split('\n'):
578 (sha, name) = d.split(' ', 1)
579 yield (name, sha.decode('hex'))
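# Example of walking the refs with list_refs() (needs a valid repository; the
# output format in the comment is only indicative):
def _list_refs_example():
    for (name, sha) in list_refs():
        log('%s %s\n' % (sha.encode('hex'), name))   # e.g. 'f00f... refs/heads/master'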
582 def read_ref(refname):
583 """Get the commit id of the most recent commit made on a given ref."""
584 l = list(list_refs(refname))
592 def rev_list(ref, count=None):
593 """Generate a list of reachable commits in reverse chronological order.
595 This generator walks through commits, from child to parent, that are
596 reachable via the specified ref and yields a series of tuples of the form
599 If count is a non-zero integer, limit the number of commits to "count"
602 assert(not ref.startswith('-'))
605 opts += ['-n', str(atoi(count))]
606 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
607 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
611 if s.startswith('commit '):
612 commit = s[7:].decode('hex')
616 rv = p.wait() # not fatal
618 raise GitError('git rev-list returned error %d' % rv)
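# Hypothetical walk over the three newest commits reachable from a ref, using
# rev_list() above (the branch name is only an example):
def _rev_list_example(ref='refs/heads/master'):
    for (date, commit) in rev_list(ref, count=3):
        log('%s %s\n' % (time.strftime('%Y-%m-%d', time.localtime(date)),
                         commit.encode('hex')))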
621 def rev_get_date(ref):
622 """Get the date of the latest commit on the specified ref."""
623 for (date, commit) in rev_list(ref, count=1):
625 raise GitError('no such commit %r' % ref)
628 def update_ref(refname, newval, oldval):
629 """Change the commit pointed to by a branch."""
632 assert(refname.startswith('refs/heads/'))
633 p = subprocess.Popen(['git', 'update-ref', refname,
634 newval.encode('hex'), oldval.encode('hex')],
635 preexec_fn = _gitenv)
636 _git_wait('git update-ref', p)
639 def guess_repo(path=None):
640 """Set the path value in the global variable "repodir".
641 This makes bup look for an existing bup repository, but not fail if a
642 repository doesn't exist. Usually, if you are interacting with a bup
643 repository, you would not be calling this function but using
650 repodir = os.environ.get('BUP_DIR')
652 repodir = os.path.expanduser('~/.bup')
655 def init_repo(path=None):
656 """Create the Git bare repository for bup in a given path."""
659 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
660 raise GitError('"%d" exists but is not a directory\n' % d)
661 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
662 preexec_fn = _gitenv)
663 _git_wait('git init', p)
664 # Force the index version configuration in order to ensure bup works
665 # regardless of the version of the installed Git binary.
666 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
667 stdout=sys.stderr, preexec_fn = _gitenv)
668 _git_wait('git config', p)
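# Typical bootstrap for a script using this module: point BUP_DIR somewhere,
# create the bare repository if needed, then let check_repo_or_die() set the
# module-global 'repodir' so repo() works (the directory name is an example):
def _init_repo_example():
    os.environ['BUP_DIR'] = '/tmp/demo.bup'
    init_repo()
    check_repo_or_die()
    log('repository is at %r\n' % repo())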
671 def check_repo_or_die(path=None):
672 """Make sure a bup repository exists, and abort if not.
673 If the path to a particular repository was not specified, this function
674 initializes the default repository automatically.
677 if not os.path.isdir(repo('objects/pack/.')):
678 if repodir == home_repodir:
681 log('error: %r is not a bup/git repository\n' % repo())
687 while ofs < len(buf):
688 z = buf[ofs:].find('\0')
690 spl = buf[ofs:ofs+z].split(' ', 1)
691 assert(len(spl) == 2)
692 sha = buf[ofs+z+1:ofs+z+1+20]
694 yield (spl[0], spl[1], sha)
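# The raw tree-entry format parsed above is 'MODE NAME\0' followed by the
# 20-byte binary sha1 of the entry; a tiny hand-built example (values made up):
def _treeparse_example():
    sha = '\xab' * 20
    buf = '100644 hello.txt\0' + sha
    assert list(_treeparse(buf)) == [('100644', 'hello.txt', sha)]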
699 """Get Git's version and ensure a usable version is installed.
701 The returned version is formatted as an ordered tuple with each position
702 representing a digit in the version tag. For example, the following tuple
703 would represent version 1.6.6.9:
709 p = subprocess.Popen(['git', '--version'],
710 stdout=subprocess.PIPE)
711 gvs = p.stdout.read()
712 _git_wait('git --version', p)
713 m = re.match(r'git version (\S+\.\S+)', gvs)
715 raise GitError('git --version weird output: %r' % gvs)
716 _ver = tuple(m.group(1).split('.'))
717 needed = ('1','5', '3', '1')
719 raise GitError('git version %s or higher is required; you have %s'
720 % ('.'.join(needed), '.'.join(_ver)))
724 def _git_wait(cmd, p):
727 raise GitError('%s returned %d' % (cmd, rv))
730 def _git_capture(argv):
731 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
733 _git_wait(repr(argv), p)
737 class _AbortableIter:
738 def __init__(self, it, onabort = None):
740 self.onabort = onabort
748 return self.it.next()
749 except StopIteration, e:
757 """Abort iteration and call the abortion callback, if needed."""
769 """Link to 'git cat-file' that is used to retrieve blob data."""
772 wanted = ('1','5','6')
775 log('warning: git version < %s; bup will be slow.\n'
778 self.get = self._slow_get
780 self.p = self.inprogress = None
781 self.get = self._fast_get
785 self.p.stdout.close()
788 self.inprogress = None
792 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
793 stdin=subprocess.PIPE,
794 stdout=subprocess.PIPE,
795 preexec_fn = _gitenv)
797 def _fast_get(self, id):
798 if not self.p or self.p.poll() != None:
801 assert(self.p.poll() == None)
803 log('_fast_get: opening %r while %r is open'
804 % (id, self.inprogress))
805 assert(not self.inprogress)
806 assert(id.find('\n') < 0)
807 assert(id.find('\r') < 0)
810 self.p.stdin.write('%s\n' % id)
811 hdr = self.p.stdout.readline()
812 if hdr.endswith(' missing\n'):
813 raise KeyError('blob %r is missing' % id)
815 if len(spl) != 3 or len(spl[0]) != 40:
816 raise GitError('expected blob, got %r' % spl)
817 (hex, type, size) = spl
819 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
820 onabort = self._abort)
825 assert(self.p.stdout.readline() == '\n')
826 self.inprogress = None
831 def _slow_get(self, id):
832 assert(id.find('\n') < 0)
833 assert(id.find('\r') < 0)
835 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
838 p = subprocess.Popen(['git', 'cat-file', type, id],
839 stdout=subprocess.PIPE,
840 preexec_fn = _gitenv)
841 for blob in chunkyreader(p.stdout):
843 _git_wait('git cat-file', p)
851 treefile = ''.join(it)
852 for (mode, name, sha) in _treeparse(treefile):
853 for blob in self.join(sha.encode('hex')):
855 elif type == 'commit':
856 treeline = ''.join(it).split('\n')[0]
857 assert(treeline.startswith('tree '))
858 for blob in self.join(treeline[5:]):
861 raise GitError('invalid object type %r: expected blob/tree/commit'
865 """Generate a list of the content of all blobs that can be reached
866 from an object. The hash given in 'id' must point to a blob, a tree
867 or a commit. The content of all blobs that can be seen from trees or
868 commits will be added to the list.
871 for d in self._join(self.get(id)):
873 except StopIteration: