1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, zlib, time, subprocess, struct, stat, re, tempfile
7 from bup.helpers import *
8 from bup import _helpers
# Default repository location, used when no explicit path/BUP_DIR is given.
# NOTE(review): other module globals referenced later (repodir, ignore_midx,
# MIDX_VERSION, _total_searches, _total_steps, _mpi_count) appear elided
# from this listing.
home_repodir = os.path.expanduser('~/.bup')

# Git pack object type name <-> numeric code mappings (pack format codes).
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
class GitError(Exception):
    """Raised for bup-specific git failures.

    Covers a missing/invalid repository and failed git subcommands; the
    exception message carries the details.
    """
    pass
29 """Get the path to the git repository or one of its subdirectories."""
32 raise GitError('You should call check_repo_or_die()')
34 # If there's a .git subdirectory, then the actual repo is in there.
35 gd = os.path.join(repodir, '.git')
36 if os.path.exists(gd):
39 return os.path.join(repodir, sub)
42 def mangle_name(name, mode, gitmode):
43 """Mangle a file name to present an abstract name for segmented files.
44 Mangled file names will have the ".bup" extension added to them. If a
45 file's name already ends with ".bup", a ".bupl" extension is added to
46 disambiguate normal files from semgmented ones.
48 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
50 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
# File "modes" returned by demangle_name().
(BUP_NORMAL, BUP_CHUNKED) = (0,1)


def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of
    the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
def _encode_packobj(type, content):
    """Yield the pack-format encoding of one object: a variable-length
    size/type header followed by the zlib-compressed content.

    NOTE(review): the size-header loop and the trailing z.flush() were elided
    from this listing and are reconstructed here — without the flush the
    compressed stream is truncated.
    """
    szout = ''
    sz = len(content)
    # Low 4 bits of the size share the first byte with the 3-bit type code.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz:
            szbits |= 0x80  # continuation bit: more size bytes follow
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(1)
    yield szout
    yield z.compress(content)
    yield z.flush()
def _encode_looseobj(type, content):
    """Yield the loose-object encoding of 'content': zlib-compressed
    '<type> <size>\\0' header followed by the content.

    The trailing z.flush() is required to terminate the zlib stream; it was
    elided from this listing and is restored here.
    """
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()
def _decode_looseobj(buf):
    """Decode a loose object; return (type, content).

    Inverse of _encode_looseobj(). NOTE(review): the header-parsing
    statements were elided from this listing and are reconstructed here.
    """
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find('\0')
    assert(i > 0)
    l = s[:i].split(' ')
    assert(len(l) == 2)
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    """Decode a pack-format object; return (type, content).

    Inverse of _encode_packobj(). NOTE(review): the variable-length size
    loop was elided from this listing and is reconstructed here.
    """
    assert(buf)
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    # Size continues while the continuation bit (0x80) is set.
    while c & 0x80:
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
133 """Object representation of a Git pack index file."""
134 def __init__(self, filename):
136 self.map = mmap_read(open(filename))
137 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
138 self.fanout = list(struct.unpack('!256I',
139 str(buffer(self.map, 8, 256*4))))
140 self.fanout.append(0) # entry "-1"
141 nsha = self.fanout[255]
142 self.ofstable = buffer(self.map,
143 8 + 256*4 + nsha*20 + nsha*4,
145 self.ofs64table = buffer(self.map,
146 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
148 def _ofs_from_idx(self, idx):
149 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
151 idx64 = ofs & 0x7fffffff
152 ofs = struct.unpack('!I',
153 str(buffer(self.ofs64table, idx64*8, 8)))[0]
156 def _idx_from_hash(self, hash):
157 global _total_searches, _total_steps
159 assert(len(hash) == 20)
161 start = self.fanout[b1-1] # range -1..254
162 end = self.fanout[b1] # range 0..255
163 buf = buffer(self.map, 8 + 256*4, end*20)
165 _total_steps += 1 # lookup table is a step
168 mid = start + (end-start)/2
169 v = str(buf[mid*20:(mid+1)*20])
178 def find_offset(self, hash):
179 """Get the offset of an object inside the index file."""
180 idx = self._idx_from_hash(hash)
182 return self._ofs_from_idx(idx)
185 def exists(self, hash):
186 """Return nonempty if the object exists in this index."""
187 return hash and (self._idx_from_hash(hash) != None) and True or None
190 for i in xrange(self.fanout[255]):
191 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
194 return int(self.fanout[255])
# Use the C implementation from the _helpers extension; used below for
# midx prefix-bit lookups.
extract_bits = _helpers.extract_bits
201 """Wrapper which contains data from multiple index files.
202 Multiple index (.midx) files constitute a wrapper around index (.idx) files
203 and make it possible for bup to expand Git's indexing capabilities to vast
206 def __init__(self, filename):
208 self.force_keep = False
209 assert(filename.endswith('.midx'))
210 self.map = mmap_read(open(filename))
211 if str(self.map[0:4]) != 'MIDX':
212 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
213 self.force_keep = True
214 return self._init_failed()
215 ver = struct.unpack('!I', self.map[4:8])[0]
216 if ver < MIDX_VERSION:
217 log('Warning: ignoring old-style (v%d) midx %r\n'
219 self.force_keep = False # old stuff is boring
220 return self._init_failed()
221 if ver > MIDX_VERSION:
222 log('Warning: ignoring too-new (v%d) midx %r\n'
224 self.force_keep = True # new stuff is exciting
225 return self._init_failed()
227 self.bits = _helpers.firstword(self.map[8:12])
228 self.entries = 2**self.bits
229 self.fanout = buffer(self.map, 12, self.entries*4)
230 shaofs = 12 + self.entries*4
231 nsha = self._fanget(self.entries-1)
232 self.shalist = buffer(self.map, shaofs, nsha*20)
233 self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
235 def _init_failed(self):
238 self.fanout = buffer('\0\0\0\0')
239 self.shalist = buffer('\0'*20)
242 def _fanget(self, i):
244 s = self.fanout[start:start+4]
245 return _helpers.firstword(s)
248 return str(self.shalist[i*20:(i+1)*20])
250 def exists(self, hash):
251 """Return nonempty if the object exists in the index files."""
252 global _total_searches, _total_steps
255 el = extract_bits(want, self.bits)
257 start = self._fanget(el-1)
258 startv = el << (32-self.bits)
262 end = self._fanget(el)
263 endv = (el+1) << (32-self.bits)
264 _total_steps += 1 # lookup table is a step
265 hashv = _helpers.firstword(hash)
266 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
269 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
270 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
271 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
273 #print ' %08x' % self._num(v)
276 startv = _helpers.firstword(v)
279 endv = _helpers.firstword(v)
285 for i in xrange(self._fanget(self.entries-1)):
286 yield buffer(self.shalist, i*20, 20)
289 return int(self._fanget(self.entries-1))
# NOTE(review): the `class PackIdxList:` header and the __del__/__iter__/
# __len__ method headers are elided from this listing; visible lines are
# preserved verbatim with comments only.

    def __init__(self, dir):
        # Only one PackIdxList may be alive at a time (module-global
        # _mpi_count guard; its definition is elided here).
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        # [elided: self.dir/self.also/self.packs initialization, refresh()]

    # [elided: def __del__(self): tail]
        assert(_mpi_count == 0)

    # [elided: def __iter__(self): — merged iteration over all indexes]
        return iter(idxmerge(self.packs))

    # [elided: def __len__(self):]
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        # 'also' holds hashes injected via add(), checked before the packs.
        if hash in self.also:
        for i in range(len(self.packs)):
            # [elided: p = self.packs[i] and the p.exists(hash) check]
            _total_searches -= 1 # will be incremented by sub-pack
            # reorder so most recently used packs are searched first
            self.packs = [p] + self.packs[:i] + self.packs[i+1:]
        # [elided: return values]

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        skip_midx = skip_midx or ignore_midx
        # Keep the already-loaded packs we still want, keyed by file name.
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            # [elided: `if not skip_midx:` / midxl = [] setup]
            for ix in self.packs:
                if isinstance(ix, PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            for f in os.listdir(self.dir):
                full = os.path.join(self.dir, f)
                if f.endswith('.midx') and not d.get(full):
                    # [elided: mx = PackMidx(full)]
                    (mxd, mxf) = os.path.split(mx.name)
                    # Warn about midx files whose source .idx files vanished.
                    for n in mx.idxnames:
                        if not os.path.exists(os.path.join(mxd, n)):
                            log(('warning: index %s missing\n' +
                                 '  used by %s\n') % (n, mxf))
            # Biggest midx first, so smaller (redundant) ones get dropped.
            midxl.sort(lambda x,y: -cmp(len(x),len(y)))
            # [elided: loop header `for ix in midxl:` and `any = 0`]
            for sub in ix.idxnames:
                found = d.get(os.path.join(self.dir, sub))
                if not found or isinstance(found, PackIdx):
                    # doesn't exist, or exists but not in a midx
                    # [elided: register ix and mark it as used]
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            if not any and not ix.force_keep:
                log('midx: removing redundant: %s\n'
                    % os.path.basename(ix.name))
                # [elided: os.unlink of the redundant midx]
            for f in os.listdir(self.dir):
                full = os.path.join(self.dir, f)
                if f.endswith('.idx') and not d.get(full):
                    # [elided: load PackIdx(full) into d]
        self.packs = list(set(d.values()))
        log('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    # [elided: def add(self, hash): header and body]
        """Insert an additional object in the list."""

    # [elided: def zap_also(self): header and body]
        """Remove all additional objects from the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion.

    Git hashes '<type> <size>\\0' + content with sha1; returns the 20-byte
    binary digest. NOTE(review): the hashing statements were elided from
    this listing and are reconstructed (Sha1 comes from bup.helpers).
    """
    header = '%s %d\0' % (type, len(content))
    csum = Sha1(header)  # renamed from 'sum' to avoid shadowing the builtin
    csum.update(content)
    return csum.digest()
406 def _shalist_sort_key(ent):
407 (mode, name, id) = ent
408 if stat.S_ISDIR(int(mode, 8)):
def idxmerge(idxlist):
    """Generate a list of all the objects reachable in a PackIdxList.

    Performs a heap-based k-way merge of the (sorted) iterators of every
    index, yielding each distinct sha exactly once, in sorted order.
    NOTE(review): the merge loop was elided from this listing and is
    reconstructed here; requires the module-level 'heapq' import.
    """
    total = sum(len(i) for i in idxlist)
    iters = (iter(i) for i in idxlist)
    heap = [(next(it), it) for it in iters]
    heapq.heapify(heap)
    count = 0
    last = None
    while heap:
        if (count % 10024) == 0:
            progress('Reading indexes: %.2f%% (%d/%d)\r'
                     % (count*100.0/total, count, total))
        (e, it) = heap[0]
        if e != last:   # skip duplicates across indexes
            yield e
            last = e
        count += 1
        try:
            e = next(it)
        except StopIteration:
            heapq.heappop(heap)     # this index is exhausted
        else:
            heapq.heapreplace(heap, (e, it))
    log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
440 """Writes Git objects insid a pack file."""
441 def __init__(self, objcache_maker=None):
446 self.objcache_maker = objcache_maker
452 def _make_objcache(self):
453 if self.objcache == None:
454 if self.objcache_maker:
455 self.objcache = self.objcache_maker()
457 self.objcache = PackIdxList(repo('objects/pack'))
461 self._make_objcache()
462 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
463 self.file = os.fdopen(fd, 'w+b')
464 assert(name.endswith('.pack'))
465 self.filename = name[:-5]
466 self.file.write('PACK\0\0\0\2\0\0\0\0')
468 def _raw_write(self, datalist):
471 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
472 # the file never has a *partial* blob. So let's make sure it's
473 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
474 # to our hashsplit algorithm.) f.write() does its own buffering,
475 # but that's okay because we'll flush it in _end().
476 oneblob = ''.join(datalist)
478 self.outbytes += len(oneblob)
481 def _write(self, bin, type, content):
484 self._raw_write(_encode_packobj(type, content))
487 def breakpoint(self):
488 """Clear byte and object counts and return the last processed id."""
490 self.outbytes = self.count = 0
493 def write(self, type, content):
494 """Write an object in this pack file."""
495 return self._write(calc_hash(type, content), type, content)
497 def exists(self, id):
498 """Return non-empty if an object is found in the object cache."""
499 if not self.objcache:
500 self._make_objcache()
501 return self.objcache.exists(id)
503 def maybe_write(self, type, content):
504 """Write an object to the pack file if not present and return its id."""
505 bin = calc_hash(type, content)
506 if not self.exists(bin):
507 self._write(bin, type, content)
508 self.objcache.add(bin)
511 def new_blob(self, blob):
512 """Create a blob object in the pack with the supplied content."""
513 return self.maybe_write('blob', blob)
515 def new_tree(self, shalist):
516 """Create a tree object in the pack."""
517 shalist = sorted(shalist, key = _shalist_sort_key)
519 for (mode,name,bin) in shalist:
522 assert(mode[0] != '0')
524 assert(len(bin) == 20)
525 l.append('%s %s\0%s' % (mode,name,bin))
526 return self.maybe_write('tree', ''.join(l))
528 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
530 if tree: l.append('tree %s' % tree.encode('hex'))
531 if parent: l.append('parent %s' % parent.encode('hex'))
532 if author: l.append('author %s %s' % (author, _git_date(adate)))
533 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
536 return self.maybe_write('commit', '\n'.join(l))
538 def new_commit(self, parent, tree, msg):
539 """Create a commit object in the pack."""
541 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
542 commit = self._new_commit(tree, parent,
543 userline, now, userline, now,
548 """Remove the pack file from disk."""
553 os.unlink(self.filename + '.pack')
557 if not f: return None
561 # update object count
563 cp = struct.pack('!i', self.count)
567 # calculate the pack sha1sum
574 f.write(sum.digest())
578 p = subprocess.Popen(['git', 'index-pack', '-v',
580 self.filename + '.pack'],
581 preexec_fn = _gitenv,
582 stdout = subprocess.PIPE)
583 out = p.stdout.read().strip()
584 _git_wait('git index-pack', p)
586 raise GitError('git index-pack produced no output')
587 nameprefix = repo('objects/pack/%s' % out)
588 if os.path.exists(self.filename + '.map'):
589 os.unlink(self.filename + '.map')
590 os.rename(self.filename + '.pack', nameprefix + '.pack')
591 os.rename(self.filename + '.idx', nameprefix + '.idx')
595 """Close the pack file and move it to its definitive path."""
# NOTE(review): orphaned remnants of two elided helpers.  The first line is
# the body of _git_date(date) — formats a timestamp as git's
# '<epoch> <tzoffset>'.
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))

# The second is the body of _gitenv(), used as subprocess preexec_fn to
# point git at the bup repository.
    os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.

    NOTE(review): the refname filter and the return-code handling were
    elided from this listing and are reconstructed here.
    """
    argv = ['git', 'show-ref', '--']
    if refname:
        argv += [refname]
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref.

    Returns the 20-byte binary sha, or None if the ref does not exist.
    """
    l = list(list_refs(refname))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    else:
        return None
def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date, commit_id).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    assert(not ref.startswith('-'))  # don't let a ref be parsed as an option
    opts = []
    if count:
        opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    commit = None
    for row in p.stdout:
        s = row.strip()
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        else:
            # A bare timestamp line produced by --pretty=format:%ct.
            date = int(s)
            yield (date, commit)
    rv = p.wait()  # not fatal
    if rv:
        # Modernized from the legacy `raise GitError, msg` statement form.
        raise GitError('git rev-list returned error %d' % rv)
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref.

    Raises GitError if the ref yields no commits.
    """
    for (date, commit) in rev_list(ref, count=1):
        return date
    # Modernized from the legacy `raise GitError, msg` statement form.
    raise GitError('no such commit %r' % ref)
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch.

    newval/oldval are 20-byte binary shas; oldval may be falsy for a
    newly-created ref.
    """
    if not oldval:
        oldval = ''
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        # Fall back to $BUP_DIR, then to the default ~/.bup location.
        repodir = os.environ.get('BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the target path exists but is not a directory.
    """
    guess_repo(path)
    d = repo()
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # BUGFIX: the original used '%d' on a string path, which would
        # raise TypeError instead of the intended GitError message.
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    guess_repo(path)
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            # Default location: create it on demand.
            init_repo()
        else:
            log('error: %r is not a bup/git repository\n' % repo())
            sys.exit(15)
729 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
731 while ofs < len(buf):
732 z = buf[ofs:].find('\0')
734 spl = buf[ofs:ofs+z].split(' ', 1)
735 assert(len(spl) == 2)
736 sha = buf[ofs+z+1:ofs+z+1+20]
738 yield (spl[0], spl[1], sha)
743 """Get Git's version and ensure a usable version is installed.
745 The returned version is formatted as an ordered tuple with each position
746 representing a digit in the version tag. For example, the following tuple
747 would represent version 1.6.6.9:
753 p = subprocess.Popen(['git', '--version'],
754 stdout=subprocess.PIPE)
755 gvs = p.stdout.read()
756 _git_wait('git --version', p)
757 m = re.match(r'git version (\S+.\S+)', gvs)
759 raise GitError('git --version weird output: %r' % gvs)
760 _ver = tuple(m.group(1).split('.'))
761 needed = ('1','5', '3', '1')
763 raise GitError('git version %s or higher is required; you have %s'
764 % ('.'.join(needed), '.'.join(_ver)))
768 def _git_wait(cmd, p):
771 raise GitError('%s returned %d' % (cmd, rv))
def _git_capture(argv):
    """Run 'argv' in the repository environment and return its stdout.

    Raises GitError (via _git_wait) if the command fails.
    """
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r
class _AbortableIter:
    # Iterator wrapper that runs an 'onabort' callback if iteration is
    # abandoned before completion (used by CatPipe to resync the pipe).
    # NOTE(review): several statements are elided from this listing.
    def __init__(self, it, onabort = None):
        # [elided: self.it = it and 'done' flag initialization]
        self.onabort = onabort

    # [elided: def __iter__ / def next(self): headers — py2 iterator protocol]
            return self.it.next()
        except StopIteration, e:
        # [elided: done-flag setting, re-raise, and generic-except abort path]

    # [elided: def abort(self): header]
        """Abort iteration and call the abortion callback, if needed."""
        # [elided: abort body and __del__ delegating to abort()]
813 """Link to 'git cat-file' that is used to retrieve blob data."""
816 wanted = ('1','5','6')
819 log('warning: git version < %s; bup will be slow.\n'
822 self.get = self._slow_get
824 self.p = self.inprogress = None
825 self.get = self._fast_get
829 self.p.stdout.close()
832 self.inprogress = None
836 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
837 stdin=subprocess.PIPE,
838 stdout=subprocess.PIPE,
840 preexec_fn = _gitenv)
842 def _fast_get(self, id):
843 if not self.p or self.p.poll() != None:
846 assert(self.p.poll() == None)
848 log('_fast_get: opening %r while %r is open'
849 % (id, self.inprogress))
850 assert(not self.inprogress)
851 assert(id.find('\n') < 0)
852 assert(id.find('\r') < 0)
855 self.p.stdin.write('%s\n' % id)
856 hdr = self.p.stdout.readline()
857 if hdr.endswith(' missing\n'):
858 raise KeyError('blob %r is missing' % id)
860 if len(spl) != 3 or len(spl[0]) != 40:
861 raise GitError('expected blob, got %r' % spl)
862 (hex, type, size) = spl
864 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
865 onabort = self._abort)
870 assert(self.p.stdout.readline() == '\n')
871 self.inprogress = None
876 def _slow_get(self, id):
877 assert(id.find('\n') < 0)
878 assert(id.find('\r') < 0)
880 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
883 p = subprocess.Popen(['git', 'cat-file', type, id],
884 stdout=subprocess.PIPE,
885 preexec_fn = _gitenv)
886 for blob in chunkyreader(p.stdout):
888 _git_wait('git cat-file', p)
896 treefile = ''.join(it)
897 for (mode, name, sha) in treeparse(treefile):
898 for blob in self.join(sha.encode('hex')):
900 elif type == 'commit':
901 treeline = ''.join(it).split('\n')[0]
902 assert(treeline.startswith('tree '))
903 for blob in self.join(treeline[5:]):
906 raise GitError('invalid object type %r: expected blob/tree/commit'
910 """Generate a list of the content of all blobs that can be reached
911 from an object. The hash given in 'id' must point to a blob, a tree
912 or a commit. The content of all blobs that can be seen from trees or
913 commits will be added to the list.
916 for d in self._join(self.get(id)):
918 except StopIteration: