1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, heapq
7 from bup.helpers import *
8 from bup import _helpers
14 home_repodir = os.path.expanduser('~/.bup')
17 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
18 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
24 class GitError(Exception):
29 """Get the path to the git repository or one of its subdirectories."""
32 raise GitError('You should call check_repo_or_die()')
34 # If there's a .git subdirectory, then the actual repo is in there.
35 gd = os.path.join(repodir, '.git')
36 if os.path.exists(gd):
39 return os.path.join(repodir, sub)
42 def auto_midx(objdir):
43 main_exe = os.environ.get('BUP_MAIN_EXE') or sys.argv[0]
44 args = [main_exe, 'midx', '--auto', '--dir', objdir]
45 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
47 add_error('%r: returned %d' % (args, rv))
50 def mangle_name(name, mode, gitmode):
51 """Mangle a file name to present an abstract name for segmented files.
52 Mangled file names will have the ".bup" extension added to them. If a
53 file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
56 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
58 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
64 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
65 def demangle_name(name):
66 """Remove name mangling from a file name, if necessary.
68 The return value is a tuple (demangled_filename,mode), where mode is one of
71 * BUP_NORMAL : files that should be read as-is from the repository
72 * BUP_CHUNKED : files that were chunked and need to be assembled
    For more information on the name mangling algorithm, see mangle_name()
76 if name.endswith('.bupl'):
77 return (name[:-5], BUP_NORMAL)
78 elif name.endswith('.bup'):
79 return (name[:-4], BUP_CHUNKED)
81 return (name, BUP_NORMAL)
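# Illustrative examples of the mangling rules above (hypothetical names,
# shown as expected values rather than executed code):
#
#   mangle_name('foo', 0100644, 040000)       -> 'foo.bup'       (chunked)
#   mangle_name('foo.bup', 0100644, 0100644)  -> 'foo.bup.bupl'  (normal)
#   demangle_name('foo.bup')       -> ('foo', BUP_CHUNKED)
#   demangle_name('foo.bup.bupl')  -> ('foo.bup', BUP_NORMAL)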
84 def _encode_packobj(type, content):
87 szbits = (sz & 0x0f) | (_typemap[type]<<4)
96 z = zlib.compressobj(1)
98 yield z.compress(content)
102 def _encode_looseobj(type, content):
103 z = zlib.compressobj(1)
104 yield z.compress('%s %d\0' % (type, len(content)))
105 yield z.compress(content)
109 def _decode_looseobj(buf):
111 s = zlib.decompress(buf)
118 assert(type in _typemap)
119 assert(sz == len(content))
120 return (type, content)
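# Round-trip sketch (illustrative only): a loose object is just
# zlib('<type> <size>\0<content>'), so encoding and decoding are inverses:
#
#   raw = ''.join(_encode_looseobj('blob', 'hello'))
#   _decode_looseobj(raw)  ->  ('blob', 'hello')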
123 def _decode_packobj(buf):
126 type = _typermap[(c & 0x70) >> 4]
133 sz |= (c & 0x7f) << shift
137 return (type, zlib.decompress(buf[i+1:]))
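# Worked example of the packed header: for a 300-byte blob, _typemap['blob']
# is 3, so the first byte is 0x80 | (3 << 4) | (300 & 0x0f) = 0xbc (the 0x80
# bit marks a continuation) and the second byte is 300 >> 4 = 0x12.
# _decode_packobj() reverses this: sz = 0x0c | (0x12 << 4) = 300, and
# everything after the header is the zlib-compressed content.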
141 """Object representation of a Git pack index file."""
142 def __init__(self, filename, f):
144 self.idxnames = [self.name]
145 self.map = mmap_read(f)
146 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
147 self.fanout = list(struct.unpack('!256I',
148 str(buffer(self.map, 8, 256*4))))
149 self.fanout.append(0) # entry "-1"
150 nsha = self.fanout[255]
151 self.ofstable = buffer(self.map,
152 8 + 256*4 + nsha*20 + nsha*4,
154 self.ofs64table = buffer(self.map,
155 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
157 def _ofs_from_idx(self, idx):
158 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
160 idx64 = ofs & 0x7fffffff
161 ofs = struct.unpack('!I',
162 str(buffer(self.ofs64table, idx64*8, 8)))[0]
165 def _idx_from_hash(self, hash):
166 global _total_searches, _total_steps
168 assert(len(hash) == 20)
170 start = self.fanout[b1-1] # range -1..254
171 end = self.fanout[b1] # range 0..255
172 buf = buffer(self.map, 8 + 256*4, end*20)
174 _total_steps += 1 # lookup table is a step
177 mid = start + (end-start)/2
178 v = str(buf[mid*20:(mid+1)*20])
187 def find_offset(self, hash):
188 """Get the offset of an object inside the index file."""
189 idx = self._idx_from_hash(hash)
191 return self._ofs_from_idx(idx)
194 def exists(self, hash):
195 """Return nonempty if the object exists in this index."""
        return hash and (self._idx_from_hash(hash) is not None) and True or None
199 for i in xrange(self.fanout[255]):
200 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
203 return int(self.fanout[255])
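# A self-contained sketch (not used by bup) of the fanout-bounded binary
# search that PackIdx performs above.  'shas' stands in for the sorted sha
# table and 'fanout' for the 257-entry fanout list built in __init__.
def _packidx_search_sketch(shas, fanout, want):
    b1 = ord(want[0])
    start = fanout[b1-1]    # for b1 == 0 this hits the appended "-1" entry
    end = fanout[b1]
    while start < end:
        mid = start + (end-start)/2
        if shas[mid] < want:
            start = mid+1
        elif shas[mid] > want:
            end = mid
        else:
            return mid      # index of 'want' in the sha table
    return None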
206 extract_bits = _helpers.extract_bits
210 """Wrapper which contains data from multiple index files.
211 Multiple index (.midx) files constitute a wrapper around index (.idx) files
    and make it possible for bup to expand Git's indexing capabilities to vast
    amounts of files.
215 def __init__(self, filename):
217 self.force_keep = False
218 assert(filename.endswith('.midx'))
219 self.map = mmap_read(open(filename))
220 if str(self.map[0:4]) != 'MIDX':
221 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
222 self.force_keep = True
223 return self._init_failed()
224 ver = struct.unpack('!I', self.map[4:8])[0]
225 if ver < MIDX_VERSION:
226 log('Warning: ignoring old-style (v%d) midx %r\n'
228 self.force_keep = False # old stuff is boring
229 return self._init_failed()
230 if ver > MIDX_VERSION:
231 log('Warning: ignoring too-new (v%d) midx %r\n'
233 self.force_keep = True # new stuff is exciting
234 return self._init_failed()
236 self.bits = _helpers.firstword(self.map[8:12])
237 self.entries = 2**self.bits
238 self.fanout = buffer(self.map, 12, self.entries*4)
239 shaofs = 12 + self.entries*4
240 nsha = self._fanget(self.entries-1)
241 self.shalist = buffer(self.map, shaofs, nsha*20)
242 self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
244 def _init_failed(self):
247 self.fanout = buffer('\0\0\0\0')
248 self.shalist = buffer('\0'*20)
251 def _fanget(self, i):
253 s = self.fanout[start:start+4]
254 return _helpers.firstword(s)
257 return str(self.shalist[i*20:(i+1)*20])
259 def exists(self, hash):
260 """Return nonempty if the object exists in the index files."""
261 global _total_searches, _total_steps
264 el = extract_bits(want, self.bits)
266 start = self._fanget(el-1)
267 startv = el << (32-self.bits)
271 end = self._fanget(el)
272 endv = (el+1) << (32-self.bits)
273 _total_steps += 1 # lookup table is a step
274 hashv = _helpers.firstword(hash)
275 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
278 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
279 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
280 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
282 #print ' %08x' % self._num(v)
285 startv = _helpers.firstword(v)
288 endv = _helpers.firstword(v)
294 for i in xrange(self._fanget(self.entries-1)):
295 yield buffer(self.shalist, i*20, 20)
298 return int(self._fanget(self.entries-1))
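# A self-contained sketch (illustrative only) of the lookup idea behind
# PackMidx.exists(): the top 'bits' bits of the hash select a fanout bucket
# bounded by (start, end) and prefix values (startv, endv); within it we
# guess positions by linear interpolation rather than plain bisection.
def _midx_search_sketch(shas, want, start, end, startv, endv):
    hashv = struct.unpack('!I', want[:4])[0]
    while start < end:
        # proportional guess; max(..., 1) only guards this sketch against a
        # zero-width value range
        mid = start + (hashv-startv)*(end-start-1)/max(endv-startv, 1)
        v = shas[mid]
        if v < want:
            start, startv = mid+1, struct.unpack('!I', v[:4])[0]
        elif v > want:
            end, endv = mid, struct.unpack('!I', v[:4])[0]
        else:
            return mid
    return None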
303 def __init__(self, dir):
305 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
315 assert(_mpi_count == 0)
318 return iter(idxmerge(self.packs))
321 return sum(len(pack) for pack in self.packs)
323 def exists(self, hash):
324 """Return nonempty if the object exists in the index files."""
325 global _total_searches
327 if hash in self.also:
329 for i in range(len(self.packs)):
331 _total_searches -= 1 # will be incremented by sub-pack
333 # reorder so most recently used packs are searched first
334 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
338 def refresh(self, skip_midx = False):
339 """Refresh the index list.
        This method verifies whether any .midx files have been superseded
        (e.g. all of their contents are in another, bigger .midx file) and
        removes the superseded files.

344 If skip_midx is True, all work on .midx files will be skipped and .midx
345 files will be removed from the list.
347 The module-global variable 'ignore_midx' can force this function to
348 always act as if skip_midx was True.
350 skip_midx = skip_midx or ignore_midx
351 d = dict((p.name, p) for p in self.packs
352 if not skip_midx or not isinstance(p, PackMidx))
353 if os.path.exists(self.dir):
356 for ix in self.packs:
357 if isinstance(ix, PackMidx):
358 for name in ix.idxnames:
359 d[os.path.join(self.dir, name)] = ix
360 for f in os.listdir(self.dir):
361 full = os.path.join(self.dir, f)
362 if f.endswith('.midx') and not d.get(full):
364 (mxd, mxf) = os.path.split(mx.name)
366 for n in mx.idxnames:
367 if not os.path.exists(os.path.join(mxd, n)):
368 log(('warning: index %s missing\n' +
369 ' used by %s\n') % (n, mxf))
                midxl.sort(key=len, reverse=True)
376 for sub in ix.idxnames:
377 found = d.get(os.path.join(self.dir, sub))
378 if not found or isinstance(found, PackIdx):
379 # doesn't exist, or exists but not in a midx
381 for name in ix.idxnames:
382 d[os.path.join(self.dir, name)] = ix
385 if not any and not ix.force_keep:
386 debug1('midx: removing redundant: %s\n'
387 % os.path.basename(ix.name))
389 for f in os.listdir(self.dir):
390 full = os.path.join(self.dir, f)
391 if f.endswith('.idx') and not d.get(full):
394 self.packs = list(set(d.values()))
395 debug1('PackIdxList: using %d index%s.\n'
396 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
399 """Insert an additional object in the list."""
403 """Remove all additional objects from the list."""
407 def calc_hash(type, content):
408 """Calculate some content's hash in the Git fashion."""
409 header = '%s %d\0' % (type, len(content))
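    # For example, the 6-byte blob 'hello\n' gets the header 'blob 6\0', and
    # sha1('blob 6\0hello\n').hexdigest() is
    # 'ce013625030ba8dba906f756967f9e9ca394464a' -- exactly what
    # `git hash-object` prints for the same data.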
415 def _shalist_sort_key(ent):
416 (mode, name, id) = ent
417 if stat.S_ISDIR(int(mode, 8)):
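    # Example: git sorts a directory 'foo' *after* a file 'foo.c', because
    # the directory's sort key is 'foo/' and '/' (0x2f) compares greater
    # than '.' (0x2e).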
423 def open_idx(filename):
424 if filename.endswith('.idx'):
425 f = open(filename, 'rb')
427 if header[0:4] == '\377tOc':
428 version = struct.unpack('!I', header[4:8])[0]
430 return PackIdxV2(filename, f)
432 raise GitError('%s: expected idx file version 2, got %d'
433 % (filename, version))
435 raise GitError('version 1 idx files not supported')
436 elif filename.endswith('.midx'):
437 return PackMidx(filename)
439 raise GitError('idx filenames must end with .idx or .midx')
442 def idxmerge(idxlist, final_progress=True):
443 """Generate a list of all the objects reachable in a PackIdxList."""
444 total = sum(len(i) for i in idxlist)
445 iters = (iter(i) for i in idxlist)
446 heap = [(next(it), it) for it in iters]
451 if (count % 10024) == 0:
452 progress('Reading indexes: %.2f%% (%d/%d)\r'
453 % (count*100.0/total, count, total))
461 heapq.heapreplace(heap, (e, it))
465 log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
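# A minimal self-contained sketch of the same merge (illustrative only):
# heapq.merge performs the k-way merge of the already-sorted id streams, and
# duplicates are dropped because one object may appear in several .idx files.
def _idxmerge_sketch(sorted_iters):
    last = None
    for e in heapq.merge(*sorted_iters):
        if e != last:
            yield e
            last = e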
469 """Writes Git objects insid a pack file."""
470 def __init__(self, objcache_maker=None):
475 self.objcache_maker = objcache_maker
481 def _make_objcache(self):
        if self.objcache is None:
483 if self.objcache_maker:
484 self.objcache = self.objcache_maker()
486 self.objcache = PackIdxList(repo('objects/pack'))
490 self._make_objcache()
491 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
492 self.file = os.fdopen(fd, 'w+b')
493 assert(name.endswith('.pack'))
494 self.filename = name[:-5]
495 self.file.write('PACK\0\0\0\2\0\0\0\0')
497 def _raw_write(self, datalist):
        # in case we get interrupted (e.g. KeyboardInterrupt), it's best if
501 # the file never has a *partial* blob. So let's make sure it's
502 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
503 # to our hashsplit algorithm.) f.write() does its own buffering,
504 # but that's okay because we'll flush it in _end().
505 oneblob = ''.join(datalist)
507 self.outbytes += len(oneblob)
510 def _write(self, bin, type, content):
513 self._raw_write(_encode_packobj(type, content))
516 def breakpoint(self):
517 """Clear byte and object counts and return the last processed id."""
519 self.outbytes = self.count = 0
522 def write(self, type, content):
523 """Write an object in this pack file."""
524 return self._write(calc_hash(type, content), type, content)
526 def exists(self, id):
527 """Return non-empty if an object is found in the object cache."""
528 if not self.objcache:
529 self._make_objcache()
530 return self.objcache.exists(id)
532 def maybe_write(self, type, content):
533 """Write an object to the pack file if not present and return its id."""
534 bin = calc_hash(type, content)
535 if not self.exists(bin):
536 self._write(bin, type, content)
537 self.objcache.add(bin)
540 def new_blob(self, blob):
541 """Create a blob object in the pack with the supplied content."""
542 return self.maybe_write('blob', blob)
544 def new_tree(self, shalist):
545 """Create a tree object in the pack."""
546 shalist = sorted(shalist, key = _shalist_sort_key)
548 for (mode,name,bin) in shalist:
551 assert(mode[0] != '0')
553 assert(len(bin) == 20)
554 l.append('%s %s\0%s' % (mode,name,bin))
555 return self.maybe_write('tree', ''.join(l))
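    # For example (with a hypothetical id), one serialized tree entry is:
    #
    #   sha = '\xaa' * 20
    #   '100644 hello.txt\0' + sha
    #
    # i.e. the octal mode without a leading zero, a space, the name, a NUL,
    # and the raw 20-byte id (binary, not hex).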
557 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
559 if tree: l.append('tree %s' % tree.encode('hex'))
560 if parent: l.append('parent %s' % parent.encode('hex'))
561 if author: l.append('author %s %s' % (author, _git_date(adate)))
562 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
565 return self.maybe_write('commit', '\n'.join(l))
567 def new_commit(self, parent, tree, date, msg):
568 """Create a commit object in the pack."""
569 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
570 commit = self._new_commit(tree, parent,
571 userline, date, userline, date,
576 """Remove the pack file from disk."""
581 os.unlink(self.filename + '.pack')
585 if not f: return None
589 # update object count
591 cp = struct.pack('!i', self.count)
595 # calculate the pack sha1sum
602 f.write(sum.digest())
606 p = subprocess.Popen(['git', 'index-pack', '-v',
608 self.filename + '.pack'],
609 preexec_fn = _gitenv,
610 stdout = subprocess.PIPE)
611 out = p.stdout.read().strip()
612 _git_wait('git index-pack', p)
614 raise GitError('git index-pack produced no output')
615 nameprefix = repo('objects/pack/%s' % out)
616 if os.path.exists(self.filename + '.map'):
617 os.unlink(self.filename + '.map')
618 os.rename(self.filename + '.pack', nameprefix + '.pack')
619 os.rename(self.filename + '.idx', nameprefix + '.idx')
621 auto_midx(repo('objects/pack'))
625 """Close the pack file and move it to its definitive path."""
630 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
634 os.environ['GIT_DIR'] = os.path.abspath(repo())
637 def list_refs(refname = None):
638 """Generate a list of tuples in the form (refname,hash).
639 If a ref name is specified, list only this particular ref.
641 argv = ['git', 'show-ref', '--']
644 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
645 out = p.stdout.read().strip()
646 rv = p.wait() # not fatal
650 for d in out.split('\n'):
651 (sha, name) = d.split(' ', 1)
652 yield (name, sha.decode('hex'))
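# Typical use (illustrative): iterate over all refs, hex-encoding the binary
# hashes this generator yields.
#
#   for (name, sha) in list_refs():
#       print name, sha.encode('hex')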
655 def read_ref(refname):
656 """Get the commit id of the most recent commit made on a given ref."""
657 l = list(list_refs(refname))
665 def rev_list(ref, count=None):
666 """Generate a list of reachable commits in reverse chronological order.
668 This generator walks through commits, from child to parent, that are
669 reachable via the specified ref and yields a series of tuples of the form
672 If count is a non-zero integer, limit the number of commits to "count"
675 assert(not ref.startswith('-'))
678 opts += ['-n', str(atoi(count))]
679 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
680 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
684 if s.startswith('commit '):
685 commit = s[7:].decode('hex')
689 rv = p.wait() # not fatal
        raise GitError('git rev-list returned error %d' % rv)
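# Illustrative use: walk the newest ten commits on a (hypothetical) branch.
# Each yielded tuple is (commit timestamp, binary commit id).
#
#   for (date, commit) in rev_list('refs/heads/master', count=10):
#       print date, commit.encode('hex')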
694 def rev_get_date(ref):
695 """Get the date of the latest commit on the specified ref."""
696 for (date, commit) in rev_list(ref, count=1):
    raise GitError('no such commit %r' % ref)
701 def update_ref(refname, newval, oldval):
702 """Change the commit pointed to by a branch."""
705 assert(refname.startswith('refs/heads/'))
706 p = subprocess.Popen(['git', 'update-ref', refname,
707 newval.encode('hex'), oldval.encode('hex')],
708 preexec_fn = _gitenv)
709 _git_wait('git update-ref', p)
712 def guess_repo(path=None):
713 """Set the path value in the global variable "repodir".
714 This makes bup look for an existing bup repository, but not fail if a
715 repository doesn't exist. Usually, if you are interacting with a bup
716 repository, you would not be calling this function but using
723 repodir = os.environ.get('BUP_DIR')
725 repodir = os.path.expanduser('~/.bup')
728 def init_repo(path=None):
729 """Create the Git bare repository for bup in a given path."""
732 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
734 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
735 preexec_fn = _gitenv)
736 _git_wait('git init', p)
737 # Force the index version configuration in order to ensure bup works
738 # regardless of the version of the installed Git binary.
739 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
740 stdout=sys.stderr, preexec_fn = _gitenv)
741 _git_wait('git config', p)
744 def check_repo_or_die(path=None):
745 """Make sure a bup repository exists, and abort if not.
746 If the path to a particular repository was not specified, this function
747 initializes the default repository automatically.
750 if not os.path.isdir(repo('objects/pack/.')):
751 if repodir == home_repodir:
754 log('error: %r is not a bup/git repository\n' % repo())
759 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
761 while ofs < len(buf):
762 z = buf[ofs:].find('\0')
764 spl = buf[ofs:ofs+z].split(' ', 1)
765 assert(len(spl) == 2)
766 sha = buf[ofs+z+1:ofs+z+1+20]
768 yield (spl[0], spl[1], sha)
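# Round-trip sketch (illustrative only): treeparse() consumes exactly the
# byte layout that PackWriter.new_tree() produces.
#
#   sha = '\xda' * 20                                   # hypothetical id
#   buf = '100644 a.txt\0%s40000 sub\0%s' % (sha, sha)
#   list(treeparse(buf))
#     -> [('100644', 'a.txt', sha), ('40000', 'sub', sha)]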
773 """Get Git's version and ensure a usable version is installed.
775 The returned version is formatted as an ordered tuple with each position
776 representing a digit in the version tag. For example, the following tuple
777 would represent version 1.6.6.9:
783 p = subprocess.Popen(['git', '--version'],
784 stdout=subprocess.PIPE)
785 gvs = p.stdout.read()
786 _git_wait('git --version', p)
    m = re.match(r'git version (\S+\.\S+)', gvs)
789 raise GitError('git --version weird output: %r' % gvs)
790 _ver = tuple(m.group(1).split('.'))
791 needed = ('1','5', '3', '1')
793 raise GitError('git version %s or higher is required; you have %s'
794 % ('.'.join(needed), '.'.join(_ver)))
798 def _git_wait(cmd, p):
801 raise GitError('%s returned %d' % (cmd, rv))
804 def _git_capture(argv):
805 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
807 _git_wait(repr(argv), p)
811 class _AbortableIter:
812 def __init__(self, it, onabort = None):
814 self.onabort = onabort
822 return self.it.next()
823 except StopIteration, e:
831 """Abort iteration and call the abortion callback, if needed."""
843 """Link to 'git cat-file' that is used to retrieve blob data."""
846 wanted = ('1','5','6')
849 log('warning: git version < %s; bup will be slow.\n'
852 self.get = self._slow_get
854 self.p = self.inprogress = None
855 self.get = self._fast_get
859 self.p.stdout.close()
862 self.inprogress = None
866 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
867 stdin=subprocess.PIPE,
868 stdout=subprocess.PIPE,
870 preexec_fn = _gitenv)
872 def _fast_get(self, id):
        if not self.p or self.p.poll() is not None:
        assert(self.p.poll() is None)
878 log('_fast_get: opening %r while %r is open'
879 % (id, self.inprogress))
880 assert(not self.inprogress)
881 assert(id.find('\n') < 0)
882 assert(id.find('\r') < 0)
883 assert(not id.startswith('-'))
885 self.p.stdin.write('%s\n' % id)
886 hdr = self.p.stdout.readline()
887 if hdr.endswith(' missing\n'):
888 self.inprogress = None
889 raise KeyError('blob %r is missing' % id)
891 if len(spl) != 3 or len(spl[0]) != 40:
892 raise GitError('expected blob, got %r' % spl)
893 (hex, type, size) = spl
895 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
896 onabort = self._abort)
901 assert(self.p.stdout.readline() == '\n')
902 self.inprogress = None
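    # The '--batch' protocol used by _fast_get(), as a transcript sketch:
    #
    #   we send:    '<id>\n'
    #   git sends:  '<40-hex-sha> <type> <size>\n'   (or '<id> missing\n')
    #               <size> bytes of raw object data
    #               '\n'                              (a trailing terminator)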
907 def _slow_get(self, id):
908 assert(id.find('\n') < 0)
909 assert(id.find('\r') < 0)
911 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
914 p = subprocess.Popen(['git', 'cat-file', type, id],
915 stdout=subprocess.PIPE,
916 preexec_fn = _gitenv)
917 for blob in chunkyreader(p.stdout):
919 _git_wait('git cat-file', p)
927 treefile = ''.join(it)
928 for (mode, name, sha) in treeparse(treefile):
929 for blob in self.join(sha.encode('hex')):
931 elif type == 'commit':
932 treeline = ''.join(it).split('\n')[0]
933 assert(treeline.startswith('tree '))
934 for blob in self.join(treeline[5:]):
937 raise GitError('invalid object type %r: expected blob/tree/commit'
941 """Generate a list of the content of all blobs that can be reached
942 from an object. The hash given in 'id' must point to a blob, a tree
943 or a commit. The content of all blobs that can be seen from trees or
944 commits will be added to the list.
947 for d in self._join(self.get(id)):
949 except StopIteration: