1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, heapq
7 from bup.helpers import *
8 from bup import _helpers
14 home_repodir = os.path.expanduser('~/.bup')
17 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
18 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
24 class GitError(Exception):
29 """Get the path to the git repository or one of its subdirectories."""
32 raise GitError('You should call check_repo_or_die()')
34 # If there's a .git subdirectory, then the actual repo is in there.
35 gd = os.path.join(repodir, '.git')
36 if os.path.exists(gd):
39 return os.path.join(repodir, sub)
42 def auto_midx(objdir):
43 main_exe = os.environ.get('BUP_MAIN_EXE') or sys.argv[0]
44 args = [main_exe, 'midx', '--auto', '--dir', objdir]
45 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
47 add_error('%r: returned %d' % (args, rv))
50 def mangle_name(name, mode, gitmode):
51 """Mangle a file name to present an abstract name for segmented files.
52 Mangled file names will have the ".bup" extension added to them. If a
53 file's name already ends with ".bup", a ".bupl" extension is added to
54 disambiguate normal files from segmented ones.
56 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
58 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
64 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
65 def demangle_name(name):
66 """Remove name mangling from a file name, if necessary.
68 The return value is a tuple (demangled_filename,mode), where mode is one of
71 * BUP_NORMAL : files that should be read as-is from the repository
72 * BUP_CHUNKED : files that were chunked and need to be assembled
74 For more information on the name mangling algorithm, see mangle_name()
76 if name.endswith('.bupl'):
77 return (name[:-5], BUP_NORMAL)
78 elif name.endswith('.bup'):
79 return (name[:-4], BUP_CHUNKED)
81 return (name, BUP_NORMAL)
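# Illustrative example (not exercised by the library itself): a chunked
# regular file named 'foo' is stored as a git tree, so it gets the '.bup'
# suffix, and demangle_name() undoes the mangling:
#
#   mangle_name('foo', 0100644, 040000)  == 'foo.bup'
#   demangle_name('foo.bup')             == ('foo', BUP_CHUNKED)
#   demangle_name('foo.bupl')            == ('foo', BUP_NORMAL)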
84 def _encode_packobj(type, content):
87 szbits = (sz & 0x0f) | (_typemap[type]<<4)
96 z = zlib.compressobj(1)
98 yield z.compress(content)
102 def _encode_looseobj(type, content):
103 z = zlib.compressobj(1)
104 yield z.compress('%s %d\0' % (type, len(content)))
105 yield z.compress(content)
109 def _decode_looseobj(buf):
111 s = zlib.decompress(buf)
118 assert(type in _typemap)
119 assert(sz == len(content))
120 return (type, content)
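# Sketch of the expected loose-object round trip (illustrative only):
#
#   raw = ''.join(_encode_looseobj('blob', 'hello'))
#   _decode_looseobj(raw)  == ('blob', 'hello')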
123 def _decode_packobj(buf):
126 type = _typermap[(c & 0x70) >> 4]
133 sz |= (c & 0x7f) << shift
137 return (type, zlib.decompress(buf[i+1:]))
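# The pack-object header is Git's variable-length size encoding: the low four
# bits of the first byte hold the size's low bits, bits 4-6 hold the type, and
# the high bit flags a continuation byte. Encode/decode should round-trip
# (illustrative sketch):
#
#   packed = ''.join(_encode_packobj('tree', 'some tree body'))
#   _decode_packobj(packed)  == ('tree', 'some tree body')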
144 def find_offset(self, hash):
145 """Get the offset of an object inside the index file."""
146 idx = self._idx_from_hash(hash)
148 return self._ofs_from_idx(idx)
151 def exists(self, hash):
152 """Return nonempty if the object exists in this index."""
153 return hash and (self._idx_from_hash(hash) != None) and True or None
156 return int(self.fanout[255])
158 def _idx_from_hash(self, hash):
159 global _total_searches, _total_steps
161 assert(len(hash) == 20)
163 start = self.fanout[b1-1] # range -1..254
164 end = self.fanout[b1] # range 0..255
166 _total_steps += 1 # lookup table is a step
169 mid = start + (end-start)/2
170 v = self._idx_to_hash(mid)
180 class PackIdxV1(PackIdx):
181 """Object representation of a Git pack index (version 1) file."""
182 def __init__(self, filename, f):
184 self.idxnames = [self.name]
185 self.map = mmap_read(f)
186 self.fanout = list(struct.unpack('!256I',
187 str(buffer(self.map, 0, 256*4))))
188 self.fanout.append(0) # entry "-1"
189 nsha = self.fanout[255]
190 self.shatable = buffer(self.map, 256*4, nsha*24)
192 def _ofs_from_idx(self, idx):
193 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
195 def _idx_to_hash(self, idx):
196 return str(self.shatable[idx*24+4 : idx*24+24])
199 for i in xrange(self.fanout[255]):
200 yield buffer(self.map, 256*4 + 24*i + 4, 20)
203 class PackIdxV2(PackIdx):
204 """Object representation of a Git pack index (version 2) file."""
205 def __init__(self, filename, f):
207 self.idxnames = [self.name]
208 self.map = mmap_read(f)
209 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
210 self.fanout = list(struct.unpack('!256I',
211 str(buffer(self.map, 8, 256*4))))
212 self.fanout.append(0) # entry "-1"
213 nsha = self.fanout[255]
214 self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
215 self.ofstable = buffer(self.map,
216 8 + 256*4 + nsha*20 + nsha*4,
218 self.ofs64table = buffer(self.map,
219 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
221 def _ofs_from_idx(self, idx):
222 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
224 idx64 = ofs & 0x7fffffff
225 ofs = struct.unpack('!Q',  # 64-bit offset entries are 8 bytes
226 str(buffer(self.ofs64table, idx64*8, 8)))[0]
229 def _idx_to_hash(self, idx):
230 return str(self.shatable[idx*20:(idx+1)*20])
233 for i in xrange(self.fanout[255]):
234 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
237 extract_bits = _helpers.extract_bits
241 """Wrapper which contains data from multiple index files.
242 Multiple index (.midx) files constitute a wrapper around index (.idx) files
243 and make it possible for bup to expand Git's indexing capabilities to vast
246 def __init__(self, filename):
248 self.force_keep = False
249 assert(filename.endswith('.midx'))
250 self.map = mmap_read(open(filename))
251 if str(self.map[0:4]) != 'MIDX':
252 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
253 self.force_keep = True
254 return self._init_failed()
255 ver = struct.unpack('!I', self.map[4:8])[0]
256 if ver < MIDX_VERSION:
257 log('Warning: ignoring old-style (v%d) midx %r\n'
259 self.force_keep = False # old stuff is boring
260 return self._init_failed()
261 if ver > MIDX_VERSION:
262 log('Warning: ignoring too-new (v%d) midx %r\n'
264 self.force_keep = True # new stuff is exciting
265 return self._init_failed()
267 self.bits = _helpers.firstword(self.map[8:12])
268 self.entries = 2**self.bits
269 self.fanout = buffer(self.map, 12, self.entries*4)
270 shaofs = 12 + self.entries*4
271 nsha = self._fanget(self.entries-1)
272 self.shalist = buffer(self.map, shaofs, nsha*20)
273 self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
275 def _init_failed(self):
278 self.fanout = buffer('\0\0\0\0')
279 self.shalist = buffer('\0'*20)
282 def _fanget(self, i):
284 s = self.fanout[start:start+4]
285 return _helpers.firstword(s)
288 return str(self.shalist[i*20:(i+1)*20])
290 def exists(self, hash):
291 """Return nonempty if the object exists in the index files."""
292 global _total_searches, _total_steps
295 el = extract_bits(want, self.bits)
297 start = self._fanget(el-1)
298 startv = el << (32-self.bits)
302 end = self._fanget(el)
303 endv = (el+1) << (32-self.bits)
304 _total_steps += 1 # lookup table is a step
305 hashv = _helpers.firstword(hash)
306 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
309 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
310 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
311 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
313 #print ' %08x' % self._num(v)
316 startv = _helpers.firstword(v)
319 endv = _helpers.firstword(v)
325 for i in xrange(self._fanget(self.entries-1)):
326 yield buffer(self.shalist, i*20, 20)
329 return int(self._fanget(self.entries-1))
334 def __init__(self, dir):
336 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
346 assert(_mpi_count == 0)
349 return iter(idxmerge(self.packs))
352 return sum(len(pack) for pack in self.packs)
354 def exists(self, hash):
355 """Return nonempty if the object exists in the index files."""
356 global _total_searches
358 if hash in self.also:
360 for i in range(len(self.packs)):
362 _total_searches -= 1 # will be incremented by sub-pack
364 # reorder so most recently used packs are searched first
365 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
369 def refresh(self, skip_midx = False):
370 """Refresh the index list.
371 This method verifies if .midx files were superseded (e.g. all of their
372 contents are in another, bigger .midx file) and removes the superseded
375 If skip_midx is True, all work on .midx files will be skipped and .midx
376 files will be removed from the list.
378 The module-global variable 'ignore_midx' can force this function to
379 always act as if skip_midx was True.
381 skip_midx = skip_midx or ignore_midx
382 d = dict((p.name, p) for p in self.packs
383 if not skip_midx or not isinstance(p, PackMidx))
384 if os.path.exists(self.dir):
387 for ix in self.packs:
388 if isinstance(ix, PackMidx):
389 for name in ix.idxnames:
390 d[os.path.join(self.dir, name)] = ix
391 for f in os.listdir(self.dir):
392 full = os.path.join(self.dir, f)
393 if f.endswith('.midx') and not d.get(full):
395 (mxd, mxf) = os.path.split(mx.name)
397 for n in mx.idxnames:
398 if not os.path.exists(os.path.join(mxd, n)):
399 log(('warning: index %s missing\n' +
400 ' used by %s\n') % (n, mxf))
407 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
410 for sub in ix.idxnames:
411 found = d.get(os.path.join(self.dir, sub))
412 if not found or isinstance(found, PackIdx):
413 # doesn't exist, or exists but not in a midx
415 for name in ix.idxnames:
416 d[os.path.join(self.dir, name)] = ix
419 if not any and not ix.force_keep:
420 debug1('midx: removing redundant: %s\n'
421 % os.path.basename(ix.name))
423 for f in os.listdir(self.dir):
424 full = os.path.join(self.dir, f)
425 if f.endswith('.idx') and not d.get(full):
428 self.packs = list(set(d.values()))
429 debug1('PackIdxList: using %d index%s.\n'
430 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
433 """Insert an additional object in the list."""
437 """Remove all additional objects from the list."""
441 def calc_hash(type, content):
442 """Calculate some content's hash in the Git fashion."""
443 header = '%s %d\0' % (type, len(content))
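# This is the same id `git hash-object` would compute: the SHA-1 of
# '<type> <length>\0' followed by the content, e.g. (illustrative):
#
#   import hashlib
#   calc_hash('blob', 'foo')  == hashlib.sha1('blob 3\0foo').digest()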
449 def _shalist_sort_key(ent):
450 (mode, name, id) = ent
451 if stat.S_ISDIR(int(mode, 8)):
457 def open_idx(filename):
458 if filename.endswith('.idx'):
459 f = open(filename, 'rb')
461 if header[0:4] == '\377tOc':
462 version = struct.unpack('!I', header[4:8])[0]
464 return PackIdxV2(filename, f)
466 raise GitError('%s: expected idx file version 2, got %d'
467 % (filename, version))
469 return PackIdxV1(filename, f)
470 elif filename.endswith('.midx'):
471 return PackMidx(filename)
473 raise GitError('idx filenames must end with .idx or .midx')
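# Illustrative usage (the pack name below is hypothetical):
#
#   ix = open_idx(repo('objects/pack/pack-deadbeef.idx'))
#   if ix.exists(binsha):              # binsha: a 20-byte binary SHA-1
#       print ix.find_offset(binsha)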
476 def idxmerge(idxlist, final_progress=True):
477 """Generate a list of all the objects reachable in a PackIdxList."""
478 total = sum(len(i) for i in idxlist)
479 iters = (iter(i) for i in idxlist)
480 heap = [(next(it), it) for it in iters]
485 if (count % 10024) == 0:
486 progress('Reading indexes: %.2f%% (%d/%d)\r'
487 % (count*100.0/total, count, total))
495 heapq.heapreplace(heap, (e, it))
499 log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
503 """Writes Git objects insid a pack file."""
504 def __init__(self, objcache_maker=None):
509 self.objcache_maker = objcache_maker
515 def _make_objcache(self):
516 if self.objcache == None:
517 if self.objcache_maker:
518 self.objcache = self.objcache_maker()
520 self.objcache = PackIdxList(repo('objects/pack'))
524 self._make_objcache()
525 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
526 self.file = os.fdopen(fd, 'w+b')
527 assert(name.endswith('.pack'))
528 self.filename = name[:-5]
529 self.file.write('PACK\0\0\0\2\0\0\0\0')
531 def _raw_write(self, datalist):
534 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
535 # the file never has a *partial* blob. So let's make sure it's
536 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
537 # to our hashsplit algorithm.) f.write() does its own buffering,
538 # but that's okay because we'll flush it in _end().
539 oneblob = ''.join(datalist)
541 self.outbytes += len(oneblob)
544 def _write(self, bin, type, content):
547 self._raw_write(_encode_packobj(type, content))
550 def breakpoint(self):
551 """Clear byte and object counts and return the last processed id."""
553 self.outbytes = self.count = 0
556 def write(self, type, content):
557 """Write an object in this pack file."""
558 return self._write(calc_hash(type, content), type, content)
560 def exists(self, id):
561 """Return non-empty if an object is found in the object cache."""
562 if not self.objcache:
563 self._make_objcache()
564 return self.objcache.exists(id)
566 def maybe_write(self, type, content):
567 """Write an object to the pack file if not present and return its id."""
568 bin = calc_hash(type, content)
569 if not self.exists(bin):
570 self._write(bin, type, content)
571 self.objcache.add(bin)
574 def new_blob(self, blob):
575 """Create a blob object in the pack with the supplied content."""
576 return self.maybe_write('blob', blob)
578 def new_tree(self, shalist):
579 """Create a tree object in the pack."""
580 shalist = sorted(shalist, key = _shalist_sort_key)
582 for (mode,name,bin) in shalist:
585 assert(mode[0] != '0')
587 assert(len(bin) == 20)
588 l.append('%s %s\0%s' % (mode,name,bin))
589 return self.maybe_write('tree', ''.join(l))
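# For reference, each serialized tree entry is '<octal mode> <name>\0' followed
# immediately by the entry's 20-byte binary SHA-1, e.g. (sketch):
#
#   '100644 hello.c\0' + '\xee'*20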
591 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
593 if tree: l.append('tree %s' % tree.encode('hex'))
594 if parent: l.append('parent %s' % parent.encode('hex'))
595 if author: l.append('author %s %s' % (author, _git_date(adate)))
596 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
599 return self.maybe_write('commit', '\n'.join(l))
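# The assembled commit body follows Git's usual layout (sketch; ids and names
# are made up):
#
#   tree 68aeb3f...
#   parent 3f1b9c0...
#   author A U Thor <a@example.com> 1260000000 -0800
#   committer A U Thor <a@example.com> 1260000000 -0800
#   <blank line>
#   commit message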
601 def new_commit(self, parent, tree, date, msg):
602 """Create a commit object in the pack."""
603 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
604 commit = self._new_commit(tree, parent,
605 userline, date, userline, date,
610 """Remove the pack file from disk."""
615 os.unlink(self.filename + '.pack')
619 if not f: return None
623 # update object count
625 cp = struct.pack('!i', self.count)
629 # calculate the pack sha1sum
636 f.write(sum.digest())
640 p = subprocess.Popen(['git', 'index-pack', '-v',
642 self.filename + '.pack'],
643 preexec_fn = _gitenv,
644 stdout = subprocess.PIPE)
645 out = p.stdout.read().strip()
646 _git_wait('git index-pack', p)
648 raise GitError('git index-pack produced no output')
649 nameprefix = repo('objects/pack/%s' % out)
650 if os.path.exists(self.filename + '.map'):
651 os.unlink(self.filename + '.map')
652 os.rename(self.filename + '.pack', nameprefix + '.pack')
653 os.rename(self.filename + '.idx', nameprefix + '.idx')
655 auto_midx(repo('objects/pack'))
659 """Close the pack file and move it to its definitive path."""
664 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
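# e.g. (sketch, assuming the local timezone is UTC-8):
#
#   _git_date(1260000000)  ->  '1260000000 -0800'
#
# i.e. Unix seconds plus the local UTC offset, the format Git expects in
# author/committer lines.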
668 os.environ['GIT_DIR'] = os.path.abspath(repo())
671 def list_refs(refname = None):
672 """Generate a list of tuples in the form (refname,hash).
673 If a ref name is specified, list only this particular ref.
675 argv = ['git', 'show-ref', '--']
678 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
679 out = p.stdout.read().strip()
680 rv = p.wait() # not fatal
684 for d in out.split('\n'):
685 (sha, name) = d.split(' ', 1)
686 yield (name, sha.decode('hex'))
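# Illustrative usage:
#
#   for (name, sha) in list_refs():
#       print name, sha.encode('hex')    # e.g. 'refs/heads/master', hex id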
689 def read_ref(refname):
690 """Get the commit id of the most recent commit made on a given ref."""
691 l = list(list_refs(refname))
699 def rev_list(ref, count=None):
700 """Generate a list of reachable commits in reverse chronological order.
702 This generator walks through commits, from child to parent, that are
703 reachable via the specified ref and yields a series of tuples of the form
706 If count is a non-zero integer, limit the number of commits to "count"
709 assert(not ref.startswith('-'))
712 opts += ['-n', str(atoi(count))]
713 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
714 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
718 if s.startswith('commit '):
719 commit = s[7:].decode('hex')
723 rv = p.wait() # not fatal
725 raise GitError('git rev-list returned error %d' % rv)
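# Illustrative usage (assumes the named branch exists in the repository):
#
#   for (date, commit) in rev_list('refs/heads/master', count=10):
#       print date, commit.encode('hex')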
728 def rev_get_date(ref):
729 """Get the date of the latest commit on the specified ref."""
730 for (date, commit) in rev_list(ref, count=1):
732 raise GitError('no such commit %r' % ref)
735 def rev_parse(committish):
736 """Resolve the full hash for 'committish', if it exists.
738 Should be roughly equivalent to 'git rev-parse'.
740 Returns the hex value of the hash if it is found, None if 'committish' does
741 not correspond to anything.
743 head = read_ref(committish)
745 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
748 pL = PackIdxList(repo('objects/pack'))
750 if len(committish) == 40:
752 hash = committish.decode('hex')
762 def update_ref(refname, newval, oldval):
763 """Change the commit pointed to by a branch."""
766 assert(refname.startswith('refs/heads/'))
767 p = subprocess.Popen(['git', 'update-ref', refname,
768 newval.encode('hex'), oldval.encode('hex')],
769 preexec_fn = _gitenv)
770 _git_wait('git update-ref', p)
773 def guess_repo(path=None):
774 """Set the path value in the global variable "repodir".
775 This makes bup look for an existing bup repository, but does not fail if a
776 repository doesn't exist. Usually, if you are interacting with a bup
777 repository, you would not be calling this function but using
784 repodir = os.environ.get('BUP_DIR')
786 repodir = os.path.expanduser('~/.bup')
789 def init_repo(path=None):
790 """Create the Git bare repository for bup in a given path."""
793 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
794 raise GitError('"%s" exists but is not a directory\n' % d)
795 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
796 preexec_fn = _gitenv)
797 _git_wait('git init', p)
798 # Force the index version configuration in order to ensure bup works
799 # regardless of the version of the installed Git binary.
800 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
801 stdout=sys.stderr, preexec_fn = _gitenv)
802 _git_wait('git config', p)
805 def check_repo_or_die(path=None):
806 """Make sure a bup repository exists, and abort if not.
807 If the path to a particular repository was not specified, this function
808 initializes the default repository automatically.
811 if not os.path.isdir(repo('objects/pack/.')):
812 if repodir == home_repodir:
815 log('error: %r is not a bup/git repository\n' % repo())
820 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
822 while ofs < len(buf):
823 z = buf[ofs:].find('\0')
825 spl = buf[ofs:ofs+z].split(' ', 1)
826 assert(len(spl) == 2)
827 sha = buf[ofs+z+1:ofs+z+1+20]
829 yield (spl[0], spl[1], sha)
834 """Get Git's version and ensure a usable version is installed.
836 The returned version is formatted as an ordered tuple with each position
837 representing a component of the version tag. For example, the following tuple
838 would represent version 1.6.6.9:
844 p = subprocess.Popen(['git', '--version'],
845 stdout=subprocess.PIPE)
846 gvs = p.stdout.read()
847 _git_wait('git --version', p)
848 m = re.match(r'git version (\S+.\S+)', gvs)
850 raise GitError('git --version weird output: %r' % gvs)
851 _ver = tuple(m.group(1).split('.'))
852 needed = ('1','5', '3', '1')
854 raise GitError('git version %s or higher is required; you have %s'
855 % ('.'.join(needed), '.'.join(_ver)))
859 def _git_wait(cmd, p):
862 raise GitError('%s returned %d' % (cmd, rv))
865 def _git_capture(argv):
866 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
868 _git_wait(repr(argv), p)
872 class _AbortableIter:
873 def __init__(self, it, onabort = None):
875 self.onabort = onabort
883 return self.it.next()
884 except StopIteration, e:
892 """Abort iteration and call the abortion callback, if needed."""
904 """Link to 'git cat-file' that is used to retrieve blob data."""
907 wanted = ('1','5','6')
910 log('warning: git version < %s; bup will be slow.\n'
913 self.get = self._slow_get
915 self.p = self.inprogress = None
916 self.get = self._fast_get
920 self.p.stdout.close()
923 self.inprogress = None
927 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
928 stdin=subprocess.PIPE,
929 stdout=subprocess.PIPE,
931 preexec_fn = _gitenv)
933 def _fast_get(self, id):
934 if not self.p or self.p.poll() != None:
937 assert(self.p.poll() == None)
939 log('_fast_get: opening %r while %r is open'
940 % (id, self.inprogress))
941 assert(not self.inprogress)
942 assert(id.find('\n') < 0)
943 assert(id.find('\r') < 0)
944 assert(not id.startswith('-'))
946 self.p.stdin.write('%s\n' % id)
947 hdr = self.p.stdout.readline()
948 if hdr.endswith(' missing\n'):
949 self.inprogress = None
950 raise KeyError('blob %r is missing' % id)
952 if len(spl) != 3 or len(spl[0]) != 40:
953 raise GitError('expected blob, got %r' % spl)
954 (hex, type, size) = spl
956 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
957 onabort = self._abort)
962 assert(self.p.stdout.readline() == '\n')
963 self.inprogress = None
968 def _slow_get(self, id):
969 assert(id.find('\n') < 0)
970 assert(id.find('\r') < 0)
972 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
975 p = subprocess.Popen(['git', 'cat-file', type, id],
976 stdout=subprocess.PIPE,
977 preexec_fn = _gitenv)
978 for blob in chunkyreader(p.stdout):
980 _git_wait('git cat-file', p)
988 treefile = ''.join(it)
989 for (mode, name, sha) in treeparse(treefile):
990 for blob in self.join(sha.encode('hex')):
992 elif type == 'commit':
993 treeline = ''.join(it).split('\n')[0]
994 assert(treeline.startswith('tree '))
995 for blob in self.join(treeline[5:]):
998 raise GitError('invalid object type %r: expected blob/tree/commit'
1002 """Generate a list of the content of all blobs that can be reached
1003 from an object. The hash given in 'id' must point to a blob, a tree
1004 or a commit. The content of all blobs that can be seen from trees or
1005 commits will be added to the list.
1008 for d in self._join(self.get(id)):
1010 except StopIteration: