1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
6 from __future__ import absolute_import
7 import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8 from array import array
9 from collections import namedtuple
10 from itertools import islice
11 from numbers import Integral
13 from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
14 from bup.compat import range
15 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
17 hostname, localtime, log,
20 mmap_read, mmap_readwrite,
22 progress, qprogress, shstr, stat_if_exists,
25 from bup.pwdgrp import username, userfullname
repodir = None # The default repository, once initialized

# Numeric object-type codes used in git pack files, and the reverse mapping.
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
class GitError(Exception):
    """Raised for git/repository related failures throughout this module."""
def _gitenv(repo_dir=None):
    # Environment for running git against repo_dir: a copy of os.environ
    # with GIT_DIR pointing at the (absolute) repository path.
    return merge_dict(os.environ, {'GIT_DIR': os.path.abspath(repo_dir)})
def _git_wait(cmd, p):
    # Reap subprocess p; a nonzero exit status raises GitError naming cmd.
    raise GitError('%s returned %d' % (shstr(cmd), rv))
def _git_capture(argv):
    # Run argv against the default repository and capture its stdout;
    # _git_wait checks the exit status.
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, env=_gitenv())
    _git_wait(repr(argv), p)
def git_config_get(option, repo_dir=None):
    # Read a single git config value via 'git config --get'.
    cmd = ('git', 'config', '--get', option)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         env=_gitenv(repo_dir=repo_dir))
    # Unexpected exit statuses are fatal.
    raise GitError('%s returned %d' % (cmd, rc))
def parse_tz_offset(s):
    """UTC offset in seconds."""
    # s is a '[-+]hhmm' style offset (cf. _tz_rx): hours and minutes
    # combined into seconds.
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.

# Characters allowed at the ends of / inside an author or committer name.
_start_end_char = r'[^ .,:;<>"\'\0\n]'
_content_char = r'[^\0\n<>]'
_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
    _start_end_char, _content_char, _start_end_char)
# Timezone offset like '+hhmm'/'-hhmm'.
_tz_rx = r'[-+]\d\d[0-5]\d'
_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag. Is there a formal commit blob spec?
_mergetag_rx = r'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
# Matches a whole commit object: tree, parent lines, author/committer
# lines, an optional mergetag, then the free-form message.
_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)
(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
def parse_commit(content):
    """Parse a git commit blob into a CommitInfo tuple (see _commit_rx)."""
    commit_match = re.match(_commit_rx, content)
    raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
def get_cat_data(cat_iterator, expected_type):
    """Consume a cat-pipe style iterator and return its joined data.

    The first item yielded must be an (oidx, type, size) header whose type
    equals expected_type; the remaining items are concatenated and returned.
    """
    header = next(cat_iterator)
    kind = header[1]
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    chunks = [chunk for chunk in cat_iterator]
    return ''.join(chunks)
def get_commit_items(id, cp):
    """Fetch the object named by id through cat-pipe cp and parse it as a
    commit, returning a CommitInfo tuple."""
    commit_data = get_cat_data(cp.get(id), 'commit')
    return parse_commit(commit_data)
def _local_git_date_str(epoch_sec):
    """Render epoch_sec as a git date string using the local UTC offset."""
    offset = utc_offset_str(epoch_sec)
    return '%d %s' % (epoch_sec, offset)
def _git_date_str(epoch_sec, tz_offset_sec):
    # Render epoch seconds plus an explicit offset (in seconds) as git's
    # '<seconds> [+-]hhmm' date string.
    offs = tz_offset_sec // 60
    return '%d %s%02d%02d' \
        '+' if offs >= 0 else '-',


def repo(sub = '', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, '.git')
    if os.path.exists(gd):
    return os.path.join(repo_dir, sub)
    # Abbreviate any embedded 40-hex-digit ids down to 7 digits for display.
    return re.sub(r'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)',

    # Express path relative to the repository root (stripping any
    # index-cache prefix) before abbreviating hashes.
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(''))
    if not fullrepo.endswith('/'):
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith('index-cache/'):
        path = path[len('index-cache/'):]
    return shorten_hash(path)

    # Candidate index directories: the repo's pack dir plus every
    # index-cache subdirectory.
    paths = [repo('objects/pack')]
    paths += glob.glob(repo('index-cache/*/.'))
def auto_midx(objdir):
    # Regenerate the .midx and bloom files for objdir via the bup
    # subcommands; failures are recorded with add_error() rather than raised.
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    # A "regular file" whose git representation is a tree is a chunked file.
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of

    * BUP_NORMAL : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith('.bupm'):
        # Metadata entries: CHUNKED when stored as a tree, NORMAL otherwise.
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    return (name, BUP_NORMAL)
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes the '<type> <size>\0' header followed by the content.
    header = '%s %d\0' % (type, len(content))

def shalist_item_sort_key(ent):
    # Sort key matching git's tree ordering (directories sort with a
    # trailing '/').
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    # Entries must be sorted git-style before encoding.
    shalist = sorted(shalist, key = shalist_item_sort_key)
    for (mode,name,bin) in shalist:
        assert(mode+0 == mode)
        assert(len(bin) == 20)
        # Each entry is '<octal mode> <name>\0<20-byte binary sha>'.
        s = '%o %s\0%s' % (mode,name,bin)
        assert(s[0] != '0') # 0-padded octal is not acceptable in a git tree

def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    while ofs < len(buf):
        z = buf.find('\0', ofs)
        # '<mode> <name>' precedes the NUL; 20 binary sha bytes follow it.
        spl = buf[ofs:z].split(' ', 1)
        assert(len(spl) == 2)
        sha = buf[z+1:z+1+20]
        yield (int(mode, 8), name, sha)
def _encode_packobj(type, content, compression_level=1):
    # Yield a pack-object header (size + type bits) followed by the
    # zlib-compressed content.
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    # Low 4 size bits combined with the numeric type code (see _typemap).
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
        if sz: szbits |= 0x80
    z = zlib.compressobj(compression_level)
    yield z.compress(content)

def _encode_looseobj(type, content, compression_level=1):
    # Loose objects are '<type> <size>\0' + content, zlib-compressed.
    z = zlib.compressobj(compression_level)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)

def _decode_looseobj(buf):
    # Inverse of _encode_looseobj: decompress, then split header from body.
    s = zlib.decompress(buf)
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)

def _decode_packobj(buf):
    # Inverse of _encode_packobj: parse the variable-length header, then
    # decompress the remainder.
    type = _typermap[(c & 0x70) >> 4]
        sz |= (c & 0x7f) << shift
            return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        return self._ofs_from_idx(idx)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            # Either this index's filename (the "source") or just True.
            return want_source and os.path.basename(self.name) or True
    def _idx_from_hash(self, hash):
        # Binary-search the sorted sha table for hash, with the initial
        # range narrowed via the 256-entry fanout table.
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
            # NOTE(review): '/' relies on Python 2 integer division here.
            mid = start + (end-start)/2
            v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # v1 layout: 256-entry fanout table, then (ofs32, sha) pairs of
        # 24 bytes each.
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack('!256I', self.map))
        self.fanout.append(0) # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        # Offset is the first 4 bytes of the 24-byte entry.
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        # The 20-byte sha follows the 4-byte offset in each entry.
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # v2 magic ('\377tOc') plus version number 2.
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # After the sha table: crc table (4B each), 32-bit offset table,
        # then the 64-bit offset overflow table.
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

        return int(self.nsha) # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
            # High bit set: real offset lives in the 64-bit table.
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]
    def __init__(self, dir, ignore_midx=False):
        # These lists mmap many index files; only one instance is expected
        # to exist at a time (tracked via _mpi_count).
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        self.do_bloom = False
        self.ignore_midx = ignore_midx

        assert(_mpi_count == 0)

        # Iteration and len() delegate to the merged set of pack indexes.
        return iter(idxmerge(self.packs))

        return sum(len(pack) for pack in self.packs)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        if hash in self.also:
        if self.do_bloom and self.bloom:
            # A negative bloom answer is definitive; a positive one means
            # we must fall through and search the real indexes.
            if self.bloom.exists(hash):
                self.do_bloom = False
                _total_searches -= 1 # was counted by bloom
        for i in xrange(len(self.packs)):
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been relaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
                # Keep the .idx files already covered by known midx files.
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     ' used by %s\n') % (n, mxf))
                # Prefer larger, then newer, midx files.
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                    except GitError as e:
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            # Only trust the bloom filter if it covers every known object.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

        """Insert an additional object in the list."""
def open_idx(filename):
    # Open a pack index (.idx, v1 or v2) or a midx file, dispatching on
    # the filename extension and the idx header magic.
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
                return PackIdxV2(filename, f)
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            return PackIdxV1(filename, f)
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return midx.PackMidx(filename)
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    # Default PackWriter objcache: an index list over this repo's packs.
    pack_dir = repo('objects/pack')
    return PackIdxList(pack_dir)
# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.

    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.repo_dir = repo_dir or repo()
        self.objcache_maker = objcache_maker
        self.compression_level = compression_level
        self.run_midx=run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            # Fall back to git's configured pack size limit, if any.
            max_pack_size = git_config_get('pack.packSizeLimit',
                                           repo_dir=self.repo_dir)
            if max_pack_size is not None:
                max_pack_size = parse_num(max_pack_size)
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)
    def __exit__(self, type, value, traceback):

        # Create the temporary pack file in the repo's objects directory
        # and write the pack header ('PACK', version 2, count patched later).
        objdir = dir = os.path.join(self.repo_dir, 'objects')
        fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
        self.file = os.fdopen(fd, 'w+b')
        # Keep a directory handle so renames can be fsynced in _end().
        self.parentfd = os.open(objdir, os.O_RDONLY)
        assert(name.endswith('.pack'))
        self.filename = name[:-5]
        self.file.write('PACK\0\0\0\2\0\0\0\0')
        # Per-first-byte buckets of (sha, crc, offset), matching idx fanout.
        self.idx = list(list() for i in xrange(256))
    def _raw_write(self, datalist, sha):
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
            # Re-raise write failures as GitError, preserving the traceback.
            raise GitError, e, sys.exc_info()[2]
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)

    def _update_idx(self, sha, crc, size):
        # Record (sha, crc, start offset), bucketed by the sha's first byte.
        self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
    def _write(self, sha, type, content):
        # Encode and append one object to the pack; roll over to a new
        # pack (breakpoint) when size/object limits are reached.
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0

    def _require_objcache(self):
        # Lazily create the object cache; not having one is fatal for
        # duplicate checking.
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
                "PackWriter not opened or can't check exists w/o objcache")
749 def exists(self, id, want_source=False):
750 """Return non-empty if an object is found in the object cache."""
751 self._require_objcache()
752 return self.objcache.exists(id, want_source=want_source)
754 def just_write(self, sha, type, content):
755 """Write an object to the pack file without checking for duplication."""
756 self._write(sha, type, content)
757 # If nothing else, gc doesn't have/want an objcache
758 if self.objcache is not None:
759 self.objcache.add(sha)
    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            # Make sure the objcache exists before recording the new object.
            self._require_objcache()
            self.just_write(sha, type, content)
769 def new_blob(self, blob):
770 """Create a blob object in the pack with the supplied content."""
771 return self.maybe_write('blob', blob)
773 def new_tree(self, shalist):
774 """Create a tree object in the pack."""
775 content = tree_encode(shalist)
776 return self.maybe_write('tree', content)
    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
        """Create a commit object in the pack. The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
            adate_str = _git_date_str(adate_sec, adate_tz)
            adate_str = _local_git_date_str(adate_sec)
            cdate_str = _git_date_str(cdate_sec, cdate_tz)
            cdate_str = _local_git_date_str(cdate_sec)
        # Assemble the commit body line by line; ids are hex-encoded.
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, adate_str))
        if committer: l.append('committer %s %s' % (committer, cdate_str))
        return self.maybe_write('commit', '\n'.join(l))
802 """Remove the pack file from disk."""
811 os.unlink(self.filename + '.pack')
818 def _end(self, run_midx=True):
820 if not f: return None
827 # update object count
829 cp = struct.pack('!i', self.count)
833 # calculate the pack sha1sum
836 for b in chunkyreader(f):
838 packbin = sum.digest()
840 fdatasync(f.fileno())
844 obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
845 nameprefix = os.path.join(self.repo_dir,
846 'objects/pack/pack-' + obj_list_sha)
847 if os.path.exists(self.filename + '.map'):
848 os.unlink(self.filename + '.map')
849 os.rename(self.filename + '.pack', nameprefix + '.pack')
850 os.rename(self.filename + '.idx', nameprefix + '.idx')
852 os.fsync(self.parentfd)
854 os.close(self.parentfd)
857 auto_midx(os.path.join(self.repo_dir, 'objects/pack'))
859 if self.on_pack_finish:
860 self.on_pack_finish(nameprefix)
864 def close(self, run_midx=True):
865 """Close the pack file and move it to its definitive path."""
866 return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        # Count entries whose offset needs the 64-bit overflow table.
            for entry in section:
                if entry[2] >= 2**31:
        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_f = open(filename, 'w+b')
        idx_f.truncate(index_len)
        fdatasync(idx_f.fileno())
        # The C helper fills the mmap'd index in place.
        idx_map = mmap_readwrite(idx_f, close=False)
            count = _helpers.write_idx(filename, idx_map, idx, self.count)
            assert(count == self.count)
        # Re-read the file to compute the trailing checksum and the name
        # (sha of the object list) for the final pack filename.
        idx_f = open(filename, 'a+b')
            b = idx_f.read(8 + 4*256)
            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20*self.count):
                obj_list_sum.update(b)
            namebase = obj_list_sum.hexdigest()
            for b in chunkyreader(idx_f):
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified. In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)). The limits
    restrict the result items to refs/heads or refs/tags. If both
    limits are specified, items from both sources will be included.
    """
    argv = ['git', 'show-ref']
        argv.append('--heads')
        argv.append('--tags')
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
        # Each output line is '<40-hex sha> <refname>'.
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))
def read_ref(refname, repo_dir = None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    # Pull at most two matches so ambiguity can be detected.
    l = tuple(islice(refs, 2))
def rev_list_invocation(ref_or_refs, count=None, format=None):
    # Build the 'git rev-list' argv for a single ref or a sequence of refs.
    if isinstance(ref_or_refs, compat.str_type):
        refs = (ref_or_refs,)
    argv = ['git', 'rev-list']
    if isinstance(count, Integral):
        argv.extend(['-n', str(count)])
        raise ValueError('unexpected count argument %r' % count)
        argv.append('--pretty=format:' + format)
        # Refs must never be mistaken for command-line options.
        assert not ref.startswith('-')
def rev_list(ref_or_refs, count=None, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list". If a format
    is not provided, yield one hex hash at a time. If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line. When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.
    """
    # parse and format must be supplied together.
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs, count=count,
                         env=_gitenv(repo_dir),
                         stdout = subprocess.PIPE)
        for line in p.stdout:
        line = p.stdout.readline()
            if not s.startswith('commit '):
                raise Exception('unexpected line ' + s)
            yield s, parse(p.stdout)
            line = p.stdout.readline()
    rv = p.wait() # not fatal
        raise GitError, 'git rev-list returned error %d' % rv
def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs. For now, every unique
    string in refs must resolve to a different commit or this
    function will fail."""
        # author_sec is the (UTC) epoch-seconds author timestamp.
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # First try committish as a ref name...
    head = read_ref(committish, repo_dir=repo_dir)
        debug2("resolved from ref: commit = %s\n" % head.encode('hex'))

    # ...then as a raw 40-digit hex id present in the pack indexes.
    pL = PackIdxList(repo('objects/pack', repo_dir=repo_dir))
    if len(committish) == 40:
        hash = committish.decode('hex')
def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    # Only heads and tags may be updated through this helper.
    assert(refname.startswith('refs/heads/') \
           or refname.startswith('refs/tags/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         env=_gitenv(repo_dir))
    _git_wait('git update-ref', p)


def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert(refname.startswith('refs/'))
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen(['git', 'update-ref', '-d', refname] + oldvalue,
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
        # Fall back to BUP_DIR, then ~/.bup, when no explicit path is given.
        repodir = os.environ.get('BUP_DIR')
            repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    d = repo() # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
    p = subprocess.Popen(['git', 'config', 'core.logAllRefUpdates', 'true'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    # A directory of packed objects is the signature of a bup/git repo.
    pst = stat_if_exists(top + '/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
    top_st = stat_if_exists(top)
        log('error: repository %r does not exist (see "bup help init")\n'
        log('error: %r is not a repository\n' % top)
1124 """Get Git's version and ensure a usable version is installed.
1126 The returned version is formatted as an ordered tuple with each position
1127 representing a digit in the version tag. For example, the following tuple
1128 would represent version 1.6.6.9:
1130 ('1', '6', '6', '9')
1134 p = subprocess.Popen(['git', '--version'],
1135 stdout=subprocess.PIPE)
1136 gvs = p.stdout.read()
1137 _git_wait('git --version', p)
1138 m = re.match(r'git version (\S+.\S+)', gvs)
1140 raise GitError('git --version weird output: %r' % gvs)
1141 _ver = tuple(m.group(1).split('.'))
1142 needed = ('1','5', '3', '1')
1144 raise GitError('git version %s or higher is required; you have %s'
1145 % ('.'.join(needed), '.'.join(_ver)))
class _AbortableIter:
    # Wraps an iterator so iteration can be cleanly aborted (used to
    # abandon a CatPipe object stream mid-read).
    def __init__(self, it, onabort = None):
        self.onabort = onabort
            return next(self.it)
        except StopIteration as e:
        """Abort iteration and call the abortion callback, if needed."""
1180 """Link to 'git cat-file' that is used to retrieve blob data."""
1181 def __init__(self, repo_dir = None):
1182 self.repo_dir = repo_dir
1183 wanted = ('1','5','6')
1185 log('error: git version must be at least 1.5.6\n')
1187 self.p = self.inprogress = None
1191 self.p.stdout.close()
1192 self.p.stdin.close()
1194 self.inprogress = None
1198 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1199 stdin=subprocess.PIPE,
1200 stdout=subprocess.PIPE,
1203 env=_gitenv(self.repo_dir))
1206 """Yield (oidx, type, size), followed by the data referred to by ref.
1207 If ref does not exist, only yield (None, None, None).
1210 if not self.p or self.p.poll() != None:
1213 poll_result = self.p.poll()
1214 assert(poll_result == None)
1216 log('get: opening %r while %r is open\n' % (ref, self.inprogress))
1217 assert(not self.inprogress)
1218 assert(ref.find('\n') < 0)
1219 assert(ref.find('\r') < 0)
1220 assert(not ref.startswith('-'))
1221 self.inprogress = ref
1222 self.p.stdin.write('%s\n' % ref)
1223 self.p.stdin.flush()
1224 hdr = self.p.stdout.readline()
1225 if hdr.endswith(' missing\n'):
1226 self.inprogress = None
1227 yield None, None, None
1229 info = hdr.split(' ')
1230 if len(info) != 3 or len(info[0]) != 40:
1231 raise GitError('expected object (id, type, size), got %r' % info)
1232 oidx, typ, size = info
1234 it = _AbortableIter(chunkyreader(self.p.stdout, size),
1235 onabort=self._abort)
1237 yield oidx, typ, size
1240 readline_result = self.p.stdout.readline()
1241 assert(readline_result == '\n')
1242 self.inprogress = None
1243 except Exception as e:
1247 def _join(self, it):
1248 _, typ, _ = next(it)
1253 treefile = ''.join(it)
1254 for (mode, name, sha) in tree_decode(treefile):
1255 for blob in self.join(sha.encode('hex')):
1257 elif typ == 'commit':
1258 treeline = ''.join(it).split('\n')[0]
1259 assert(treeline.startswith('tree '))
1260 for blob in self.join(treeline[5:]):
1263 raise GitError('invalid object type %r: expected blob/tree/commit'
1267 """Generate a list of the content of all blobs that can be reached
1268 from an object. The hash given in 'id' must point to a blob, a tree
1269 or a commit. The content of all blobs that can be seen from trees or
1270 commits will be added to the list.
1273 for d in self._join(self.get(id)):
1275 except StopIteration:
def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    # CatPipes are cached per absolute repository path in _cp.
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
        cp = CatPipe(repo_dir)


def tags(repo_dir = None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True):
        assert(n.startswith('refs/tags/'))
        tags[c].append(name) # more than one tag can point at 'c'
class MissingObject(KeyError):
    # Raised when a hash encountered during a walk isn't in the repository.
    def __init__(self, oid):
        KeyError.__init__(self, 'object %r is missing' % oid.encode('hex'))


WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a
# chunked file will have a chunk_path of ['']. So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
# item.path = ['foo', 'bar', 'baz.bup']
# item.chunk_path = ['', '2d3115e', '016b097']
# item.type = 'tree'
1326 def walk_object(get_ref, oidx, stop_at=None, include_data=None):
1327 """Yield everything reachable from oidx via get_ref (which must behave
1328 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
1329 returns true. Throw MissingObject if a hash encountered is
1330 missing from the repository, and don't read or return blob content
1331 in the data field unless include_data is set.
1334 # Maintain the pending stack on the heap to avoid stack overflow
1335 pending = [(oidx, [], [], None)]
1337 oidx, parent_path, chunk_path, mode = pending.pop()
1338 oid = oidx.decode('hex')
1339 if stop_at and stop_at(oidx):
1342 if (not include_data) and mode and stat.S_ISREG(mode):
1343 # If the object is a "regular file", then it's a leaf in
1344 # the graph, so we can skip reading the data if the caller
1345 # hasn't requested it.
1346 yield WalkItem(oid=oid, type='blob',
1347 chunk_path=chunk_path, path=parent_path,
1352 item_it = get_ref(oidx)
1353 get_oidx, typ, _ = next(item_it)
1355 raise MissingObject(oidx.decode('hex'))
1356 if typ not in ('blob', 'commit', 'tree'):
1357 raise Exception('unexpected repository object type %r' % typ)
1359 # FIXME: set the mode based on the type when the mode is None
1360 if typ == 'blob' and not include_data:
1361 # Dump data until we can ask cat_pipe not to fetch it
1362 for ignored in item_it:
1366 data = ''.join(item_it)
1368 yield WalkItem(oid=oid, type=typ,
1369 chunk_path=chunk_path, path=parent_path,
1371 data=(data if include_data else None))
1374 commit_items = parse_commit(data)
1375 for pid in commit_items.parents:
1376 pending.append((pid, parent_path, chunk_path, mode))
1377 pending.append((commit_items.tree, parent_path, chunk_path,
1378 hashsplit.GIT_MODE_TREE))
1380 for mode, name, ent_id in tree_decode(data):
1381 demangled, bup_type = demangle_name(name, mode)
1383 sub_path = parent_path
1384 sub_chunk_path = chunk_path + [name]
1386 sub_path = parent_path + [name]
1387 if bup_type == BUP_CHUNKED:
1388 sub_chunk_path = ['']
1390 sub_chunk_path = chunk_path
1391 pending.append((ent_id.encode('hex'), sub_path, sub_chunk_path,