X-Git-Url: https://arthur.barton.de/gitweb/?a=blobdiff_plain;f=lib%2Fbup%2Fvfs.py;h=99ca85d39426a0de4e35617625872d9b804f9491;hb=aeb8ce7d7974a4e6f472ae546d3baab90e1e8aa1;hp=454355337c4b06bde89f231657be66647a7fea14;hpb=f3e3307ebee6b88b118a447b0f999232eb948696;p=bup.git diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py index 4543553..99ca85d 100644 --- a/lib/bup/vfs.py +++ b/lib/bup/vfs.py @@ -1,110 +1,145 @@ -"""Virtual File System representing bup's repository contents. +"""Virtual File System interface to bup repository content. + +This module provides a path-based interface to the content of a bup +repository. + +The VFS is structured like this: + + /SAVE-NAME/latest/... + /SAVE-NAME/SAVE-DATE/... + /.tag/TAG-NAME/... + +Each path is represented by an item that has least an item.meta which +may be either a Metadata object, or an integer mode. Functions like +item_mode() and item_size() will return the mode and size in either +case. Any item.meta Metadata instances must not be modified directly. +Make a copy to modify via item.meta.copy() if needed, or call +copy_item(). + +The want_meta argument is advisory for calls that accept it, and it +may not be honored. Callers must be able to handle an item.meta value +that is either an instance of Metadata or an integer mode, perhaps +via item_mode() or augment_item_meta(). + +Setting want_meta=False is rarely desirable since it can limit the VFS +to only the metadata that git itself can represent, and so for +example, fifos and sockets will appear to be regular files +(e.g. S_ISREG(item_mode(item)) will be true). But the option is still +provided because it may be more efficient when just the path names or +the more limited metadata is sufficient. + +Any given metadata object's size may be None, in which case the size +can be computed via item_size() or augment_item_meta(..., +include_size=True). + +When traversing a directory using functions like contents(), the meta +value for any directories other than '.' will be a default directory +mode, not a Metadata object. This is because the actual metadata for +a directory is stored inside the directory (see +fill_in_metadata_if_dir() or ensure_item_has_metadata()). + +Commit items represent commits (e.g. /.tag/some-commit or +/foo/latest), and for most purposes, they appear as the underlying +tree. S_ISDIR(item_mode(item)) will return true for both tree Items +and Commits and the commit's oid is the tree hash; the commit hash is +item.coid. -The vfs.py library makes it possible to expose contents from bup's repository -and abstracts internal name mangling and storage from the exposition layer. """ -import os, re, stat, time -from bup import git -from helpers import * -from bup.hashsplit import GIT_MODE_TREE, GIT_MODE_FILE -EMPTY_SHA='\0'*20 - -_cp = None -def cp(): - """Create a git.CatPipe object or reuse the already existing one.""" - global _cp - if not _cp: - _cp = git.CatPipe() - return _cp - -class NodeError(Exception): - """VFS base exception.""" - pass - -class NoSuchFile(NodeError): - """Request of a file that does not exist.""" - pass - -class NotDir(NodeError): - """Attempt to do a directory action on a file that is not one.""" - pass - -class NotFile(NodeError): - """Access to a node that does not represent a file.""" - pass - -class TooManySymlinks(NodeError): - """Symlink dereferencing level is too deep.""" - pass - - -def _treeget(hash): - it = cp().get(hash.encode('hex')) - type = it.next() - assert(type == 'tree') - return git.tree_decode(''.join(it)) - - -def _tree_decode(hash): - tree = [(int(name,16),stat.S_ISDIR(mode),sha) - for (mode,name,sha) - in _treeget(hash)] - assert(tree == list(sorted(tree))) - return tree - - -def _chunk_len(hash): - return sum(len(b) for b in cp().join(hash.encode('hex'))) - - -def _last_chunk_info(hash): - tree = _tree_decode(hash) - assert(tree) - (ofs,isdir,sha) = tree[-1] - if isdir: - (subofs, sublen) = _last_chunk_info(sha) - return (ofs+subofs, sublen) - else: - return (ofs, _chunk_len(sha)) - - -def _total_size(hash): - (lastofs, lastsize) = _last_chunk_info(hash) - return lastofs + lastsize - - -def _chunkiter(hash, startofs): +from __future__ import absolute_import, print_function +from collections import namedtuple +from errno import EINVAL, ELOOP, ENOENT, ENOTDIR +from itertools import chain, dropwhile, groupby, tee +from random import randrange +from stat import S_IFDIR, S_IFLNK, S_IFREG, S_ISDIR, S_ISLNK, S_ISREG +from time import localtime, strftime +import exceptions, re, sys + +from bup import client, git, metadata +from bup.compat import range +from bup.git import BUP_CHUNKED, cp, get_commit_items, parse_commit, tree_decode +from bup.helpers import debug2, last +from bup.metadata import Metadata +from bup.repo import LocalRepo, RemoteRepo + + +class IOError(exceptions.IOError): + def __init__(self, errno, message, terminus=None): + exceptions.IOError.__init__(self, errno, message) + self.terminus = terminus + +default_file_mode = S_IFREG | 0o644 +default_dir_mode = S_IFDIR | 0o755 +default_symlink_mode = S_IFLNK | 0o755 + +def _default_mode_for_gitmode(gitmode): + if S_ISREG(gitmode): + return default_file_mode + if S_ISDIR(gitmode): + return default_dir_mode + if S_ISLNK(gitmode): + return default_symlink_mode + raise Exception('unexpected git mode ' + oct(gitmode)) + +def _normal_or_chunked_file_size(repo, oid): + """Return the size of the normal or chunked file indicated by oid.""" + # FIXME: --batch-format CatPipe? + it = repo.cat(oid.encode('hex')) + _, obj_t, size = next(it) + ofs = 0 + while obj_t == 'tree': + mode, name, last_oid = last(tree_decode(''.join(it))) + ofs += int(name, 16) + it = repo.cat(last_oid.encode('hex')) + _, obj_t, size = next(it) + return ofs + sum(len(b) for b in it) + +def _skip_chunks_before_offset(tree, offset): + prev_ent = next(tree, None) + if not prev_ent: + return tree + ent = None + for ent in tree: + ent_ofs = int(ent[1], 16) + if ent_ofs > offset: + return chain([prev_ent, ent], tree) + if ent_ofs == offset: + return chain([ent], tree) + prev_ent = ent + return [prev_ent] + +def _tree_chunks(repo, tree, startofs): + "Tree should be a sequence of (name, mode, hash) as per tree_decode()." assert(startofs >= 0) - tree = _tree_decode(hash) - - # skip elements before startofs - for i in xrange(len(tree)): - if i+1 >= len(tree) or tree[i+1][0] > startofs: - break - first = i - - # iterate through what's left - for i in xrange(first, len(tree)): - (ofs,isdir,sha) = tree[i] - skipmore = startofs-ofs + # name is the chunk's hex offset in the original file + for mode, name, oid in _skip_chunks_before_offset(tree, startofs): + ofs = int(name, 16) + skipmore = startofs - ofs if skipmore < 0: skipmore = 0 - if isdir: - for b in _chunkiter(sha, skipmore): + it = repo.cat(oid.encode('hex')) + _, obj_t, size = next(it) + data = ''.join(it) + if S_ISDIR(mode): + assert obj_t == 'tree' + for b in _tree_chunks(repo, tree_decode(data), skipmore): yield b else: - yield ''.join(cp().join(sha.encode('hex')))[skipmore:] - + assert obj_t == 'blob' + yield data[skipmore:] class _ChunkReader: - def __init__(self, hash, isdir, startofs): + def __init__(self, repo, oid, startofs): + it = repo.cat(oid.encode('hex')) + _, obj_t, size = next(it) + isdir = obj_t == 'tree' + data = ''.join(it) if isdir: - self.it = _chunkiter(hash, startofs) + self.it = _tree_chunks(repo, tree_decode(data), startofs) self.blob = None else: self.it = None - self.blob = ''.join(cp().join(hash.encode('hex')))[startofs:] + self.blob = data[startofs:] self.ofs = startofs def next(self, size): @@ -125,31 +160,36 @@ class _ChunkReader: self.ofs += len(out) return out - class _FileReader(object): - def __init__(self, hash, size, isdir): - self.hash = hash + def __init__(self, repo, oid, known_size=None): + assert len(oid) == 20 + self.oid = oid self.ofs = 0 - self.size = size - self.isdir = isdir self.reader = None - + self._repo = repo + self._size = known_size + + def _compute_size(self): + if not self._size: + self._size = _normal_or_chunked_file_size(self._repo, self.oid) + return self._size + def seek(self, ofs): - if ofs > self.size: - self.ofs = self.size - elif ofs < 0: - self.ofs = 0 - else: - self.ofs = ofs + if ofs < 0 or ofs > self._compute_size(): + raise IOError(EINVAL, 'Invalid seek offset: %d' % ofs) + self.ofs = ofs def tell(self): return self.ofs - def read(self, count = -1): + def read(self, count=-1): + size = self._compute_size() + if self.ofs >= size: + return '' if count < 0: - count = self.size - self.ofs + count = size - self.ofs if not self.reader or self.reader.ofs != self.ofs: - self.reader = _ChunkReader(self.hash, self.isdir, self.ofs) + self.reader = _ChunkReader(self._repo, self.oid, self.ofs) try: buf = self.reader.next(count) except: @@ -161,374 +201,818 @@ class _FileReader(object): def close(self): pass + def __enter__(self): + return self + def __exit__(self, type, value, traceback): + self.close() + return False -class Node: - """Base class for file representation.""" - def __init__(self, parent, name, mode, hash): - self.parent = parent - self.name = name - self.mode = mode - self.hash = hash - self.ctime = self.mtime = self.atime = 0 - self._subs = None - - def __cmp__(a, b): - return cmp(a and a.name or None, b and b.name or None) - - def __iter__(self): - return iter(self.subs()) - - def fullname(self, stop_at=None): - """Get this file's full path.""" - assert(self != stop_at) # would be the empty string; too weird - if self.parent and self.parent != stop_at: - return os.path.join(self.parent.fullname(stop_at=stop_at), - self.name) - else: - return self.name - - def _mksubs(self): - self._subs = {} - - def subs(self): - """Get a list of nodes that are contained in this node.""" - if self._subs == None: - self._mksubs() - return sorted(self._subs.values()) - - def sub(self, name): - """Get node named 'name' that is contained in this node.""" - if self._subs == None: - self._mksubs() - ret = self._subs.get(name) - if not ret: - raise NoSuchFile("no file %r in %r" % (name, self.name)) - return ret - - def top(self): - """Return the very top node of the tree.""" - if self.parent: - return self.parent.top() +_multiple_slashes_rx = re.compile(r'//+') + +def _decompose_path(path): + """Return a boolean indicating whether the path is absolute, and a + reversed list of path elements, omitting any occurrences of "." + and ignoring any leading or trailing slash. If the path is + effectively '/' or '.', return an empty list. + + """ + path = re.sub(_multiple_slashes_rx, '/', path) + if path == '/': + return True, True, [] + is_absolute = must_be_dir = False + if path.startswith('/'): + is_absolute = True + path = path[1:] + for suffix in ('/', '/.'): + if path.endswith(suffix): + must_be_dir = True + path = path[:-len(suffix)] + parts = [x for x in path.split('/') if x != '.'] + parts.reverse() + if not parts: + must_be_dir = True # e.g. path was effectively '.' or '/', etc. + return is_absolute, must_be_dir, parts + + +Item = namedtuple('Item', ('meta', 'oid')) +Chunky = namedtuple('Chunky', ('meta', 'oid')) +Root = namedtuple('Root', ('meta')) +Tags = namedtuple('Tags', ('meta')) +RevList = namedtuple('RevList', ('meta', 'oid')) +Commit = namedtuple('Commit', ('meta', 'oid', 'coid')) + +item_types = frozenset((Item, Chunky, Root, Tags, RevList, Commit)) +real_tree_types = frozenset((Item, Commit)) + +_root = Root(meta=default_dir_mode) +_tags = Tags(meta=default_dir_mode) + + +### vfs cache + +### A general purpose shared cache with (currently) cheap random +### eviction. At the moment there is no weighting so a single commit +### item is just as likely to be evicted as an entire "rev-list". See +### is_valid_cache_key for a description of the expected content. + +_cache = {} +_cache_keys = [] +_cache_max_items = 30000 + +def clear_cache(): + global _cache, _cache_keys + _cache = {} + _cache_keys = [] + +def is_valid_cache_key(x): + """Return logically true if x looks like it could be a valid cache key + (with respect to structure). Current valid cache entries: + res:... -> resolution + commit_oid -> commit + commit_oid + ':r' -> rev-list + i.e. rev-list -> {'.', commit, '2012...', next_commit, ...} + """ + # Suspect we may eventually add "(container_oid, name) -> ...", and others. + x_t = type(x) + if x_t is bytes: + if len(x) == 20: + return True + if len(x) == 22 and x.endswith(b':r'): + return True + if x.startswith('res:'): + return True + +def cache_get(key): + global _cache + assert is_valid_cache_key(key) + return _cache.get(key) + +def cache_notice(key, value): + global _cache, _cache_keys, _cache_max_items + assert is_valid_cache_key(key) + if key in _cache: + return + if len(_cache) < _cache_max_items: + _cache_keys.append(key) + _cache[key] = value + return + victim_i = randrange(0, len(_cache_keys)) + victim = _cache_keys[victim_i] + del _cache[victim] + _cache_keys[victim_i] = key + _cache[key] = value + +def cache_get_commit_item(oid, need_meta=True): + """Return the requested tree item if it can be found in the cache. + When need_meta is true don't return a cached item that only has a + mode.""" + # tree might be stored independently, or as '.' with its entries. + item = cache_get(oid) + if item: + if not need_meta: + return item + if isinstance(item.meta, Metadata): + return item + entries = cache_get(oid + b':r') + if entries: + return entries['.'] + +def cache_get_revlist_item(oid, need_meta=True): + commit = cache_get_commit_item(oid, need_meta=need_meta) + if commit: + return RevList(oid=oid, meta=commit.meta) + +def copy_item(item): + """Return a completely independent copy of item, such that + modifications will not affect the original. + + """ + meta = getattr(item, 'meta', None) + if isinstance(meta, Metadata): + return(item._replace(meta=meta.copy())) + return item + +def item_mode(item): + """Return the integer mode (stat st_mode) for item.""" + m = item.meta + if isinstance(m, Metadata): + return m.mode + return m + +def _read_dir_meta(bupm): + # This is because save writes unmodified Metadata() entries for + # fake parents -- test-save-strip-graft.sh demonstrates. + m = Metadata.read(bupm) + if not m: + return default_dir_mode + assert m.mode is not None + if m.size is None: + m.size = 0 + return m + +def tree_data_and_bupm(repo, oid): + """Return (tree_bytes, bupm_oid) where bupm_oid will be None if the + tree has no metadata (i.e. older bup save, or non-bup tree). + + """ + assert len(oid) == 20 + it = repo.cat(oid.encode('hex')) + _, item_t, size = next(it) + data = ''.join(it) + if item_t == 'commit': + commit = parse_commit(data) + it = repo.cat(commit.tree) + _, item_t, size = next(it) + data = ''.join(it) + assert item_t == 'tree' + elif item_t != 'tree': + raise Exception('%r is not a tree or commit' % oid.encode('hex')) + for _, mangled_name, sub_oid in tree_decode(data): + if mangled_name == '.bupm': + return data, sub_oid + if mangled_name > '.bupm': + break + return data, None + +def _find_treeish_oid_metadata(repo, oid): + """Return the metadata for the tree or commit oid, or None if the tree + has no metadata (i.e. older bup save, or non-bup tree). + + """ + tree_data, bupm_oid = tree_data_and_bupm(repo, oid) + if bupm_oid: + with _FileReader(repo, bupm_oid) as meta_stream: + return _read_dir_meta(meta_stream) + return None + +def _readlink(repo, oid): + return ''.join(repo.join(oid.encode('hex'))) + +def readlink(repo, item): + """Return the link target of item, which must be a symlink. Reads the + target from the repository if necessary.""" + assert repo + assert S_ISLNK(item_mode(item)) + if isinstance(item.meta, Metadata): + target = item.meta.symlink_target + if target: + return target + return _readlink(repo, item.oid) + +def _compute_item_size(repo, item): + mode = item_mode(item) + if S_ISREG(mode): + size = _normal_or_chunked_file_size(repo, item.oid) + return size + if S_ISLNK(mode): + return len(_readlink(repo, item.oid)) + return 0 + +def item_size(repo, item): + """Return the size of item, computing it if necessary.""" + m = item.meta + if isinstance(m, Metadata) and m.size is not None: + return m.size + return _compute_item_size(repo, item) + +def tree_data_reader(repo, oid): + """Return an open reader for all of the data contained within oid. If + oid refers to a tree, recursively concatenate all of its contents.""" + return _FileReader(repo, oid) + +def fopen(repo, item): + """Return an open reader for the given file item.""" + assert S_ISREG(item_mode(item)) + return tree_data_reader(repo, item.oid) + +def _commit_item_from_data(oid, data): + info = parse_commit(data) + return Commit(meta=default_dir_mode, + oid=info.tree.decode('hex'), + coid=oid) + +def _commit_item_from_oid(repo, oid, require_meta): + commit = cache_get_commit_item(oid, need_meta=require_meta) + if commit and ((not require_meta) or isinstance(commit.meta, Metadata)): + return commit + it = repo.cat(oid.encode('hex')) + _, typ, size = next(it) + assert typ == 'commit' + commit = _commit_item_from_data(oid, ''.join(it)) + if require_meta: + meta = _find_treeish_oid_metadata(repo, commit.oid) + if meta: + commit = commit._replace(meta=meta) + cache_notice(oid, commit) + return commit + +def _revlist_item_from_oid(repo, oid, require_meta): + commit = _commit_item_from_oid(repo, oid, require_meta) + return RevList(oid=oid, meta=commit.meta) + +def root_items(repo, names=None, want_meta=True): + """Yield (name, item) for the items in '/' in the VFS. Return + everything if names is logically false, otherwise return only + items with a name in the collection. + + """ + # FIXME: what about non-leaf refs like 'refs/heads/foo/bar/baz? + + global _root, _tags + if not names: + yield '.', _root + yield '.tag', _tags + # FIXME: maybe eventually support repo.clone() or something + # and pass in two repos, so we can drop the tuple() and stream + # in parallel (i.e. meta vs refs). + for name, oid in tuple(repo.refs([], limit_to_heads=True)): + assert(name.startswith('refs/heads/')) + yield name[11:], _revlist_item_from_oid(repo, oid, want_meta) + return + + if '.' in names: + yield '.', _root + if '.tag' in names: + yield '.tag', _tags + for ref in names: + if ref in ('.', '.tag'): + continue + it = repo.cat('refs/heads/' + ref) + oidx, typ, size = next(it) + if not oidx: + for _ in it: pass + continue + assert typ == 'commit' + commit = parse_commit(''.join(it)) + yield ref, _revlist_item_from_oid(repo, oidx.decode('hex'), want_meta) + +def ordered_tree_entries(tree_data, bupm=None): + """Yields (name, mangled_name, kind, gitmode, oid) for each item in + tree, sorted by name. + + """ + # Sadly, the .bupm entries currently aren't in git tree order, + # i.e. they don't account for the fact that git sorts trees + # (including our chunked trees) as if their names ended with "/", + # so "fo" sorts after "fo." iff fo is a directory. This makes + # streaming impossible when we need the metadata. + def result_from_tree_entry(tree_entry): + gitmode, mangled_name, oid = tree_entry + name, kind = git.demangle_name(mangled_name, gitmode) + return name, mangled_name, kind, gitmode, oid + + tree_ents = (result_from_tree_entry(x) for x in tree_decode(tree_data)) + if bupm: + tree_ents = sorted(tree_ents, key=lambda x: x[0]) + for ent in tree_ents: + yield ent + +def tree_items(oid, tree_data, names=frozenset(), bupm=None): + + def tree_item(ent_oid, kind, gitmode): + if kind == BUP_CHUNKED: + meta = Metadata.read(bupm) if bupm else default_file_mode + return Chunky(oid=ent_oid, meta=meta) + + if S_ISDIR(gitmode): + # No metadata here (accessable via '.' inside ent_oid). + return Item(meta=default_dir_mode, oid=ent_oid) + + return Item(oid=ent_oid, + meta=(Metadata.read(bupm) if bupm \ + else _default_mode_for_gitmode(gitmode))) + + assert len(oid) == 20 + if not names: + dot_meta = _read_dir_meta(bupm) if bupm else default_dir_mode + yield '.', Item(oid=oid, meta=dot_meta) + tree_entries = ordered_tree_entries(tree_data, bupm) + for name, mangled_name, kind, gitmode, ent_oid in tree_entries: + if mangled_name == '.bupm': + continue + assert name != '.' + yield name, tree_item(ent_oid, kind, gitmode) + return + + # Assumes the tree is properly formed, i.e. there are no + # duplicates, and entries will be in git tree order. + if type(names) not in (frozenset, set): + names = frozenset(names) + remaining = len(names) + + # Account for the bupm sort order issue (cf. ordered_tree_entries above) + last_name = max(names) if bupm else max(names) + '/' + + if '.' in names: + dot_meta = _read_dir_meta(bupm) if bupm else default_dir_mode + yield '.', Item(oid=oid, meta=dot_meta) + if remaining == 1: + return + remaining -= 1 + + tree_entries = ordered_tree_entries(tree_data, bupm) + for name, mangled_name, kind, gitmode, ent_oid in tree_entries: + if mangled_name == '.bupm': + continue + assert name != '.' + if name not in names: + if name > last_name: + break # given bupm sort order, we're finished + if (kind == BUP_CHUNKED or not S_ISDIR(gitmode)) and bupm: + Metadata.read(bupm) + continue + yield name, tree_item(ent_oid, kind, gitmode) + if remaining == 1: + break + remaining -= 1 + +def tree_items_with_meta(repo, oid, tree_data, names): + # For now, the .bupm order doesn't quite match git's, and we don't + # load the tree data incrementally anyway, so we just work in RAM + # via tree_data. + assert len(oid) == 20 + bupm = None + for _, mangled_name, sub_oid in tree_decode(tree_data): + if mangled_name == '.bupm': + bupm = _FileReader(repo, sub_oid) + break + if mangled_name > '.bupm': + break + for item in tree_items(oid, tree_data, names, bupm): + yield item + +_save_name_rx = re.compile(r'^\d\d\d\d-\d\d-\d\d-\d{6}(-\d+)?$') + +def _reverse_suffix_duplicates(strs): + """Yields the elements of strs, with any runs of duplicate values + suffixed with -N suffixes, where the zero padded integer N + decreases to 0 by 1 (e.g. 10, 09, ..., 00). + + """ + for name, duplicates in groupby(strs): + ndup = len(tuple(duplicates)) + if ndup == 1: + yield name else: - return self + ndig = len(str(ndup - 1)) + fmt = '%s-' + '%0' + str(ndig) + 'd' + for i in range(ndup - 1, -1, -1): + yield fmt % (name, i) + +def parse_rev(f): + items = f.readline().split(None) + assert len(items) == 2 + tree, auth_sec = items + return tree.decode('hex'), int(auth_sec) + +def _name_for_rev(rev): + commit_oidx, (tree_oid, utc) = rev + return strftime('%Y-%m-%d-%H%M%S', localtime(utc)) + +def _item_for_rev(rev): + commit_oidx, (tree_oid, utc) = rev + coid = commit_oidx.decode('hex') + item = cache_get_commit_item(coid, need_meta=False) + if item: + return item + item = Commit(meta=default_dir_mode, oid=tree_oid, coid=coid) + cache_notice(item.coid, item) + return item + +def cache_commit(repo, oid): + """Build, cache, and return a "name -> commit_item" dict of the entire + commit rev-list. - def fs_top(self): - """Return the top node of the particular backup set. + """ + # For now, always cache with full metadata + entries = {} + entries['.'] = _revlist_item_from_oid(repo, oid, True) + revs = repo.rev_list((oid.encode('hex'),), format='%T %at', + parse=parse_rev) + rev_items, rev_names = tee(revs) + revs = None # Don't disturb the tees + rev_names = _reverse_suffix_duplicates(_name_for_rev(x) for x in rev_names) + rev_items = (_item_for_rev(x) for x in rev_items) + latest = None + for item in rev_items: + latest = latest or item + name = next(rev_names) + entries[name] = item + entries['latest'] = latest + cache_notice(latest.coid + b':r', entries) + return entries + +def revlist_items(repo, oid, names): + assert len(oid) == 20 + + # Special case '.' instead of caching the whole history since it's + # the only way to get the metadata for the commit. + if names and all(x == '.' for x in names): + yield '.', _revlist_item_from_oid(repo, oid, True) + return + + # For now, don't worry about the possibility of the contents being + # "too big" for the cache. + entries = cache_get(oid + b':r') + if not entries: + entries = cache_commit(repo, oid) + + if not names: + for name in sorted(entries.keys()): + yield name, entries[name] + return + + names = frozenset(name for name in names + if _save_name_rx.match(name) or name in ('.', 'latest')) + + if '.' in names: + yield '.', entries['.'] + for name in (n for n in names if n != '.'): + commit = entries.get(name) + if commit: + yield name, commit + +def tags_items(repo, names): + global _tags + + def tag_item(oid): + assert len(oid) == 20 + oidx = oid.encode('hex') + it = repo.cat(oidx) + _, typ, size = next(it) + if typ == 'commit': + return cache_get_commit_item(oid, need_meta=False) \ + or _commit_item_from_data(oid, ''.join(it)) + for _ in it: pass + if typ == 'blob': + return Item(meta=default_file_mode, oid=oid) + elif typ == 'tree': + return Item(meta=default_dir_mode, oid=oid) + raise Exception('unexpected tag type ' + typ + ' for tag ' + name) + + if not names: + yield '.', _tags + # We have to pull these all into ram because tag_item calls cat() + for name, oid in tuple(repo.refs(names, limit_to_tags=True)): + assert(name.startswith('refs/tags/')) + name = name[10:] + yield name, tag_item(oid) + return + + # Assumes no duplicate refs + if type(names) not in (frozenset, set): + names = frozenset(names) + remaining = len(names) + last_name = max(names) + if '.' in names: + yield '.', _tags + if remaining == 1: + return + remaining -= 1 + + for name, oid in repo.refs(names, limit_to_tags=True): + assert(name.startswith('refs/tags/')) + name = name[10:] + if name > last_name: + return + if name not in names: + continue + yield name, tag_item(oid) + if remaining == 1: + return + remaining -= 1 + +def contents(repo, item, names=None, want_meta=True): + """Yields information about the items contained in item. Yields + (name, item) for each name in names, if the name exists, in an + unspecified order. If there are no names, then yields (name, + item) for all items, including, a first item named '.' + representing the container itself. + + The meta value for any directories other than '.' will be a + default directory mode, not a Metadata object. This is because + the actual metadata for a directory is stored inside the directory + (see fill_in_metadata_if_dir() or ensure_item_has_metadata()). + + Note that want_meta is advisory. For any given item, item.meta + might be a Metadata instance or a mode, and if the former, + meta.size might be None. Missing sizes can be computed via via + item_size() or augment_item_meta(..., include_size=True). + + Do not modify any item.meta Metadata instances directly. If + needed, make a copy via item.meta.copy() and modify that instead. - If this node isn't inside a backup set, return the root level. - """ - if self.parent and not isinstance(self.parent, CommitList): - return self.parent.fs_top() + """ + # Q: are we comfortable promising '.' first when no names? + global _root, _tags + assert repo + assert S_ISDIR(item_mode(item)) + item_t = type(item) + if item_t in real_tree_types: + it = repo.cat(item.oid.encode('hex')) + _, obj_t, size = next(it) + data = ''.join(it) + if obj_t != 'tree': + for _ in it: pass + # Note: it shouldn't be possible to see an Item with type + # 'commit' since a 'commit' should always produce a Commit. + raise Exception('unexpected git ' + obj_t) + if want_meta: + item_gen = tree_items_with_meta(repo, item.oid, data, names) else: - return self - - def _lresolve(self, parts): - #debug2('_lresolve %r in %r\n' % (parts, self.name)) - if not parts: - return self - (first, rest) = (parts[0], parts[1:]) - if first == '.': - return self._lresolve(rest) - elif first == '..': - if not self.parent: - raise NoSuchFile("no parent dir for %r" % self.name) - return self.parent._lresolve(rest) - elif rest: - return self.sub(first)._lresolve(rest) + item_gen = tree_items(item.oid, data, names) + elif item_t == RevList: + item_gen = revlist_items(repo, item.oid, names) + elif item_t == Root: + item_gen = root_items(repo, names, want_meta) + elif item_t == Tags: + item_gen = tags_items(repo, names) + else: + raise Exception('unexpected VFS item ' + str(item)) + for x in item_gen: + yield x + +def _resolve_path(repo, path, parent=None, want_meta=True, deref=False): + cache_key = b'res:%d%d%d:%s\0%s' \ + % (bool(want_meta), bool(deref), repo.id(), + ('/'.join(x[0] for x in parent) if parent else ''), + '/'.join(path)) + resolution = cache_get(cache_key) + if resolution: + return resolution + + def notice_resolution(r): + cache_notice(cache_key, r) + return r + + def raise_dir_required_but_not_dir(path, parent, past): + raise IOError(ENOTDIR, + "path %r%s resolves to non-directory %r" + % (path, + ' (relative to %r)' % parent if parent else '', + past), + terminus=past) + global _root + assert repo + assert len(path) + if parent: + for x in parent: + assert len(x) == 2 + assert type(x[0]) in (bytes, str) + assert type(x[1]) in item_types + assert parent[0][1] == _root + if not S_ISDIR(item_mode(parent[-1][1])): + raise IOError(ENOTDIR, + 'path resolution parent %r is not a directory' + % (parent,)) + is_absolute, must_be_dir, future = _decompose_path(path) + if must_be_dir: + deref = True + if not future: # path was effectively '.' or '/' + if is_absolute: + return notice_resolution((('', _root),)) + if parent: + return notice_resolution(tuple(parent)) + return notice_resolution((('', _root),)) + if is_absolute: + past = [('', _root)] + else: + past = list(parent) if parent else [('', _root)] + hops = 0 + while True: + if not future: + if must_be_dir and not S_ISDIR(item_mode(past[-1][1])): + raise_dir_required_but_not_dir(path, parent, past) + return notice_resolution(tuple(past)) + segment = future.pop() + if segment == '..': + assert len(past) > 0 + if len(past) > 1: # .. from / is / + assert S_ISDIR(item_mode(past[-1][1])) + past.pop() else: - return self.sub(first) - - def lresolve(self, path, stay_inside_fs=False): - """Walk into a given sub-path of this node. - - If the last element is a symlink, leave it as a symlink, don't resolve - it. (like lstat()) - """ - start = self - if not path: - return start - if path.startswith('/'): - if stay_inside_fs: - start = self.fs_top() - else: - start = self.top() - path = path[1:] - parts = re.split(r'/+', path or '.') - if not parts[-1]: - parts[-1] = '.' - #debug2('parts: %r %r\n' % (path, parts)) - return start._lresolve(parts) - - def resolve(self, path = ''): - """Like lresolve(), and dereference it if it was a symlink.""" - return self.lresolve(path).lresolve('.') - - def try_resolve(self, path = ''): - """Like resolve(), but don't worry if a symlink uses an invalid path. - - Returns an error if any intermediate nodes were invalid. - """ - n = self.lresolve(path) - try: - n = n.lresolve('.') - except NoSuchFile: - pass - return n - - def nlinks(self): - """Get the number of hard links to the current node.""" - if self._subs == None: - self._mksubs() - return 1 - - def size(self): - """Get the size of the current node.""" - return 0 - - def open(self): - """Open the current node. It is an error to open a non-file node.""" - raise NotFile('%s is not a regular file' % self.name) - - -class File(Node): - """A normal file from bup's repository.""" - def __init__(self, parent, name, mode, hash, bupmode): - Node.__init__(self, parent, name, mode, hash) - self.bupmode = bupmode - self._cached_size = None - self._filereader = None - - def open(self): - """Open the file.""" - # You'd think FUSE might call this only once each time a file is - # opened, but no; it's really more of a refcount, and it's called - # once per read(). Thus, it's important to cache the filereader - # object here so we're not constantly re-seeking. - if not self._filereader: - self._filereader = _FileReader(self.hash, self.size(), - self.bupmode == git.BUP_CHUNKED) - self._filereader.seek(0) - return self._filereader - - def size(self): - """Get this file's size.""" - if self._cached_size == None: - debug1('<<< 100: - raise TooManySymlinks('too many levels of symlinks: %r' - % self.fullname()) - _symrefs += 1 - try: - try: - return self.parent.lresolve(self.readlink(), - stay_inside_fs=True) - except NoSuchFile: - raise NoSuchFile("%s: broken symlink to %r" - % (self.fullname(), self.readlink())) - finally: - _symrefs -= 1 - - def _lresolve(self, parts): - return self.dereference()._lresolve(parts) - - -class FakeSymlink(Symlink): - """A symlink that is not stored in the bup repository.""" - def __init__(self, parent, name, toname): - Symlink.__init__(self, parent, name, EMPTY_SHA, git.BUP_NORMAL) - self.toname = toname - - def readlink(self): - """Get the path that this link points at.""" - return self.toname - - -class Dir(Node): - """A directory stored inside of bup's repository.""" - def _mksubs(self): - self._subs = {} - it = cp().get(self.hash.encode('hex')) - type = it.next() - if type == 'commit': - del it - it = cp().get(self.hash.encode('hex') + ':') - type = it.next() - assert(type == 'tree') - for (mode,mangled_name,sha) in git.tree_decode(''.join(it)): - name = mangled_name - (name,bupmode) = git.demangle_name(mangled_name) - if bupmode == git.BUP_CHUNKED: - mode = GIT_MODE_FILE - if stat.S_ISDIR(mode): - self._subs[name] = Dir(self, name, mode, sha) - elif stat.S_ISLNK(mode): - self._subs[name] = Symlink(self, name, sha, bupmode) - else: - self._subs[name] = File(self, name, mode, sha, bupmode) - - -class CommitDir(Node): - """A directory that contains all commits that are reachable by a ref. - - Contains a set of subdirectories named after the commits' first byte in - hexadecimal. Each of those directories contain all commits with hashes that - start the same as the directory name. The name used for those - subdirectories is the hash of the commit without the first byte. This - separation helps us avoid having too much directories on the same level as - the number of commits grows big. - """ - def __init__(self, parent, name): - Node.__init__(self, parent, name, GIT_MODE_TREE, EMPTY_SHA) - - def _mksubs(self): - self._subs = {} - refs = git.list_refs() - for ref in refs: - #debug2('ref name: %s\n' % ref[0]) - revs = git.rev_list(ref[1].encode('hex')) - for (date, commit) in revs: - #debug2('commit: %s date: %s\n' % (commit.encode('hex'), date)) - commithex = commit.encode('hex') - containername = commithex[:2] - dirname = commithex[2:] - n1 = self._subs.get(containername) - if not n1: - n1 = CommitList(self, containername) - self._subs[containername] = n1 - - if n1.commits.get(dirname): - # Stop work for this ref, the rest should already be present - break - - n1.commits[dirname] = (commit, date) - - -class CommitList(Node): - """A list of commits with hashes that start with the current node's name.""" - def __init__(self, parent, name): - Node.__init__(self, parent, name, GIT_MODE_TREE, EMPTY_SHA) - self.commits = {} - - def _mksubs(self): - self._subs = {} - for (name, (hash, date)) in self.commits.items(): - n1 = Dir(self, name, GIT_MODE_TREE, hash) - n1.ctime = n1.mtime = date - self._subs[name] = n1 - - -class TagDir(Node): - """A directory that contains all tags in the repository.""" - def __init__(self, parent, name): - Node.__init__(self, parent, name, GIT_MODE_TREE, EMPTY_SHA) - - def _mksubs(self): - self._subs = {} - for (name, sha) in git.list_refs(): - if name.startswith('refs/tags/'): - name = name[10:] - date = git.rev_get_date(sha.encode('hex')) - commithex = sha.encode('hex') - target = '../.commit/%s/%s' % (commithex[:2], commithex[2:]) - tag1 = FakeSymlink(self, name, target) - tag1.ctime = tag1.mtime = date - self._subs[name] = tag1 - - -class BranchList(Node): - """A list of links to commits reachable by a branch in bup's repository. - - Represents each commit as a symlink that points to the commit directory in - /.commit/??/ . The symlink is named after the commit date. + parent_name, parent_item = past[-1] + wanted = (segment,) if not want_meta else ('.', segment) + items = tuple(contents(repo, parent_item, names=wanted, + want_meta=want_meta)) + if not want_meta: + item = items[0][1] if items else None + else: # First item will be '.' and have the metadata + item = items[1][1] if len(items) == 2 else None + dot, dot_item = items[0] + assert dot == '.' + past[-1] = parent_name, parent_item + if not item: + past.append((segment, None),) + return notice_resolution(tuple(past)) + mode = item_mode(item) + if not S_ISLNK(mode): + if not S_ISDIR(mode): + past.append((segment, item),) + if future: + raise IOError(ENOTDIR, + 'path %r%s ends internally in non-directory here: %r' + % (path, + ' (relative to %r)' % parent if parent else '', + past), + terminus=past) + if must_be_dir: + raise_dir_required_but_not_dir(path, parent, past) + return notice_resolution(tuple(past)) + # It's treeish + if want_meta and type(item) in real_tree_types: + dir_meta = _find_treeish_oid_metadata(repo, item.oid) + if dir_meta: + item = item._replace(meta=dir_meta) + past.append((segment, item)) + else: # symlink + if not future and not deref: + past.append((segment, item),) + continue + if hops > 100: + raise IOError(ELOOP, + 'too many symlinks encountered while resolving %r%s' + % (path, ' relative to %r' % parent if parent else ''), + terminus=tuple(past + [(segment, item)])) + target = readlink(repo, item) + is_absolute, _, target_future = _decompose_path(target) + if is_absolute: + if not target_future: # path was effectively '/' + return notice_resolution((('', _root),)) + past = [('', _root)] + future = target_future + else: + future.extend(target_future) + hops += 1 + +def lresolve(repo, path, parent=None, want_meta=True): + """Perform exactly the same function as resolve(), except if the final + path element is a symbolic link, don't follow it, just return it + in the result. + """ - def __init__(self, parent, name, hash): - Node.__init__(self, parent, name, GIT_MODE_TREE, hash) - - def _mksubs(self): - self._subs = {} - - tags = git.tags() - - revs = list(git.rev_list(self.hash.encode('hex'))) - for (date, commit) in revs: - l = time.localtime(date) - ls = time.strftime('%Y-%m-%d-%H%M%S', l) - commithex = commit.encode('hex') - target = '../.commit/%s/%s' % (commithex[:2], commithex[2:]) - n1 = FakeSymlink(self, ls, target) - n1.ctime = n1.mtime = date - self._subs[ls] = n1 - - for tag in tags.get(commit, []): - t1 = FakeSymlink(self, tag, target) - t1.ctime = t1.mtime = date - self._subs[tag] = t1 - - latest = max(revs) - if latest: - (date, commit) = latest - commithex = commit.encode('hex') - target = '../.commit/%s/%s' % (commithex[:2], commithex[2:]) - n1 = FakeSymlink(self, 'latest', target) - n1.ctime = n1.mtime = date - self._subs['latest'] = n1 - - -class RefList(Node): - """A list of branches in bup's repository. - - The sub-nodes of the ref list are a series of CommitList for each commit - hash pointed to by a branch. - - Also, a special sub-node named '.commit' contains all commit directories - that are reachable via a ref (e.g. a branch). See CommitDir for details. + return _resolve_path(repo, path, parent=parent, want_meta=want_meta, + deref=False) + +def resolve(repo, path, parent=None, want_meta=True): + """Follow the path in the virtual filesystem and return a tuple + representing the location, if any, denoted by the path. Each + element in the result tuple will be (name, info), where info will + be a VFS item that can be passed to functions like item_mode(). + + If a path segment that does not exist is encountered during + resolution, the result will represent the location of the missing + item, and that item in the result will be None. + + Any attempt to traverse a non-directory will raise a VFS ENOTDIR + IOError exception. + + Any symlinks along the path, including at the end, will be + resolved. A VFS IOError with the errno attribute set to ELOOP + will be raised if too many symlinks are traversed while following + the path. That exception is effectively like a normal + ELOOP IOError exception, but will include a terminus element + describing the location of the failure, which will be a tuple of + (name, info) elements. + + The parent, if specified, must be a sequence of (name, item) + tuples, and will provide the starting point for the resolution of + the path. If no parent is specified, resolution will start at + '/'. + + The result may include elements of parent directly, so they must + not be modified later. If this is a concern, pass in "name, + copy_item(item) for name, item in parent" instead. + + When want_meta is true, detailed metadata will be included in each + result item if it's avaiable, otherwise item.meta will be an + integer mode. The metadata size may or may not be provided, but + can be computed by item_size() or augment_item_meta(..., + include_size=True). Setting want_meta=False is rarely desirable + since it can limit the VFS to just the metadata git itself can + represent, and so, as an example, fifos and sockets will appear to + be regular files (e.g. S_ISREG(item_mode(item)) will be true) . + But the option is provided because it may be more efficient when + only the path names or the more limited metadata is sufficient. + + Do not modify any item.meta Metadata instances directly. If + needed, make a copy via item.meta.copy() and modify that instead. + """ - def __init__(self, parent): - Node.__init__(self, parent, '/', GIT_MODE_TREE, EMPTY_SHA) + result = _resolve_path(repo, path, parent=parent, want_meta=want_meta, + deref=True) + _, leaf_item = result[-1] + if leaf_item: + assert not S_ISLNK(item_mode(leaf_item)) + return result + +def try_resolve(repo, path, parent=None, want_meta=True): + """If path does not refer to a symlink, does not exist, or refers to a + valid symlink, behave exactly like resolve(). If path refers to + an invalid symlink, behave like lresolve. - def _mksubs(self): - self._subs = {} + """ + res = lresolve(repo, path, parent=parent, want_meta=want_meta) + leaf_name, leaf_item = res[-1] + if not leaf_item: + return res + if not S_ISLNK(item_mode(leaf_item)): + return res + deref = resolve(repo, leaf_name, parent=res[:-1], want_meta=want_meta) + deref_name, deref_item = deref[-1] + if deref_item: + return deref + return res + +def augment_item_meta(repo, item, include_size=False): + """Ensure item has a Metadata instance for item.meta. If item.meta is + currently a mode, replace it with a compatible "fake" Metadata + instance. If include_size is true, ensure item.meta.size is + correct, computing it if needed. If item.meta is a Metadata + instance, this call may modify it in place or replace it. - commit_dir = CommitDir(self, '.commit') - self._subs['.commit'] = commit_dir + """ + # If we actually had parallelism, we'd need locking... + assert repo + m = item.meta + if isinstance(m, Metadata): + if include_size and m.size is None: + m.size = _compute_item_size(repo, item) + return item._replace(meta=m) + return item + # m is mode + meta = Metadata() + meta.mode = m + meta.uid = meta.gid = meta.atime = meta.mtime = meta.ctime = 0 + if S_ISLNK(m): + target = _readlink(repo, item.oid) + meta.symlink_target = target + meta.size = len(target) + elif include_size: + meta.size = _compute_item_size(repo, item) + return item._replace(meta=meta) + +def fill_in_metadata_if_dir(repo, item): + """If item is a directory and item.meta is not a Metadata instance, + attempt to find the metadata for the directory. If found, return + a new item augmented to include that metadata. Otherwise, return + item. May be useful for the output of contents(). - tag_dir = TagDir(self, '.tag') - self._subs['.tag'] = tag_dir + """ + if S_ISDIR(item_mode(item)) and not isinstance(item.meta, Metadata): + items = tuple(contents(repo, item, ('.',), want_meta=True)) + assert len(items) == 1 + assert items[0][0] == '.' + item = items[0][1] + return item + +def ensure_item_has_metadata(repo, item, include_size=False): + """If item is a directory, attempt to find and add its metadata. If + the item still doesn't have a Metadata instance for item.meta, + give it one via augment_item_meta(). May be useful for the output + of contents(). - for (name,sha) in git.list_refs(): - if name.startswith('refs/heads/'): - name = name[11:] - date = git.rev_get_date(sha.encode('hex')) - n1 = BranchList(self, name, sha) - n1.ctime = n1.mtime = date - self._subs[name] = n1 + """ + return augment_item_meta(repo, + fill_in_metadata_if_dir(repo, item), + include_size=include_size)