1 """Virtual File System interface to bup repository content.
3 This module provides a path-based interface to the content of a bup
6 The VFS is structured like this:
9 /SAVE-NAME/SAVE-DATE/...
12 Each path is represented by an item that has at least an item.meta which
13 may be either a Metadata object, or an integer mode. Functions like
14 item_mode() and item_size() will return the mode and size in either
15 case. Any item.meta Metadata instances must not be modified directly.
16 Make a copy to modify via item.meta.copy() if needed.
18 The want_meta argument is advisory for calls that accept it, and it
19 may not be honored. Callers must be able to handle an item.meta value
20 that is either an instance of Metadata or an integer mode, perhaps
21 via item_mode() or augment_item_meta().
23 Setting want_meta=False is rarely desirable since it can limit the VFS
24 to only the metadata that git itself can represent, and so for
25 example, fifos and sockets will appear to be regular files
26 (e.g. S_ISREG(item_mode(item)) will be true). But the option is still
27 provided because it may be more efficient when just the path names or
28 the more limited metadata is sufficient.
30 Any given metadata object's size may be None, in which case the size
31 can be computed via item_size() or augment_item_meta(...,
34 When traversing a directory using functions like contents(), the meta
35 value for any directories other than '.' will be a default directory
36 mode, not a Metadata object. This is because the actual metadata for
37 a directory is stored inside the directory.
39 Commit items represent commits (e.g. /.tag/some-commit or
40 /foo/latest), and for most purposes, they appear as the underlying
41 tree. S_ISDIR(item_mode(item)) will return true for both tree Items
42 and Commits and the commit's oid is the tree hash. The commit hash
43 will be item.coid, and nominal_oid(item) will return coid for commits,
44 oid for everything else.
48 from __future__ import print_function
49 from collections import namedtuple
50 from errno import ELOOP, ENOENT, ENOTDIR
51 from itertools import chain, dropwhile, groupby, izip, tee
52 from stat import S_IFDIR, S_IFLNK, S_IFREG, S_ISDIR, S_ISLNK, S_ISREG
53 from time import localtime, strftime
54 import exceptions, re, sys
56 from bup import client, git, metadata
57 from bup.git import BUP_CHUNKED, cp, get_commit_items, parse_commit, tree_decode
58 from bup.helpers import debug2, last
59 from bup.metadata import Metadata
60 from bup.repo import LocalRepo, RemoteRepo
# VFS-specific IOError: same shape as the builtin, but subclassed so VFS
# callers can distinguish VFS failures.  Uses the Python 2 'exceptions'
# module, so this file is Python 2 only.
63 class IOError(exceptions.IOError):
64 def __init__(self, errno, message):
65 exceptions.IOError.__init__(self, errno, message)
# NOTE(review): the enclosing "class Loop(IOError):" header is missing from
# this extract; this __init__ belongs to that symlink-loop exception.
# terminus, when provided, is the (name, item) path tuple describing where
# resolution failed -- TODO confirm against the full source.
68 def __init__(self, message, terminus=None):
69 IOError.__init__(self, ELOOP, message)
70 self.terminus = terminus
# Fallback st_mode values used when an item has no Metadata instance,
# i.e. when item.meta is just an integer mode.
72 default_file_mode = S_IFREG | 0o644
73 default_dir_mode = S_IFDIR | 0o755
74 default_symlink_mode = S_IFLNK | 0o755
# Map a git tree-entry mode to one of the default_* fallback modes above.
76 def _default_mode_for_gitmode(gitmode):
# NOTE(review): the conditional lines selecting among these returns are
# missing from this extract -- presumably S_ISREG/S_ISDIR/S_ISLNK tests on
# gitmode; confirm against the full source.
78 return default_file_mode
80 return default_dir_mode
82 return default_symlink_mode
83 raise Exception('unexpected git mode ' + oct(gitmode))
85 def _normal_or_chunked_file_size(repo, oid):
86 """Return the size of the normal or chunked file indicated by oid."""
87 # FIXME: --batch-format CatPipe?
# oid.encode('hex') is Python 2 only (bytes -> hex string).
88 it = repo.cat(oid.encode('hex'))
89 _, obj_t, size = next(it)
# For chunked files, descend the tree by its last entry until a blob is
# reached, accumulating the offsets of the preceding chunks.
91 while obj_t == 'tree':
92 mode, name, last_oid = last(tree_decode(''.join(it)))
94 it = repo.cat(last_oid.encode('hex'))
95 _, obj_t, size = next(it)
# NOTE(review): the lines initializing and accumulating 'ofs' are missing
# from this extract; as shown, 'ofs' would be unbound here.
96 return ofs + sum(len(b) for b in it)
# Yield the data for the chunks of a chunked file starting at startofs,
# recursing into subtrees as needed.
98 def _tree_chunks(repo, tree, startofs):
99 "Tree should be a sequence of (name, mode, hash) as per tree_decode()."
100 assert(startofs >= 0)
101 # name is the chunk's hex offset in the original file
# Python 2 only: tuple-parameter unpacking in a lambda is a syntax error
# in Python 3.
102 tree = dropwhile(lambda (_1, name, _2): int(name, 16) < startofs, tree)
103 for mode, name, oid in tree:
# NOTE(review): several lines are missing from this extract (the 'ofs'
# computation, reading 'data' from the cat iterator, and the recursive
# 'yield b'); do not rely on the control flow as shown.
105 skipmore = startofs - ofs
108 it = repo.cat(oid.encode('hex'))
109 _, obj_t, size = next(it)
112 assert obj_t == 'tree'
113 for b in _tree_chunks(repo, tree_decode(data), skipmore):
116 assert obj_t == 'blob'
117 yield data[skipmore:]
# NOTE(review): the enclosing "class _ChunkReader" header is missing from
# this extract.  Sets up either a chunk iterator (tree) or a direct blob
# slice, starting at startofs.
120 def __init__(self, repo, oid, startofs):
121 it = repo.cat(oid.encode('hex'))
122 _, obj_t, size = next(it)
123 isdir = obj_t == 'tree'
# NOTE(review): the lines reading 'data' and the isdir branch structure are
# missing from this extract.
126 self.it = _tree_chunks(repo, tree_decode(data), startofs)
130 self.blob = data[startofs:]
# Return up to 'size' bytes, pulling further chunks from self.it as the
# current buffered blob is exhausted.  Python 2 iterator protocol
# (self.it.next()).
133 def next(self, size):
# NOTE(review): the initialization of 'out', the 'try' line paired with the
# 'except' below, and the final return are missing from this extract.
135 while len(out) < size:
136 if self.it and not self.blob:
138 self.blob = self.it.next()
139 except StopIteration:
142 want = size - len(out)
143 out += self.blob[:want]
144 self.blob = self.blob[want:]
147 debug2('next(%d) returned %d\n' % (size, len(out)))
# Seekable, lazily-sized file reader over a (possibly chunked) blob; also a
# context manager (see __exit__ below).  Heavily elided in this extract --
# seek(), tell(), close(), and __enter__ bodies are missing.
151 class _FileReader(object):
152 def __init__(self, repo, oid, known_size=None):
157 self._size = known_size
# Compute (and cache) the size on first use instead of up front.
159 def _compute_size(self):
161 self._size = _normal_or_chunked_file_size(self._repo, self.oid)
# NOTE(review): these raises reference errno.EINVAL, but the visible header
# only does "from errno import ELOOP, ENOENT, ENOTDIR" -- unless an
# "import errno" was elided, this is a NameError at runtime; confirm.
166 raise IOError(errno.EINVAL, 'Invalid argument')
167 if ofs > self._compute_size():
168 raise IOError(errno.EINVAL, 'Invalid argument')
174 def read(self, count=-1):
# count < 0 means "read to EOF" (line assigning that default is elided).
176 count = self._compute_size() - self.ofs
# Reuse the chunk reader only if it is still positioned at our offset.
177 if not self.reader or self.reader.ofs != self.ofs:
178 self.reader = _ChunkReader(self._repo, self.oid, self.ofs)
180 buf = self.reader.next(count)
183 raise # our offsets will be all screwed up otherwise
192 def __exit__(self, type, value, traceback):
# Collapse any run of slashes to a single slash.
196 _multiple_slashes_rx = re.compile(r'//+')
198 def _decompose_path(path):
199 """Return a reversed list of path elements, omitting any occurrences
200 of "." and ignoring any leading or trailing slash."""
201 path = re.sub(_multiple_slashes_rx, '/', path)
# NOTE(review): the bodies of both conditionals (stripping the leading /
# trailing slash) and the final reverse-and-return are missing from this
# extract.
202 if path.startswith('/'):
204 if path.endswith('/'):
206 result = [x for x in path.split('/') if x != '.']
# The VFS item types.  Every type has a 'meta' field, which is either a
# Metadata instance or an integer st_mode; see the module docstring.
Item = namedtuple('Item', ('meta', 'oid'))
Chunky = namedtuple('Chunky', ('meta', 'oid'))
# A single-element field spec must be written ('meta',) -- a bare ('meta')
# is just a parenthesized string (namedtuple happens to accept a string as
# a space/comma separated spec, but the explicit tuple matches the other
# declarations and makes the intent unambiguous).
Root = namedtuple('Root', ('meta',))
Tags = namedtuple('Tags', ('meta',))
RevList = namedtuple('RevList', ('meta', 'oid'))
Commit = namedtuple('Commit', ('meta', 'oid', 'coid'))

# All VFS item types, and the subset that behaves as a real git tree.
item_types = frozenset((Item, Chunky, Root, Tags, RevList, Commit))
real_tree_types = frozenset((Item, Commit))
# Singleton items for the VFS root ('/') and the synthetic /.tag directory.
221 _root = Root(meta=default_dir_mode)
222 _tags = Tags(meta=default_dir_mode)
225 def nominal_oid(item):
226 """If the item is a Commit, return its commit oid, otherwise return
227 the item's oid, if it has one.
# NOTE(review): the docstring terminator and the "return item.coid" line in
# the Commit branch are missing from this extract.
230 if isinstance(item, Commit):
232 return getattr(item, 'oid', None)
# NOTE(review): the "def copy_item(item):" line is missing from this
# extract, as are the docstring terminator and the non-Metadata fallback
# (presumably returning the item unchanged when meta is just a mode).
235 """Return a completely independent copy of item, such that
236 modifications will not affect the original.
239 meta = getattr(item, 'meta', None)
242 return(item._replace(meta=meta.copy()))
# NOTE(review): the "def item_mode(item):" line, the assignment of 'm'
# (presumably m = item.meta), and both return statements are missing from
# this extract.
245 """Return the integer mode (stat st_mode) for item."""
247 if isinstance(m, Metadata):
# Read one directory Metadata entry from an open .bupm stream, falling back
# to default_dir_mode for the empty placeholder entries save writes.
251 def _read_dir_meta(bupm):
252 # This is because save writes unmodified Metadata() entries for
253 # fake parents -- test-save-strip-graft.sh demonstrates.
254 m = Metadata.read(bupm)
# NOTE(review): the guard before this return (presumably "if not m:") and
# the final "return m" are missing from this extract.
256 return default_dir_mode
257 assert m.mode is not None
262 def tree_data_and_bupm(repo, oid):
263 """Return (tree_bytes, bupm_oid) where bupm_oid will be None if the
264 tree has no metadata (i.e. older bup save, or non-bup tree).
# oid is a 20-byte binary hash; encode('hex') is Python 2 only.
267 assert len(oid) == 20
268 it = repo.cat(oid.encode('hex'))
269 _, item_t, size = next(it)
# A commit is transparently dereferenced to its tree.
271 if item_t == 'commit':
272 commit = parse_commit(data)
273 it = repo.cat(commit.tree)
274 _, item_t, size = next(it)
276 assert item_t == 'tree'
277 elif item_t != 'tree':
278 raise Exception('%r is not a tree or commit' % oid.encode('hex'))
# Scan entries for '.bupm'; entries are in git tree order, so we can stop
# once we pass where it would sort.  NOTE(review): the 'data' reads, the
# return-with-sub_oid, and the final return-None lines are missing from
# this extract.
279 for _, mangled_name, sub_oid in tree_decode(data):
280 if mangled_name == '.bupm':
282 if mangled_name > '.bupm':
286 def _find_dir_item_metadata(repo, item):
287 """Return the metadata for the tree or commit item, or None if the
288 tree has no metadata (i.e. older bup save, or non-bup tree).
291 tree_data, bupm_oid = tree_data_and_bupm(repo, item.oid)
# NOTE(review): the guard handling bupm_oid == None (returning None) is
# missing from this extract.
293 with _FileReader(repo, bupm_oid) as meta_stream:
294 return _read_dir_meta(meta_stream)
# Read a symlink target blob from the repository and return it as a single
# string.  Python 2 only (str.encode('hex')).
297 def _readlink(repo, oid):
298 return ''.join(repo.join(oid.encode('hex')))
300 def readlink(repo, item):
301 """Return the link target of item, which must be a symlink. Reads the
302 target from the repository if necessary."""
304 assert S_ISLNK(item_mode(item))
# Prefer the target already recorded in the item's Metadata; fall back to
# reading the blob.  NOTE(review): the "if target: return target" lines
# between these two branches are missing from this extract.
305 if isinstance(item.meta, Metadata):
306 target = item.meta.symlink_target
309 return _readlink(repo, item.oid)
# Compute an item's size from the repository based on its mode (regular
# file vs symlink).
311 def _compute_item_size(repo, item):
312 mode = item_mode(item)
# NOTE(review): the S_ISREG/S_ISLNK conditionals, the "return size" after
# the file-size computation, and the unexpected-mode fallback are missing
# from this extract.
314 size = _normal_or_chunked_file_size(repo, item.oid)
317 return len(_readlink(repo, item.oid))
320 def item_size(repo, item):
321 """Return the size of item, computing it if necessary."""
# NOTE(review): the "m = item.meta" assignment and the "return m.size"
# for the cached case are missing from this extract.
323 if isinstance(m, Metadata) and m.size is not None:
325 return _compute_item_size(repo, item)
327 def fopen(repo, item):
328 """Return an open reader for the given file item."""
# Only regular files can be opened; item must be an Item/Chunky for a blob.
330 assert S_ISREG(item_mode(item))
331 return _FileReader(repo, item.oid)
333 def augment_item_meta(repo, item, include_size=False):
334 """Ensure item has a Metadata instance for item.meta. If item.meta is
335 currently a mode, replace it with a compatible "fake" Metadata
336 instance. If include_size is true, ensure item.meta.size is
337 correct, computing it if needed. If item.meta is a Metadata
338 instance, this call may modify it in place or replace it.
341 # If we actually had parallelism, we'd need locking...
# NOTE(review): the "m = item.meta" assignment is missing from this extract.
344 if isinstance(m, Metadata):
345 if include_size and m.size is None:
346 m.size = _compute_item_size(repo, item)
347 return item._replace(meta=m)
# Fake Metadata from a bare integer mode.  NOTE(review): the construction
# of 'meta' and the assignment of meta.mode, plus the S_ISLNK/S_ISREG
# conditionals around the branches below, are missing from this extract.
352 meta.uid = meta.gid = meta.atime = meta.mtime = meta.ctime = 0
354 target = _readlink(repo, item.oid)
355 meta.symlink_target = target
356 meta.size = len(target)
358 meta.size = _compute_item_size(repo, item)
359 return item._replace(meta=meta)
# Build a fake directory Metadata whose timestamps all come from the commit
# author time (stored in nanoseconds, hence * 10**9).
361 def _commit_meta_from_auth_sec(author_sec):
# NOTE(review): the "m = Metadata()" construction and the final "return m"
# are missing from this extract.
363 m.mode = default_dir_mode
364 m.uid = m.gid = m.size = 0
365 m.atime = m.mtime = m.ctime = author_sec * 10**9
# Fetch the commit named by hex oidx and derive fake Metadata from its
# author timestamp.
368 def _commit_meta_from_oidx(repo, oidx):
# NOTE(review): the "it = repo.cat(oidx)" line is missing from this extract.
370 _, typ, size = next(it)
371 assert typ == 'commit'
372 author_sec = parse_commit(''.join(it)).author_sec
373 return _commit_meta_from_auth_sec(author_sec)
def parse_rev_auth_secs(f):
    """Parse one 'TREE_HEX AUTHOR_SEC' line from f (as produced by
    rev_list with format='%T %at') and return (tree_hex, author_sec)
    with author_sec converted to an int."""
    line = f.readline()
    tree_oidx, secs = line.split(None, 2)
    return tree_oidx, int(secs)
379 def root_items(repo, names=None):
380 """Yield (name, item) for the items in '/' in the VFS. Return
381 everything if names is logically false, otherwise return only
382 items with a name in the collection.
385 # FIXME: what about non-leaf refs like 'refs/heads/foo/bar/baz'?
# NOTE(review): the branch yielding '.', '.tag', and the names-is-false
# path structure is heavily elided in this extract.
391 # FIXME: maybe eventually support repo.clone() or something
392 # and pass in two repos, so we can drop the tuple() and stream
393 # in parallel (i.e. meta vs refs).
394 for name, oid in tuple(repo.refs([], limit_to_heads=True)):
395 assert(name.startswith('refs/heads/'))
# NOTE(review): the line stripping the 'refs/heads/' prefix from name is
# missing from this extract.
397 m = _commit_meta_from_oidx(repo, oid.encode('hex'))
398 yield name, RevList(meta=m, oid=oid)
# Explicit-names path: look each ref up individually.  NOTE(review): the
# loop header over names, the repo.cat call, and the not-found handling are
# missing from this extract.
406 if ref in ('.', '.tag'):
409 oidx, typ, size = next(it)
413 assert typ == 'commit'
414 commit = parse_commit(''.join(it))
415 yield ref, RevList(meta=_commit_meta_from_auth_sec(commit.author_sec),
416 oid=oidx.decode('hex'))
418 def ordered_tree_entries(tree_data, bupm=None):
419 """Yields (name, mangled_name, kind, gitmode, oid) for each item in
420 tree, sorted by name.
423 # Sadly, the .bupm entries currently aren't in git tree order,
424 # i.e. they don't account for the fact that git sorts trees
425 # (including our chunked trees) as if their names ended with "/",
426 # so "fo" sorts after "fo." iff fo is a directory. This makes
427 # streaming impossible when we need the metadata.
# Demangle each raw tree entry into the tuple shape promised above.
428 def result_from_tree_entry(tree_entry):
429 gitmode, mangled_name, oid = tree_entry
430 name, kind = git.demangle_name(mangled_name, gitmode)
431 return name, mangled_name, kind, gitmode, oid
433 tree_ents = (result_from_tree_entry(x) for x in tree_decode(tree_data))
# Only materialize and sort when bupm demands demangled-name order.
# NOTE(review): the "if bupm:" line guarding this sort and the final
# "yield ent" are missing from this extract.
435 tree_ents = sorted(tree_ents, key=lambda x: x[0])
436 for ent in tree_ents:
# Yield (name, item) for the entries of the tree at oid, starting with the
# '.' self-entry.  When bupm is provided, metadata is read from it in step
# with the (sorted) entries; when names is non-empty only those entries are
# yielded.  Heavily elided in this extract -- see NOTE(review) markers.
439 def tree_items(oid, tree_data, names=frozenset(), bupm=None):
441 def tree_item(ent_oid, kind, gitmode):
442 if kind == BUP_CHUNKED:
443 meta = Metadata.read(bupm) if bupm else default_file_mode
444 return Chunky(oid=ent_oid, meta=meta)
# NOTE(review): the S_ISDIR(gitmode) conditional above this branch is
# missing from this extract.
447 # No metadata here (accessible via '.' inside ent_oid).
448 return Item(meta=default_dir_mode, oid=ent_oid)
450 return Item(oid=ent_oid,
451 meta=(Metadata.read(bupm) if bupm \
452 else _default_mode_for_gitmode(gitmode)))
454 assert len(oid) == 20
# "All entries" path (names is empty).  NOTE(review): the "if not names:"
# guard is missing from this extract.
456 dot_meta = _read_dir_meta(bupm) if bupm else default_dir_mode
457 yield '.', Item(oid=oid, meta=dot_meta)
458 tree_entries = ordered_tree_entries(tree_data, bupm)
459 for name, mangled_name, kind, gitmode, ent_oid in tree_entries:
460 if mangled_name == '.bupm':
463 yield name, tree_item(ent_oid, kind, gitmode)
# Selected-names path.
466 # Assumes the tree is properly formed, i.e. there are no
467 # duplicates, and entries will be in git tree order.
468 if type(names) not in (frozenset, set):
469 names = frozenset(names)
470 remaining = len(names)
472 # Account for the bupm sort order issue (cf. ordered_tree_entries above)
473 last_name = max(names) if bupm else max(names) + '/'
# NOTE(review): the handling of '.' in names (including decrementing
# 'remaining') around these lines is elided in this extract.
476 dot_meta = _read_dir_meta(bupm) if bupm else default_dir_mode
477 yield '.', Item(oid=oid, meta=dot_meta)
482 tree_entries = ordered_tree_entries(tree_data, bupm)
483 for name, mangled_name, kind, gitmode, ent_oid in tree_entries:
484 if mangled_name == '.bupm':
# NOTE(review): the comparison against last_name and the 'continue' for
# unmatched names are missing from this extract.
487 if name not in names:
489 break # given bupm sort order, we're finished
# Skip this entry's metadata record so bupm stays in sync with the
# entries we do yield.  NOTE(review): the Metadata.read(bupm) skip call
# and the 'remaining' bookkeeping/early-exit are elided.
490 if (kind == BUP_CHUNKED or not S_ISDIR(gitmode)) and bupm:
493 yield name, tree_item(ent_oid, kind, gitmode)
# Like tree_items(), but first locates the tree's .bupm and opens it so the
# yielded items carry real Metadata.
498 def tree_items_with_meta(repo, oid, tree_data, names):
499 # For now, the .bupm order doesn't quite match git's, and we don't
500 # load the tree data incrementally anyway, so we just work in RAM
502 assert len(oid) == 20
# NOTE(review): the "bupm = None" initialization, the loop 'break's, and
# the final "yield item" are missing from this extract.
504 for _, mangled_name, sub_oid in tree_decode(tree_data):
505 if mangled_name == '.bupm':
506 bupm = _FileReader(repo, sub_oid)
508 if mangled_name > '.bupm':
510 for item in tree_items(oid, tree_data, names, bupm):
# Matches VFS save names: YYYY-MM-DD-HHMMSS with an optional -N suffix that
# disambiguates duplicates (cf. _reverse_suffix_duplicates below).
513 _save_name_rx = re.compile(r'^\d\d\d\d-\d\d-\d\d-\d{6}(-\d+)?$')
515 def _reverse_suffix_duplicates(strs):
516 """Yields the elements of strs, with any runs of duplicate values
517 suffixed with -N suffixes, where the zero padded integer N
518 decreases to 0 by 1 (e.g. 10, 09, ..., 00).
# NOTE(review): the docstring terminator and the ndup == 1 fast path
# (yielding the name unsuffixed) are missing from this extract.
521 for name, duplicates in groupby(strs):
522 ndup = len(tuple(duplicates))
# Zero-pad the counter to the width of the largest suffix.
526 ndig = len(str(ndup - 1))
527 fmt = '%s-' + '%0' + str(ndig) + 'd'
# xrange is Python 2 only.
528 for i in xrange(ndup - 1, -1, -1):
529 yield fmt % (name, i)
531 def _name_for_rev(rev):
532 commit, (tree_oidx, utc) = rev
533 assert len(commit) == 40
534 return strftime('%Y-%m-%d-%H%M%S', localtime(utc))
def _item_for_rev(rev):
    """Return a Commit item for rev, which must be a
    (commit_hex, (tree_hex, author_sec)) pair.  The item's oid is the
    binary tree hash and its coid the binary commit hash."""
    commit_oidx, (tree_oidx, _) = rev
    assert len(commit_oidx) == 40
    assert len(tree_oidx) == 40
    # str.decode('hex') is Python 2 only.
    return Commit(meta=default_dir_mode,
                  coid=commit_oidx.decode('hex'),
                  oid=tree_oidx.decode('hex'))
# Yield (name, item) for the saves under a branch (a RevList item),
# including '.' and 'latest'.  Heavily elided in this extract.
544 def revlist_items(repo, oid, names):
545 assert len(oid) == 20
546 oidx = oid.encode('hex')
# Restrict names to things that could actually appear here.
547 names = frozenset(name for name in (names or tuple()) \
548 if _save_name_rx.match(name) or name in ('.', 'latest'))
550 # Do this before we open the rev_list iterator so we're not nesting
551 if (not names) or ('.' in names):
552 yield '.', RevList(oid=oid, meta=_commit_meta_from_oidx(repo, oidx))
554 revs = repo.rev_list((oidx,), format='%T %at', parse=parse_rev_auth_secs)
555 rev_items, rev_names = tee(revs)
556 revs = None # Don't disturb the tees
557 rev_names = _reverse_suffix_duplicates(_name_for_rev(x) for x in rev_names)
558 rev_items = (_item_for_rev(x) for x in rev_items)
# "All saves" path.  NOTE(review): the "first_commit = None" initialization
# and the guard selecting this branch are missing from this extract.
562 for item in rev_items:
563 first_commit = first_commit or item
564 yield next(rev_names), item
565 yield 'latest', first_commit
# Selected-names path.
568 # Revs are in reverse chronological order by default
569 last_name = min(names)
570 for item in rev_items:
571 first_commit = first_commit or item
572 name = next(rev_names) # Might have -N dup suffix
# NOTE(review): the comparison against last_name and the yield for matched
# names are missing from this extract.
575 if not name in names:
579 # FIXME: need real short circuit...
# Drain both tees so the underlying rev_list subprocess finishes.
580 for _ in rev_items: pass
581 for _ in rev_names: pass
583 if 'latest' in names:
584 yield 'latest', first_commit
# Yield (name, item) for the repository's tags (the /.tag directory).
# Heavily elided in this extract.
586 def tags_items(repo, names):
# Inner helper turning a tag's oid into the right VFS item type.
# NOTE(review): the "def tag_item(oid):" line, the repo.cat call, and the
# typ-dispatch conditionals are missing from this extract.
590 assert len(oid) == 20
591 oidx = oid.encode('hex')
593 _, typ, size = next(it)
# Annotated/commit tags appear as RevLists (branch-like).
595 tree_oid = parse_commit(''.join(it)).tree.decode('hex')
596 assert len(tree_oid) == 20
597 # FIXME: more efficient/bulk?
598 return RevList(meta=_commit_meta_from_oidx(repo, oidx), oid=oid)
601 return Item(meta=default_file_mode, oid=oid)
603 return Item(meta=default_dir_mode, oid=oid)
604 raise Exception('unexpected tag type ' + typ + ' for tag ' + name)
# "All tags" path.  NOTE(review): the guard selecting this branch, the '.'
# yield, and the refs/tags/ prefix stripping are elided.
608 # We have to pull these all into ram because tag_item calls cat()
609 for name, oid in tuple(repo.refs(names, limit_to_tags=True)):
610 assert(name.startswith('refs/tags/'))
612 yield name, tag_item(oid)
# Selected-names path.
615 # Assumes no duplicate refs
616 if type(names) not in (frozenset, set):
617 names = frozenset(names)
618 remaining = len(names)
619 last_name = max(names)
# NOTE(review): the '.' handling and the last_name/remaining early-exit
# logic between these lines are missing from this extract.
626 for name, oid in repo.refs(names, limit_to_tags=True):
627 assert(name.startswith('refs/tags/'))
631 if name not in names:
633 yield name, tag_item(oid)
638 def contents(repo, item, names=None, want_meta=True):
639 """Yields information about the items contained in item. Yields
640 (name, item) for each name in names, if the name exists, in an
641 unspecified order. If there are no names, then yields (name,
642 item) for all items, including, a first item named '.'
643 representing the container itself.
645 Note that want_meta is advisory. For any given item, item.meta
646 might be a Metadata instance or a mode, and if the former,
647 meta.size might be None. Missing sizes can be computed via
648 item_size() or augment_item_meta(..., include_size=True).
650 Do not modify any item.meta Metadata instances directly. If
651 needed, make a copy via item.meta.copy() and modify that instead.
654 # Q: are we comfortable promising '.' first when no names?
657 assert S_ISDIR(item_mode(item))
# NOTE(review): the "item_t = type(item)" assignment is missing from this
# extract, as are the 'data'/'tree_data' reads and the want_meta guards
# that choose between the with-meta and plain generators below.
660 if item_t in real_tree_types:
661 it = repo.cat(item.oid.encode('hex'))
662 _, obj_type, size = next(it)
664 if obj_type == 'tree':
666 item_gen = tree_items_with_meta(repo, item.oid, data, names)
668 item_gen = tree_items(item.oid, data, names)
669 elif obj_type == 'commit':
671 item_gen = tree_items_with_meta(repo, item.oid, tree_data, names)
673 item_gen = tree_items(item.oid, tree_data, names)
676 raise Exception('unexpected git ' + obj_type)
677 elif item_t == RevList:
678 item_gen = revlist_items(repo, item.oid, names)
# NOTE(review): the Root/Tags type tests guarding these two branches and
# the final "for x in item_gen: yield x" are missing from this extract.
680 item_gen = root_items(repo, names)
682 item_gen = tags_items(repo, names)
684 raise Exception('unexpected VFS item ' + str(item))
# Core path resolution: walk 'path' from 'parent' (or the root), returning
# a tuple of (name, item) pairs; deref controls whether a trailing symlink
# is followed.  Heavily elided in this extract -- control flow as shown is
# incomplete; see NOTE(review) markers.
688 def _resolve_path(repo, path, parent=None, want_meta=True, deref=False):
# Sanity-check any caller-provided parent.  NOTE(review): the loop over
# parent's entries and the 'past' initialization are missing.
695 assert type(x[0]) in (bytes, str)
696 assert type(x[1]) in item_types
697 assert parent[0][1] == _root
698 future = _decompose_path(path)
699 if path.startswith('/'):
700 if future == ['']: # path was effectively '/'
701 return (('', _root),)
# Main walk loop.  NOTE(review): the enclosing "while True:"/hops counter
# and the '..' segment handling are largely elided below.
708 if not future: # e.g. if path was effectively '.'
712 segment = future.pop()
714 if len(past) > 1: # .. from / is /
717 parent_name, parent_item = past[-1]
# Ask for '.' too when we want metadata, since the directory's own meta
# lives in its '.' entry.
718 wanted = (segment,) if not want_meta else ('.', segment)
719 items = tuple(contents(repo, parent_item, names=wanted,
720 want_meta=want_meta))
722 item = items[0][1] if items else None
723 else: # First item will be '.' and have the metadata
724 item = items[1][1] if len(items) == 2 else None
725 dot, dot_item = items[0]
727 past[-1] = parent_name, parent_item
# Segment not found: record a None item and stop.
729 past.append((segment, None),)
731 mode = item_mode(item)
732 if not S_ISLNK(mode):
733 if not S_ISDIR(mode):
735 past.append((segment, item),)
# Directory case: optionally fetch its real metadata from '.bupm'.
738 if want_meta and type(item) in real_tree_types:
739 dir_meta = _find_dir_item_metadata(repo, item)
741 item = item._replace(meta=dir_meta)
743 past.append((segment, item),)
745 past.append((segment, item))
# Symlink case: either keep it (lresolve) or splice its target into the
# remaining path.
747 if not future and not deref:
748 past.append((segment, item),)
750 target = readlink(repo, item)
751 target_future = _decompose_path(target)
752 if target.startswith('/'):
753 future = target_future
755 if target_future == ['']: # path was effectively '/'
758 future.extend(target_future)
# NOTE(review): the hops-limit check guarding this raise and parts of the
# format arguments are missing from this extract.
761 raise Loop('too many symlinks encountered while resolving %r%s'
763 'relative to %r' % parent if parent else ''))
765 def lresolve(repo, path, parent=None, want_meta=True):
766 """Perform exactly the same function as resolve(), except if the
767 final path element is a symbolic link, don't follow it, just
768 return it in the result."""
# NOTE(review): the continuation line of this call (presumably
# "deref=False)") is missing from this extract.
769 return _resolve_path(repo, path, parent=parent, want_meta=want_meta,
772 def resolve(repo, path, parent=None, want_meta=True):
773 """Follow the path in the virtual filesystem and return a tuple
774 representing the location, if any, denoted by the path. Each
775 element in the result tuple will be (name, info), where info will
776 be a VFS item that can be passed to functions like item_mode().
778 If a path segment that does not exist is encountered during
779 resolution, the result will represent the location of the missing
780 item, and that item in the result will be None.
782 Any symlinks along the path, including at the end, will be
783 resolved. A Loop exception will be raised if too many symlinks
784 are traversed while following the path. That exception
786 is effectively like a normal ELOOP IOError exception, but will
787 include a terminus element describing the location of the failure,
788 which will be a tuple of (name, info) elements.
790 Currently, a path ending in '/' will still resolve if it exists,
791 even if not a directory. The parent, if specified, must be a
792 sequence of (name, item) tuples, and will provide the starting
793 point for the resolution of the path. The result may include
794 elements of parent directly, so they must not be modified later.
795 If this is a concern, pass in "name, copy_item(item) for
796 name, item in parent" instead.
798 When want_meta is true, detailed metadata will be included in each
799 result item if it's available, otherwise item.meta will be an
800 integer mode. The metadata size may or may not be provided, but
801 can be computed by item_size() or augment_item_meta(...,
802 include_size=True). Setting want_meta=False is rarely desirable
803 since it can limit the VFS to just the metadata git itself can
804 represent, and so, as an example, fifos and sockets will appear to
805 be regular files (e.g. S_ISREG(item_mode(item)) will be true) .
806 But the option is provided because it may be more efficient when
807 only the path names or the more limited metadata is sufficient.
809 Do not modify any item.meta Metadata instances directly. If
810 needed, make a copy via item.meta.copy() and modify that instead.
# NOTE(review): the continuation of this call (presumably "deref=True)")
# is missing from this extract, as is the final "return result".
813 result = _resolve_path(repo, path, parent=parent, want_meta=want_meta,
815 _, leaf_item = result[-1]
# NOTE(review): the "if leaf_item:" guard around this assert appears to be
# elided; with deref=True a resolved leaf must never be a symlink.
817 assert not S_ISLNK(item_mode(leaf_item))