1 """Virtual File System interface to bup repository content.
This module provides a path-based interface to the content of a bup repository.
6 The VFS is structured like this:
9 /SAVE-NAME/SAVE-DATE/...
Each path is represented by an item that has at least an item.meta which
13 may be either a Metadata object, or an integer mode. Functions like
14 item_mode() and item_size() will return the mode and size in either
15 case. Any item.meta Metadata instances must not be modified directly.
16 Make a copy to modify via item.meta.copy() if needed.
18 The want_meta argument is advisory for calls that accept it, and it
19 may not be honored. Callers must be able to handle an item.meta value
20 that is either an instance of Metadata or an integer mode, perhaps
21 via item_mode() or augment_item_meta().
23 Setting want_meta=False is rarely desirable since it can limit the VFS
24 to only the metadata that git itself can represent, and so for
25 example, fifos and sockets will appear to be regular files
26 (e.g. S_ISREG(item_mode(item)) will be true). But the option is still
27 provided because it may be more efficient when just the path names or
28 the more limited metadata is sufficient.
30 Any given metadata object's size may be None, in which case the size
31 can be computed via item_size() or augment_item_meta(...,
34 When traversing a directory using functions like contents(), the meta
35 value for any directories other than '.' will be a default directory
36 mode, not a Metadata object. This is because the actual metadata for
37 a directory is stored inside the directory (see
38 fill_in_metadata_if_dir() or ensure_item_has_metadata()).
40 Commit items represent commits (e.g. /.tag/some-commit or
41 /foo/latest), and for most purposes, they appear as the underlying
42 tree. S_ISDIR(item_mode(item)) will return true for both tree Items
43 and Commits and the commit's oid is the tree hash. The commit hash
44 will be item.coid, and nominal_oid(item) will return coid for commits,
45 oid for everything else.
from __future__ import print_function
from collections import namedtuple
from errno import ELOOP, ENOENT, ENOTDIR
from itertools import chain, dropwhile, groupby, izip, tee
from stat import S_IFDIR, S_IFLNK, S_IFREG, S_ISDIR, S_ISLNK, S_ISREG
from time import localtime, strftime
import errno
import exceptions, re, sys

from bup import client, git, metadata
from bup.git import BUP_CHUNKED, cp, get_commit_items, parse_commit, tree_decode
from bup.helpers import debug2, last
from bup.metadata import Metadata
from bup.repo import LocalRepo, RemoteRepo
class IOError(exceptions.IOError):
    """IOError that can also carry a terminus: the partial resolution
    (a tuple of (name, item) pairs) reached before the failure."""
    def __init__(self, errno, message, terminus=None):
        super(IOError, self).__init__(errno, message)
        self.terminus = terminus
# Fallback stat modes used when an item has no Metadata (e.g. plain git
# trees, or want_meta=False): regular file 0644, directory 0755, and --
# since git does not record one -- a nominal 0755 for symlinks.
default_file_mode = S_IFREG | 0o644
default_dir_mode = S_IFDIR | 0o755
default_symlink_mode = S_IFLNK | 0o755
def _default_mode_for_gitmode(gitmode):
    """Return the default stat mode for a git tree-entry mode: regular
    file, directory, or symlink.  Raises for any other git mode."""
    if S_ISREG(gitmode):
        return default_file_mode
    if S_ISDIR(gitmode):
        return default_dir_mode
    if S_ISLNK(gitmode):
        return default_symlink_mode
    raise Exception('unexpected git mode ' + oct(gitmode))
def _normal_or_chunked_file_size(repo, oid):
    """Return the size of the normal or chunked file indicated by oid."""
    # FIXME: --batch-format CatPipe?
    it = repo.cat(oid.encode('hex'))
    _, obj_t, size = next(it)
    ofs = 0
    while obj_t == 'tree':
        # Chunked file: each entry name is the chunk's hex offset in the
        # original file, so descend via the last entry until we reach a
        # blob, accumulating the final subtree's offset as we go.
        mode, name, last_oid = last(tree_decode(''.join(it)))
        ofs += int(name, 16)
        it = repo.cat(last_oid.encode('hex'))
        _, obj_t, size = next(it)
    return ofs + sum(len(b) for b in it)
def _tree_chunks(repo, tree, startofs):
    "Tree should be a sequence of (mode, name, hash) as per tree_decode()."
    assert startofs >= 0
    # name is the chunk's hex offset in the original file; skip any
    # entries that end before startofs.
    tree = dropwhile(lambda ent: int(ent[1], 16) < startofs, tree)
    for mode, name, oid in tree:
        ofs = int(name, 16)
        skipmore = startofs - ofs
        if skipmore < 0:
            skipmore = 0
        it = repo.cat(oid.encode('hex'))
        _, obj_t, size = next(it)
        data = ''.join(it)
        if S_ISDIR(mode):
            # Nested chunk subtree; recurse to honor the remaining skip.
            assert obj_t == 'tree'
            for b in _tree_chunks(repo, tree_decode(data), skipmore):
                yield b
        else:
            assert obj_t == 'blob'
            yield data[skipmore:]
117 def __init__(self, repo, oid, startofs):
118 it = repo.cat(oid.encode('hex'))
119 _, obj_t, size = next(it)
120 isdir = obj_t == 'tree'
123 self.it = _tree_chunks(repo, tree_decode(data), startofs)
127 self.blob = data[startofs:]
130 def next(self, size):
132 while len(out) < size:
133 if self.it and not self.blob:
135 self.blob = self.it.next()
136 except StopIteration:
139 want = size - len(out)
140 out += self.blob[:want]
141 self.blob = self.blob[want:]
144 debug2('next(%d) returned %d\n' % (size, len(out)))
148 class _FileReader(object):
149 def __init__(self, repo, oid, known_size=None):
154 self._size = known_size
156 def _compute_size(self):
158 self._size = _normal_or_chunked_file_size(self._repo, self.oid)
163 raise IOError(errno.EINVAL, 'Invalid argument')
164 if ofs > self._compute_size():
165 raise IOError(errno.EINVAL, 'Invalid argument')
171 def read(self, count=-1):
173 count = self._compute_size() - self.ofs
174 if not self.reader or self.reader.ofs != self.ofs:
175 self.reader = _ChunkReader(self._repo, self.oid, self.ofs)
177 buf = self.reader.next(count)
180 raise # our offsets will be all screwed up otherwise
189 def __exit__(self, type, value, traceback):
# Matches any run of two or more slashes, for collapsing to a single '/'.
_multiple_slashes_rx = re.compile(r'//+')
def _decompose_path(path):
    """Return a reversed list of path elements, omitting any occurrences
    of "." and ignoring any leading or trailing slash."""
    path = re.sub(_multiple_slashes_rx, '/', path)
    if path.startswith('/'):
        path = path[1:]
    if path.endswith('/'):
        path = path[:-1]
    result = [x for x in path.split('/') if x != '.']
    # Reversed so callers can consume segments cheaply via pop().
    result.reverse()
    return result
# A VFS item always carries at least a meta field (a Metadata instance
# or an integer stat mode); most kinds also carry the relevant git oid.
Item = namedtuple('Item', ('meta', 'oid'))
Chunky = namedtuple('Chunky', ('meta', 'oid'))
Root = namedtuple('Root', ('meta',))
Tags = namedtuple('Tags', ('meta',))
RevList = namedtuple('RevList', ('meta', 'oid'))
# For a Commit, oid is the commit's *tree* hash; coid is the commit hash.
Commit = namedtuple('Commit', ('meta', 'oid', 'coid'))

item_types = frozenset((Item, Chunky, Root, Tags, RevList, Commit))
# Kinds whose oid refers directly to a git tree (or a commit's tree).
real_tree_types = frozenset((Item, Commit))

_root = Root(meta=default_dir_mode)
_tags = Tags(meta=default_dir_mode)

def nominal_oid(item):
    """If the item is a Commit, return its commit oid, otherwise return
    the item's oid, if it has one.

    """
    if isinstance(item, Commit):
        return item.coid
    return getattr(item, 'oid', None)

def copy_item(item):
    """Return a completely independent copy of item, such that
    modifications will not affect the original.

    """
    meta = getattr(item, 'meta', None)
    if isinstance(meta, Metadata):
        return item._replace(meta=meta.copy())
    return item

def item_mode(item):
    """Return the integer mode (stat st_mode) for item."""
    m = item.meta
    if isinstance(m, Metadata):
        return m.mode
    return m
def _read_dir_meta(bupm):
    """Read and return the next Metadata record from the .bupm stream
    bupm, substituting default_dir_mode for empty records."""
    # This is because save writes unmodified Metadata() entries for
    # fake parents -- test-save-strip-graft.sh demonstrates.
    m = Metadata.read(bupm)
    if not m or not m.mode:
        return default_dir_mode
    assert m.mode is not None
    return m
def tree_data_and_bupm(repo, oid):
    """Return (tree_bytes, bupm_oid) where bupm_oid will be None if the
    tree has no metadata (i.e. older bup save, or non-bup tree).

    """
    assert len(oid) == 20
    it = repo.cat(oid.encode('hex'))
    _, item_t, size = next(it)
    data = ''.join(it)
    if item_t == 'commit':
        # Chase the commit to its tree.
        commit = parse_commit(data)
        it = repo.cat(commit.tree)
        _, item_t, size = next(it)
        data = ''.join(it)
        assert item_t == 'tree'
    elif item_t != 'tree':
        raise Exception('%r is not a tree or commit' % oid.encode('hex'))
    for _, mangled_name, sub_oid in tree_decode(data):
        if mangled_name == '.bupm':
            return data, sub_oid
        if mangled_name > '.bupm':
            # Entries are sorted, so no .bupm exists in this tree.
            break
    return data, None
def _find_treeish_oid_metadata(repo, oid):
    """Return the metadata for the tree or commit oid, or None if the tree
    has no metadata (i.e. older bup save, or non-bup tree).

    """
    tree_data, bupm_oid = tree_data_and_bupm(repo, oid)
    if bupm_oid:
        # The directory's own metadata is the first record in .bupm.
        with _FileReader(repo, bupm_oid) as meta_stream:
            return _read_dir_meta(meta_stream)
    return None
294 def _readlink(repo, oid):
295 return ''.join(repo.join(oid.encode('hex')))
def readlink(repo, item):
    """Return the link target of item, which must be a symlink.  Reads the
    target from the repository if necessary."""
    assert S_ISLNK(item_mode(item))
    if isinstance(item.meta, Metadata):
        # Use the cached target when the metadata already has it.
        target = item.meta.symlink_target
        if target:
            return target
    return _readlink(repo, item.oid)
def _compute_item_size(repo, item):
    """Compute and return the size of item from the repository; only
    regular files and symlinks have a well defined size."""
    mode = item_mode(item)
    if S_ISREG(mode):
        size = _normal_or_chunked_file_size(repo, item.oid)
        return size
    if S_ISLNK(mode):
        # A symlink's size is the length of its target.
        return len(_readlink(repo, item.oid))
    raise Exception('cannot compute size of item with mode ' + oct(mode))
def item_size(repo, item):
    """Return the size of item, computing it if necessary."""
    m = item.meta
    if isinstance(m, Metadata) and m.size is not None:
        return m.size
    return _compute_item_size(repo, item)
def fopen(repo, item):
    """Return an open reader for the given file item."""
    mode = item_mode(item)
    assert S_ISREG(mode)
    return _FileReader(repo, item.oid)
def _commit_item_from_data(oid, data):
    """Build a Commit item from the raw commit data for oid; the item's
    oid is the commit's tree hash, and coid is the commit hash."""
    info = parse_commit(data)
    return Commit(meta=default_dir_mode,
                  oid=info.tree.decode('hex'),
                  coid=oid)
def _commit_item_from_oid(repo, oid, require_meta):
    """Return the Commit item for commit oid, attaching the tree's .bupm
    metadata when require_meta is true (and any exists)."""
    it = repo.cat(oid.encode('hex'))
    _, typ, size = next(it)
    assert typ == 'commit'
    commit = _commit_item_from_data(oid, ''.join(it))
    if require_meta:
        # commit.oid is the commit's tree hash (Commit has no 'tree'
        # field), which is where any .bupm metadata lives.
        meta = _find_treeish_oid_metadata(repo, commit.oid)
        if meta:
            commit = commit._replace(meta=meta)
    return commit
def _revlist_item_from_oid(repo, oid, require_meta):
    """Return a RevList item for commit oid, with real tree metadata only
    when require_meta is true and any exists."""
    if require_meta:
        meta = _find_treeish_oid_metadata(repo, oid) or default_dir_mode
    else:
        meta = default_dir_mode
    return RevList(oid=oid, meta=meta)
def parse_rev_auth_secs(f):
    """Parse one '%T %at' rev-list line from f, returning the pair
    (tree_hex, author_seconds_as_int)."""
    line = f.readline()
    tree_oidx, secs = line.split(None, 2)
    return tree_oidx, int(secs)
def root_items(repo, names=None):
    """Yield (name, item) for the items in '/' in the VFS.  Return
    everything if names is logically false, otherwise return only
    items with a name in the collection.

    """
    # FIXME: what about non-leaf refs like 'refs/heads/foo/bar/baz?

    # NOTE(review): the branch header for the "no names requested" case
    # (and the yields of '.' and '.tag') are not visible in this view;
    # the indented block below appears to depend on them -- confirm.
        # FIXME: maybe eventually support repo.clone() or something
        # and pass in two repos, so we can drop the tuple() and stream
        # in parallel (i.e. meta vs refs).
        for name, oid in tuple(repo.refs([], limit_to_heads=True)):
            assert(name.startswith('refs/heads/'))
            # Strip the 'refs/heads/' prefix (11 chars) for the VFS name.
            yield name[11:], _revlist_item_from_oid(repo, oid, False)

    # NOTE(review): 'ref' and 'it' below are never assigned in this view;
    # presumably a 'for ref in names:' loop and a repo.cat() call are
    # elided -- confirm upstream before relying on this section.
        if ref in ('.', '.tag'):
        oidx, typ, size = next(it)
        assert typ == 'commit'
        commit = parse_commit(''.join(it))
        yield ref, _revlist_item_from_oid(repo, oidx.decode('hex'), False)
def ordered_tree_entries(tree_data, bupm=None):
    """Yields (name, mangled_name, kind, gitmode, oid) for each item in
    tree, sorted by name.

    """
    # Sadly, the .bupm entries currently aren't in git tree order,
    # i.e. they don't account for the fact that git sorts trees
    # (including our chunked trees) as if their names ended with "/",
    # so "fo" sorts after "fo." iff fo is a directory.  This makes
    # streaming impossible when we need the metadata.
    def result_from_tree_entry(tree_entry):
        gitmode, mangled_name, oid = tree_entry
        name, kind = git.demangle_name(mangled_name, gitmode)
        return name, mangled_name, kind, gitmode, oid

    tree_ents = (result_from_tree_entry(x) for x in tree_decode(tree_data))
    if bupm:
        # Sort by demangled name so we stay in sync with .bupm's order.
        tree_ents = sorted(tree_ents, key=lambda x: x[0])
    for ent in tree_ents:
        yield ent
def tree_items(oid, tree_data, names=frozenset(), bupm=None):
    """Yield (name, item) for the entries of the git tree with hash oid
    and content tree_data; bupm, when given, is a reader positioned on
    the tree's .bupm metadata stream.  NOTE(review): several branch
    headers (notably the split between the "no names" and "names"
    cases) are not visible in this view."""

    def tree_item(ent_oid, kind, gitmode):
        # Builds the VFS item for one entry; reading from bupm here
        # consumes that entry's metadata record as a side effect.
        if kind == BUP_CHUNKED:
            meta = Metadata.read(bupm) if bupm else default_file_mode
            return Chunky(oid=ent_oid, meta=meta)

            # No metadata here (accessible via '.' inside ent_oid).
            return Item(meta=default_dir_mode, oid=ent_oid)

        return Item(oid=ent_oid,
                    meta=(Metadata.read(bupm) if bupm \
                          else _default_mode_for_gitmode(gitmode)))

    assert len(oid) == 20
    dot_meta = _read_dir_meta(bupm) if bupm else default_dir_mode
    yield '.', Item(oid=oid, meta=dot_meta)
    tree_entries = ordered_tree_entries(tree_data, bupm)
    for name, mangled_name, kind, gitmode, ent_oid in tree_entries:
        # NOTE(review): the statement skipping the '.bupm' entry itself
        # (presumably 'continue') is not visible here.
        if mangled_name == '.bupm':
        yield name, tree_item(ent_oid, kind, gitmode)

    # Assumes the tree is properly formed, i.e. there are no
    # duplicates, and entries will be in git tree order.
    if type(names) not in (frozenset, set):
        names = frozenset(names)
    remaining = len(names)

    # Account for the bupm sort order issue (cf. ordered_tree_entries above)
    last_name = max(names) if bupm else max(names) + '/'

    dot_meta = _read_dir_meta(bupm) if bupm else default_dir_mode
    yield '.', Item(oid=oid, meta=dot_meta)
    tree_entries = ordered_tree_entries(tree_data, bupm)
    for name, mangled_name, kind, gitmode, ent_oid in tree_entries:
        # NOTE(review): the bodies of the '.bupm' skip and the
        # metadata-record skip below are not visible in this view.
        if mangled_name == '.bupm':
        if name not in names:
            break # given bupm sort order, we're finished
        if (kind == BUP_CHUNKED or not S_ISDIR(gitmode)) and bupm:
        yield name, tree_item(ent_oid, kind, gitmode)
def tree_items_with_meta(repo, oid, tree_data, names):
    """Yield (name, item) as per tree_items(), attaching a reader for the
    tree's .bupm stream when the tree has one."""
    # For now, the .bupm order doesn't quite match git's, and we don't
    # load the tree data incrementally anyway, so we just work in RAM
    # via tuple().
    assert len(oid) == 20
    bupm = None
    for _, mangled_name, sub_oid in tree_decode(tree_data):
        if mangled_name == '.bupm':
            bupm = _FileReader(repo, sub_oid)
            break
        if mangled_name > '.bupm':
            # Entries are sorted; no .bupm in this tree.
            break
    for item in tree_items(oid, tree_data, names, bupm):
        yield item
# Save names look like YYYY-MM-DD-HHMMSS, optionally with a -N
# de-duplication suffix (see _reverse_suffix_duplicates below).
_save_name_rx = re.compile(r'^\d\d\d\d-\d\d-\d\d-\d{6}(-\d+)?$')
491 def _reverse_suffix_duplicates(strs):
492 """Yields the elements of strs, with any runs of duplicate values
493 suffixed with -N suffixes, where the zero padded integer N
494 decreases to 0 by 1 (e.g. 10, 09, ..., 00).
497 for name, duplicates in groupby(strs):
498 ndup = len(tuple(duplicates))
502 ndig = len(str(ndup - 1))
503 fmt = '%s-' + '%0' + str(ndig) + 'd'
504 for i in xrange(ndup - 1, -1, -1):
505 yield fmt % (name, i)
507 def _name_for_rev(rev):
508 commit, (tree_oidx, utc) = rev
509 assert len(commit) == 40
510 return strftime('%Y-%m-%d-%H%M%S', localtime(utc))
def _item_for_rev(rev):
    """Return the Commit item for rev, a (commit_hex, (tree_hex,
    author_secs)) pair."""
    commit_oidx, (tree_oidx, _) = rev
    assert len(commit_oidx) == 40
    assert len(tree_oidx) == 40
    return Commit(meta=default_dir_mode,
                  oid=tree_oidx.decode('hex'),
                  coid=commit_oidx.decode('hex'))
def revlist_items(repo, oid, names):
    """Yield (name, item) for the saves reachable from commit oid,
    newest first; names, when logically true, restricts the result."""
    assert len(oid) == 20
    oidx = oid.encode('hex')
    # Only save-shaped names, '.', and 'latest' can ever match.
    names = frozenset(name for name in (names or tuple()) \
                      if _save_name_rx.match(name) or name in ('.', 'latest'))

    # Do this before we open the rev_list iterator so we're not nesting
    if (not names) or ('.' in names):
        yield '.', _revlist_item_from_oid(repo, oid, True)

    revs = repo.rev_list((oidx,), format='%T %at', parse=parse_rev_auth_secs)
    rev_items, rev_names = tee(revs)
    revs = None # Don't disturb the tees
    rev_names = _reverse_suffix_duplicates(_name_for_rev(x) for x in rev_names)
    rev_items = (_item_for_rev(x) for x in rev_items)

    # NOTE(review): 'first_commit' is read below before any visible
    # assignment, and this loop appears to belong to an elided
    # "if not names:" branch -- presumably 'first_commit = None' and the
    # branch header are missing from this view; confirm upstream.
        for item in rev_items:
            first_commit = first_commit or item
            yield next(rev_names), item
        yield 'latest', first_commit

    # Revs are in reverse chronological order by default
    last_name = min(names)
    for item in rev_items:
        first_commit = first_commit or item
        name = next(rev_names) # Might have -N dup suffix
        # NOTE(review): the early-stop comparison against last_name and
        # the body of this filter are not visible in this view.
        if not name in names:
    # FIXME: need real short circuit...
    for _ in rev_items: pass
    for _ in rev_names: pass

    if 'latest' in names:
        yield 'latest', first_commit
def tags_items(repo, names):
    """Yield (name, item) for the repository's tags; names, when
    logically true, restricts the result."""

    # NOTE(review): the header of an inner helper (presumably
    # 'def tag_item(oid):') and its branch headers on the tag's object
    # type are not visible in this view; the dedented lines below
    # depend on them -- confirm upstream.
        assert len(oid) == 20
        oidx = oid.encode('hex')
        _, typ, size = next(it)
        return _commit_item_from_data(oid, ''.join(it))
        return Item(meta=default_file_mode, oid=oid)
        return Item(meta=default_dir_mode, oid=oid)
        raise Exception('unexpected tag type ' + typ + ' for tag ' + name)

    # We have to pull these all into ram because tag_item calls cat()
    for name, oid in tuple(repo.refs(names, limit_to_tags=True)):
        assert(name.startswith('refs/tags/'))
        yield name, tag_item(oid)

    # Assumes no duplicate refs
    if type(names) not in (frozenset, set):
        names = frozenset(names)
    remaining = len(names)
    last_name = max(names)
    for name, oid in repo.refs(names, limit_to_tags=True):
        assert(name.startswith('refs/tags/'))
        # NOTE(review): the 'refs/tags/' prefix strip, the filter body,
        # and the remaining-count bookkeeping appear to be elided here.
        if name not in names:
        yield name, tag_item(oid)
def contents(repo, item, names=None, want_meta=True):
    """Yields information about the items contained in item.  Yields
    (name, item) for each name in names, if the name exists, in an
    unspecified order.  If there are no names, then yields (name,
    item) for all items, including a first item named '.'
    representing the container itself.

    The meta value for any directories other than '.' will be a
    default directory mode, not a Metadata object.  This is because
    the actual metadata for a directory is stored inside the directory
    (see fill_in_metadata_if_dir() or ensure_item_has_metadata()).

    Note that want_meta is advisory.  For any given item, item.meta
    might be a Metadata instance or a mode, and if the former,
    meta.size might be None.  Missing sizes can be computed via
    item_size() or augment_item_meta(..., include_size=True).

    Do not modify any item.meta Metadata instances directly.  If
    needed, make a copy via item.meta.copy() and modify that instead.

    """
    # Q: are we comfortable promising '.' first when no names?
    assert S_ISDIR(item_mode(item))
    # NOTE(review): 'item_t', 'data', and 'tree_data' are read below but
    # never assigned in this view -- presumably 'item_t = type(item)',
    # the ''.join(it) reads, the want_meta branch headers, and the
    # Root/Tags dispatch headers are elided; confirm upstream.
    if item_t in real_tree_types:
        it = repo.cat(item.oid.encode('hex'))
        _, obj_type, size = next(it)
        if obj_type == 'tree':
            item_gen = tree_items_with_meta(repo, item.oid, data, names)
            item_gen = tree_items(item.oid, data, names)
        elif obj_type == 'commit':
            item_gen = tree_items_with_meta(repo, item.oid, tree_data, names)
            item_gen = tree_items(item.oid, tree_data, names)
        raise Exception('unexpected git ' + obj_type)
    elif item_t == RevList:
        item_gen = revlist_items(repo, item.oid, names)
        item_gen = root_items(repo, names)
        item_gen = tags_items(repo, names)
        raise Exception('unexpected VFS item ' + str(item))
    # NOTE(review): the final loop yielding from item_gen is also not
    # visible in this view.
def _resolve_path(repo, path, parent=None, want_meta=True, deref=False):
    """Resolve path relative to parent, returning a tuple of (name, item)
    pairs; see resolve()/lresolve() for the public contract.
    NOTE(review): substantial parts of this function (the parent
    validation loop, the 'past' initialization, the main traversal
    loop header, the symlink-count/ELOOP bookkeeping, and several
    branch headers) are not visible in this view; the irregular
    indentation below reflects those elisions -- confirm upstream."""
        assert type(x[0]) in (bytes, str)
        assert type(x[1]) in item_types
    assert parent[0][1] == _root
    future = _decompose_path(path)
    if path.startswith('/'):
        if future == ['']: # path was effectively '/'
            return (('', _root),)
    if not future: # e.g. if path was effectively '.'
    segment = future.pop()
        if len(past) > 1: # .. from / is /
    parent_name, parent_item = past[-1]
    # When we need metadata, also ask for '.' so we get the parent's.
    wanted = (segment,) if not want_meta else ('.', segment)
    items = tuple(contents(repo, parent_item, names=wanted,
                           want_meta=want_meta))
        item = items[0][1] if items else None
    else: # First item will be '.' and have the metadata
        item = items[1][1] if len(items) == 2 else None
        dot, dot_item = items[0]
        past[-1] = parent_name, parent_item
        past.append((segment, None),)
    mode = item_mode(item)
    if not S_ISLNK(mode):
        if not S_ISDIR(mode):
            past.append((segment, item),)
        if want_meta and type(item) in real_tree_types:
            dir_meta = _find_treeish_oid_metadata(repo, item.oid)
                item = item._replace(meta=dir_meta)
            past.append((segment, item),)
        past.append((segment, item))
    if not future and not deref:
        # Last segment and lresolve semantics: keep the symlink itself.
        past.append((segment, item),)
    target = readlink(repo, item)
    target_future = _decompose_path(target)
    if target.startswith('/'):
        future = target_future
        if target_future == ['']: # path was effectively '/'
        future.extend(target_future)
    # NOTE(review): the IOError(ELOOP, ...) constructor for these
    # arguments is not visible in this view.
    'too many symlinks encountered while resolving %r%s'
    % (path, 'relative to %r' % parent if parent else ''),
    terminus=tuple(past + [(segment, item)]))
def lresolve(repo, path, parent=None, want_meta=True):
    """Perform exactly the same function as resolve(), except if the
    final path element is a symbolic link, don't follow it, just
    return it in the result."""
    return _resolve_path(repo, path, parent=parent, want_meta=want_meta,
                         deref=False)
def resolve(repo, path, parent=None, want_meta=True):
    """Follow the path in the virtual filesystem and return a tuple
    representing the location, if any, denoted by the path.  Each
    element in the result tuple will be (name, info), where info will
    be a VFS item that can be passed to functions like item_mode().

    If a path segment that does not exist is encountered during
    resolution, the result will represent the location of the missing
    item, and that item in the result will be None.

    Any symlinks along the path, including at the end, will be
    resolved.  A VFS IOError with the errno attribute set to ELOOP
    will be raised if too many symlinks are traversed while following
    the path.  That exception is effectively like a normal
    ELOOP IOError exception, but will include a terminus element
    describing the location of the failure, which will be a tuple of
    (name, info) elements.

    Currently, a path ending in '/' will still resolve if it exists,
    even if not a directory.  The parent, if specified, must be a
    sequence of (name, item) tuples, and will provide the starting
    point for the resolution of the path.  The result may include
    elements of parent directly, so they must not be modified later.
    If this is a concern, pass in "name, copy_item(item) for
    name, item in parent" instead.

    When want_meta is true, detailed metadata will be included in each
    result item if it's available, otherwise item.meta will be an
    integer mode.  The metadata size may or may not be provided, but
    can be computed by item_size() or augment_item_meta(...,
    include_size=True).  Setting want_meta=False is rarely desirable
    since it can limit the VFS to just the metadata git itself can
    represent, and so, as an example, fifos and sockets will appear to
    be regular files (e.g. S_ISREG(item_mode(item)) will be true).
    But the option is provided because it may be more efficient when
    only the path names or the more limited metadata is sufficient.

    Do not modify any item.meta Metadata instances directly.  If
    needed, make a copy via item.meta.copy() and modify that instead.

    """
    result = _resolve_path(repo, path, parent=parent, want_meta=want_meta,
                           deref=True)
    _, leaf_item = result[-1]
    if leaf_item:
        # With deref=True, a final symlink must have been followed.
        assert not S_ISLNK(item_mode(leaf_item))
    return result
def augment_item_meta(repo, item, include_size=False):
    """Ensure item has a Metadata instance for item.meta.  If item.meta is
    currently a mode, replace it with a compatible "fake" Metadata
    instance.  If include_size is true, ensure item.meta.size is
    correct, computing it if needed.  If item.meta is a Metadata
    instance, this call may modify it in place or replace it.

    """
    # If we actually had parallelism, we'd need locking...
    m = item.meta
    if isinstance(m, Metadata):
        if include_size and m.size is None:
            m.size = _compute_item_size(repo, item)
            return item._replace(meta=m)
        return item
    # m is an integer mode; fake up compatible metadata.
    meta = Metadata()
    meta.mode = m
    meta.uid = meta.gid = meta.atime = meta.mtime = meta.ctime = 0
    if S_ISLNK(m):
        # A symlink's size is always the length of its target.
        target = _readlink(repo, item.oid)
        meta.symlink_target = target
        meta.size = len(target)
    elif include_size:
        meta.size = _compute_item_size(repo, item)
    return item._replace(meta=meta)
def fill_in_metadata_if_dir(repo, item):
    """If item is a directory and item.meta is not a Metadata instance,
    attempt to find the metadata for the directory.  If found, return
    a new item augmented to include that metadata.  Otherwise, return
    item.  May be useful for the output of contents().

    """
    if S_ISDIR(item_mode(item)) and not isinstance(item.meta, Metadata):
        # Asking the directory for '.' yields the directory itself with
        # its own metadata attached (it's stored inside the directory).
        items = tuple(contents(repo, item, ('.',), want_meta=True))
        assert len(items) == 1
        assert items[0][0] == '.'
        item = items[0][1]
    return item
def ensure_item_has_metadata(repo, item, include_size=False):
    """If item is a directory, attempt to find and add its metadata.  If
    the item still doesn't have a Metadata instance for item.meta,
    give it one via augment_item_meta().  May be useful for the output
    of contents().

    """
    with_dir_meta = fill_in_metadata_if_dir(repo, item)
    return augment_item_meta(repo, with_dir_meta, include_size=include_size)