1 """Virtual File System interface to bup repository content.
3 This module provides a path-based interface to the content of a bup
6 The VFS is structured like this:
9 /SAVE-NAME/SAVE-DATE/...
12 Each path is represented by an item that has least an item.meta which
13 may be either a Metadata object, or an integer mode. Functions like
14 item_mode() and item_size() will return the mode and size in either
15 case. Any item.meta Metadata instances must not be modified directly.
16 Make a copy to modify via item.meta.copy() if needed.
18 The want_meta argument is advisory for calls that accept it, and it
19 may not be honored. Callers must be able to handle an item.meta value
20 that is either an instance of Metadata or an integer mode, perhaps
21 via item_mode() or augment_item_meta().
23 Setting want_meta=False is rarely desirable since it can limit the VFS
24 to only the metadata that git itself can represent, and so for
25 example, fifos and sockets will appear to be regular files
26 (e.g. S_ISREG(item_mode(item)) will be true). But the option is still
27 provided because it may be more efficient when just the path names or
28 the more limited metadata is sufficient.
30 Any given metadata object's size may be None, in which case the size
31 can be computed via item_size() or augment_item_meta(...,
34 When traversing a directory using functions like contents(), the meta
35 value for any directories other than '.' will be a default directory
36 mode, not a Metadata object. This is because the actual metadata for
37 a directory is stored inside the directory.
39 At the moment tagged commits (e.g. /.tag/some-commit) are represented
40 as an item that is indistinguishable from a normal directory, so you
41 cannot assume that the oid of an item satisfying
42 S_ISDIR(item_mode(item)) refers to a tree.
46 from __future__ import print_function
47 from collections import namedtuple
48 from errno import ELOOP, ENOENT, ENOTDIR
49 from itertools import chain, dropwhile, izip
50 from stat import S_IFDIR, S_IFLNK, S_IFREG, S_ISDIR, S_ISLNK, S_ISREG
51 from time import localtime, strftime
52 import exceptions, re, sys
54 from bup import client, git, metadata
55 from bup.git import BUP_CHUNKED, cp, get_commit_items, parse_commit, tree_decode
56 from bup.helpers import debug2, last
57 from bup.metadata import Metadata
58 from bup.repo import LocalRepo, RemoteRepo
61 class IOError(exceptions.IOError):
62 def __init__(self, errno, message):
63 exceptions.IOError.__init__(self, errno, message)
66 def __init__(self, message, terminus=None):
67 IOError.__init__(self, ELOOP, message)
68 self.terminus = terminus
70 default_file_mode = S_IFREG | 0o644
71 default_dir_mode = S_IFDIR | 0o755
72 default_symlink_mode = S_IFLNK | 0o755
74 def _default_mode_for_gitmode(gitmode):
76 return default_file_mode
78 return default_dir_mode
80 return default_symlink_mode
81 raise Exception('unexpected git mode ' + oct(gitmode))
83 def _normal_or_chunked_file_size(repo, oid):
84 """Return the size of the normal or chunked file indicated by oid."""
85 # FIXME: --batch-format CatPipe?
86 it = repo.cat(oid.encode('hex'))
87 _, obj_t, size = next(it)
89 while obj_t == 'tree':
90 mode, name, last_oid = last(tree_decode(''.join(it)))
92 it = repo.cat(last_oid.encode('hex'))
93 _, obj_t, size = next(it)
94 return ofs + sum(len(b) for b in it)
96 def _tree_chunks(repo, tree, startofs):
97 "Tree should be a sequence of (name, mode, hash) as per tree_decode()."
99 # name is the chunk's hex offset in the original file
100 tree = dropwhile(lambda (_1, name, _2): int(name, 16) < startofs, tree)
101 for mode, name, oid in tree:
103 skipmore = startofs - ofs
106 it = repo.cat(oid.encode('hex'))
107 _, obj_t, size = next(it)
110 assert obj_t == 'tree'
111 for b in _tree_chunks(repo, tree_decode(data), skipmore):
114 assert obj_t == 'blob'
115 yield data[skipmore:]
118 def __init__(self, repo, oid, startofs):
119 it = repo.cat(oid.encode('hex'))
120 _, obj_t, size = next(it)
121 isdir = obj_t == 'tree'
124 self.it = _tree_chunks(repo, tree_decode(data), startofs)
128 self.blob = data[startofs:]
131 def next(self, size):
133 while len(out) < size:
134 if self.it and not self.blob:
136 self.blob = self.it.next()
137 except StopIteration:
140 want = size - len(out)
141 out += self.blob[:want]
142 self.blob = self.blob[want:]
145 debug2('next(%d) returned %d\n' % (size, len(out)))
149 class _FileReader(object):
150 def __init__(self, repo, oid, known_size=None):
155 self._size = known_size
157 def _compute_size(self):
159 self._size = _normal_or_chunked_file_size(self._repo, self.oid)
164 raise IOError(errno.EINVAL, 'Invalid argument')
165 if ofs > self._compute_size():
166 raise IOError(errno.EINVAL, 'Invalid argument')
172 def read(self, count=-1):
174 count = self._compute_size() - self.ofs
175 if not self.reader or self.reader.ofs != self.ofs:
176 self.reader = _ChunkReader(self._repo, self.oid, self.ofs)
178 buf = self.reader.next(count)
181 raise # our offsets will be all screwed up otherwise
190 def __exit__(self, type, value, traceback):
194 _multiple_slashes_rx = re.compile(r'//+')
196 def _decompose_path(path):
197 """Return a reversed list of path elements, omitting any occurrences
198 of "." and ignoring any leading or trailing slash."""
199 path = re.sub(_multiple_slashes_rx, '/', path)
200 if path.startswith('/'):
202 if path.endswith('/'):
204 result = [x for x in path.split('/') if x != '.']
209 Item = namedtuple('Item', ('meta', 'oid'))
210 Chunky = namedtuple('Chunky', ('meta', 'oid'))
211 Root = namedtuple('Root', ('meta'))
212 Tags = namedtuple('Tags', ('meta'))
213 RevList = namedtuple('RevList', ('meta', 'oid'))
215 _root = Root(meta=default_dir_mode)
216 _tags = Tags(meta=default_dir_mode)
219 """Return a completely independent copy of item, such that
220 modifications will not affect the original.
223 meta = getattr(item, 'meta', None)
226 return(item._replace(meta=meta.copy()))
229 """Return the integer mode (stat st_mode) for item."""
231 if isinstance(m, Metadata):
235 def _read_dir_meta(bupm):
236 # This is because save writes unmodified Metadata() entries for
237 # fake parents -- test-save-strip-graft.sh demonstrates.
238 m = Metadata.read(bupm)
240 return default_dir_mode
241 assert m.mode is not None
246 def _tree_data_and_bupm(repo, oid):
247 """Return (tree_bytes, bupm_oid) where bupm_oid will be None if the
248 tree has no metadata (i.e. older bup save, or non-bup tree).
251 assert len(oid) == 20
252 it = repo.cat(oid.encode('hex'))
253 _, item_t, size = next(it)
255 if item_t == 'commit':
256 commit = parse_commit(data)
257 it = repo.cat(commit.tree)
258 _, item_t, size = next(it)
260 assert item_t == 'tree'
261 elif item_t != 'tree':
262 raise Exception('%r is not a tree or commit' % oid.encode('hex'))
263 for _, mangled_name, sub_oid in tree_decode(data):
264 if mangled_name == '.bupm':
266 if mangled_name > '.bupm':
270 def _find_dir_item_metadata(repo, item):
271 """Return the metadata for the tree or commit item, or None if the
272 tree has no metadata (i.e. older bup save, or non-bup tree).
275 tree_data, bupm_oid = _tree_data_and_bupm(repo, item.oid)
277 with _FileReader(repo, bupm_oid) as meta_stream:
278 return _read_dir_meta(meta_stream)
281 def _readlink(repo, oid):
282 return ''.join(repo.join(oid.encode('hex')))
284 def readlink(repo, item):
285 """Return the link target of item, which must be a symlink. Reads the
286 target from the repository if necessary."""
288 assert S_ISLNK(item_mode(item))
289 if isinstance(item.meta, Metadata):
290 target = item.meta.symlink_target
293 return _readlink(repo, item.oid)
295 def _compute_item_size(repo, item):
296 mode = item_mode(item)
298 size = _normal_or_chunked_file_size(repo, item.oid)
301 return len(_readlink(repo, item.oid))
304 def item_size(repo, item):
305 """Return the size of item, computing it if necessary."""
307 if isinstance(m, Metadata) and m.size is not None:
309 return _compute_item_size(repo, item)
311 def fopen(repo, item):
312 """Return an open reader for the given file item."""
314 assert S_ISREG(item_mode(item))
315 return _FileReader(repo, item.oid)
317 def augment_item_meta(repo, item, include_size=False):
318 """Ensure item has a Metadata instance for item.meta. If item.meta is
319 currently a mode, replace it with a compatible "fake" Metadata
320 instance. If include_size is true, ensure item.meta.size is
321 correct, computing it if needed. If item.meta is a Metadata
322 instance, this call may modify it in place or replace it.
325 # If we actually had parallelism, we'd need locking...
328 if isinstance(m, Metadata):
329 if include_size and m.size is None:
330 m.size = _compute_item_size(repo, item)
331 return item._replace(meta=m)
336 meta.uid = meta.gid = meta.atime = meta.mtime = meta.ctime = 0
338 target = _readlink(repo, item.oid)
339 meta.symlink_target = target
340 meta.size = len(target)
342 meta.size = _compute_item_size(repo, item)
343 return item._replace(meta=meta)
345 def _commit_meta_from_auth_sec(author_sec):
347 m.mode = default_dir_mode
348 m.uid = m.gid = m.size = 0
349 m.atime = m.mtime = m.ctime = author_sec * 10**9
352 def _commit_meta_from_oidx(repo, oidx):
354 _, typ, size = next(it)
355 assert typ == 'commit'
356 author_sec = parse_commit(''.join(it)).author_sec
357 return _commit_meta_from_auth_sec(author_sec)
359 def parse_rev_auth_secs(f):
360 tree, author_secs = f.readline().split(None, 2)
361 return tree, int(author_secs)
363 def root_items(repo, names=None):
364 """Yield (name, item) for the items in '/' in the VFS. Return
365 everything if names is logically false, otherwise return only
366 items with a name in the collection.
369 # FIXME: what about non-leaf refs like 'refs/heads/foo/bar/baz?
375 # FIXME: maybe eventually support repo.clone() or something
376 # and pass in two repos, so we can drop the tuple() and stream
377 # in parallel (i.e. meta vs refs).
378 for name, oid in tuple(repo.refs([], limit_to_heads=True)):
379 assert(name.startswith('refs/heads/'))
381 m = _commit_meta_from_oidx(repo, oid.encode('hex'))
382 yield name, RevList(meta=m, oid=oid)
390 if ref in ('.', '.tag'):
393 oidx, typ, size = next(it)
397 assert typ == 'commit'
398 commit = parse_commit(''.join(it))
399 yield ref, RevList(meta=_commit_meta_from_auth_sec(commit.author_sec),
400 oid=oidx.decode('hex'))
402 def ordered_tree_entries(tree_data, bupm=None):
403 """Yields (name, mangled_name, kind, gitmode, oid) for each item in
404 tree, sorted by name.
407 # Sadly, the .bupm entries currently aren't in git tree order,
408 # i.e. they don't account for the fact that git sorts trees
409 # (including our chunked trees) as if their names ended with "/",
410 # so "fo" sorts after "fo." iff fo is a directory. This makes
411 # streaming impossible when we need the metadata.
412 def result_from_tree_entry(tree_entry):
413 gitmode, mangled_name, oid = tree_entry
414 name, kind = git.demangle_name(mangled_name, gitmode)
415 return name, mangled_name, kind, gitmode, oid
417 tree_ents = (result_from_tree_entry(x) for x in tree_decode(tree_data))
419 tree_ents = sorted(tree_ents, key=lambda x: x[0])
420 for ent in tree_ents:
423 def tree_items(oid, tree_data, names=frozenset(), bupm=None):
425 def tree_item(ent_oid, kind, gitmode):
426 if kind == BUP_CHUNKED:
427 meta = Metadata.read(bupm) if bupm else default_file_mode
428 return Chunky(oid=ent_oid, meta=meta)
431 # No metadata here (accessable via '.' inside ent_oid).
432 return Item(meta=default_dir_mode, oid=ent_oid)
434 return Item(oid=ent_oid,
435 meta=(Metadata.read(bupm) if bupm \
436 else _default_mode_for_gitmode(gitmode)))
438 assert len(oid) == 20
440 dot_meta = _read_dir_meta(bupm) if bupm else default_dir_mode
441 yield '.', Item(oid=oid, meta=dot_meta)
442 tree_entries = ordered_tree_entries(tree_data, bupm)
443 for name, mangled_name, kind, gitmode, ent_oid in tree_entries:
444 if mangled_name == '.bupm':
447 yield name, tree_item(ent_oid, kind, gitmode)
450 # Assumes the tree is properly formed, i.e. there are no
451 # duplicates, and entries will be in git tree order.
452 if type(names) not in (frozenset, set):
453 names = frozenset(names)
454 remaining = len(names)
456 # Account for the bupm sort order issue (cf. ordered_tree_entries above)
457 last_name = max(names) if bupm else max(names) + '/'
460 dot_meta = _read_dir_meta(bupm) if bupm else default_dir_mode
461 yield '.', Item(oid=oid, meta=dot_meta)
466 tree_entries = ordered_tree_entries(tree_data, bupm)
467 for name, mangled_name, kind, gitmode, ent_oid in tree_entries:
468 if mangled_name == '.bupm':
471 if name not in names:
473 break # given bupm sort order, we're finished
474 if (kind == BUP_CHUNKED or not S_ISDIR(gitmode)) and bupm:
477 yield name, tree_item(ent_oid, kind, gitmode)
482 def tree_items_with_meta(repo, oid, tree_data, names):
483 # For now, the .bupm order doesn't quite match git's, and we don't
484 # load the tree data incrementally anyway, so we just work in RAM
486 assert len(oid) == 20
488 for _, mangled_name, sub_oid in tree_decode(tree_data):
489 if mangled_name == '.bupm':
490 bupm = _FileReader(repo, sub_oid)
492 if mangled_name > '.bupm':
494 for item in tree_items(oid, tree_data, names, bupm):
497 _save_name_rx = re.compile(r'^\d\d\d\d-\d\d-\d\d-\d{6}$')
499 def revlist_items(repo, oid, names):
500 assert len(oid) == 20
501 oidx = oid.encode('hex')
503 # There might well be duplicate names in this dir (time resolution is secs)
504 names = frozenset(name for name in (names or tuple()) \
505 if _save_name_rx.match(name) or name in ('.', 'latest'))
507 # Do this before we open the rev_list iterator so we're not nesting
508 if (not names) or ('.' in names):
509 yield '.', RevList(oid=oid, meta=_commit_meta_from_oidx(repo, oidx))
511 revs = repo.rev_list((oidx,), format='%T %at', parse=parse_rev_auth_secs)
512 first_rev = next(revs, None)
513 revs = chain((first_rev,), revs)
516 for commit, (tree_oidx, utc) in revs:
517 assert len(tree_oidx) == 40
518 name = strftime('%Y-%m-%d-%H%M%S', localtime(utc))
519 yield name, Item(meta=default_dir_mode, oid=tree_oidx.decode('hex'))
521 commit, (tree_oidx, utc) = first_rev
522 yield 'latest', Item(meta=default_dir_mode,
523 oid=tree_oidx.decode('hex'))
526 # Revs are in reverse chronological order by default
527 last_name = min(names)
528 for commit, (tree_oidx, utc) in revs:
529 assert len(tree_oidx) == 40
530 name = strftime('%Y-%m-%d-%H%M%S', localtime(utc))
533 if not name in names:
535 yield name, Item(meta=default_dir_mode, oid=tree_oidx.decode('hex'))
537 # FIXME: need real short circuit...
541 if first_rev and 'latest' in names:
542 commit, (tree_oidx, utc) = first_rev
543 yield 'latest', Item(meta=default_dir_mode, oid=tree_oidx.decode('hex'))
545 def tags_items(repo, names):
549 assert len(oid) == 20
550 oidx = oid.encode('hex')
552 _, typ, size = next(it)
554 tree_oid = parse_commit(''.join(it)).tree.decode('hex')
555 assert len(tree_oid) == 20
556 # FIXME: more efficient/bulk?
557 return RevList(meta=_commit_meta_from_oidx(repo, oidx), oid=oid)
560 return Item(meta=default_file_mode, oid=oid)
562 return Item(meta=default_dir_mode, oid=oid)
563 raise Exception('unexpected tag type ' + typ + ' for tag ' + name)
567 # We have to pull these all into ram because tag_item calls cat()
568 for name, oid in tuple(repo.refs(names, limit_to_tags=True)):
569 assert(name.startswith('refs/tags/'))
571 yield name, tag_item(oid)
574 # Assumes no duplicate refs
575 if type(names) not in (frozenset, set):
576 names = frozenset(names)
577 remaining = len(names)
578 last_name = max(names)
585 for name, oid in repo.refs(names, limit_to_tags=True):
586 assert(name.startswith('refs/tags/'))
590 if name not in names:
592 yield name, tag_item(oid)
597 def contents(repo, item, names=None, want_meta=True):
598 """Yields information about the items contained in item. Yields
599 (name, item) for each name in names, if the name exists, in an
600 unspecified order. If there are no names, then yields (name,
601 item) for all items, including, a first item named '.'
602 representing the container itself.
604 Any given name might produce more than one result. For example,
605 saves to a branch that happen within the same second currently end
606 up with the same VFS timestmap, i.e. /foo/2017-09-10-150833/.
608 Note that want_meta is advisory. For any given item, item.meta
609 might be a Metadata instance or a mode, and if the former,
610 meta.size might be None. Missing sizes can be computed via via
611 item_size() or augment_item_meta(..., include_size=True).
613 Do not modify any item.meta Metadata instances directly. If
614 needed, make a copy via item.meta.copy() and modify that instead.
617 # Q: are we comfortable promising '.' first when no names?
619 assert S_ISDIR(item_mode(item))
622 it = repo.cat(item.oid.encode('hex'))
623 _, obj_type, size = next(it)
625 if obj_type == 'tree':
627 item_gen = tree_items_with_meta(repo, item.oid, data, names)
629 item_gen = tree_items(item.oid, data, names)
630 elif obj_type == 'commit':
631 tree_oidx = parse_commit(data).tree
632 it = repo.cat(tree_oidx)
633 _, obj_type, size = next(it)
634 assert obj_type == 'tree'
635 tree_data = ''.join(it)
637 item_gen = tree_items_with_meta(repo, tree_oidx.decode('hex'),
640 item_gen = tree_items(tree_oidx.decode('hex'), tree_data, names)
643 raise Exception('unexpected git ' + obj_type)
644 elif item_t == RevList:
645 item_gen = revlist_items(repo, item.oid, names)
647 item_gen = root_items(repo, names)
649 item_gen = tags_items(repo, names)
651 raise Exception('unexpected VFS item ' + str(item))
655 def _resolve_path(repo, path, parent=None, want_meta=True, deref=False):
659 future = _decompose_path(path)
661 if path.startswith('/'):
664 if future == ['']: # path was effectively '/'
666 if not past and not parent:
673 segment = future.pop()
675 if len(past) > 1: # .. from / is /
678 parent_name, parent_item = past[-1]
679 wanted = (segment,) if not want_meta else ('.', segment)
680 items = tuple(contents(repo, parent_item, names=wanted,
681 want_meta=want_meta))
683 item = items[0][1] if items else None
684 else: # First item will be '.' and have the metadata
685 item = items[1][1] if len(items) == 2 else None
686 dot, dot_item = items[0]
688 past[-1] = parent_name, parent_item
690 return tuple(past + [(segment, None)])
691 mode = item_mode(item)
692 if not S_ISLNK(mode):
693 if not S_ISDIR(mode):
695 return tuple(past + [(segment, item)])
697 if want_meta and type(item) == Item:
698 dir_meta = _find_dir_item_metadata(repo, item)
700 item = item._replace(meta=dir_meta)
702 return tuple(past + [(segment, item)])
703 past.append((segment, item))
705 if not future and not deref:
706 return tuple(past + [(segment, item)])
707 target = readlink(repo, item)
708 target_future = _decompose_path(target)
709 if target.startswith('/'):
710 future = target_future
712 if target_future == ['']: # path was effectively '/'
715 future = future + target_future
718 raise Loop('too many symlinks encountered while resolving %r%s'
720 'relative to %r' % parent if parent else ''))
722 def lresolve(repo, path, parent=None, want_meta=True):
723 """Perform exactly the same function as resolve(), except if the
724 final path element is a symbolic link, don't follow it, just
725 return it in the result."""
726 return _resolve_path(repo, path, parent=parent, want_meta=want_meta,
730 def resolve(repo, path, parent=None, want_meta=True):
731 """Follow the path in the virtual filesystem and return a tuple
732 representing the location, if any, denoted by the path. Each
733 element in the result tuple will be (name, info), where info will
734 be a VFS item that can be passed to functions like item_mode().
736 If a path segment that does not exist is encountered during
737 resolution, the result will represent the location of the missing
738 item, and that item in the result will be None.
740 Any symlinks along the path, including at the end, will be
741 resolved. A Loop exception will be raised if too many symlinks
742 are traversed whiile following the path. raised if too many
743 symlinks are traversed while following the path. That exception
744 is effectively like a normal ELOOP IOError exception, but will
745 include a terminus element describing the location of the failure,
746 which will be a tuple of (name, info) elements.
748 Currently, a path ending in '/' will still resolve if it exists,
749 even if not a directory. The parent, if specified, must be a
750 (name, item) tuple, and will provide the starting point for the
751 resolution of the path. Currently, the path must be relative when
752 a parent is provided. The result may include parent directly, so
753 it must not be modified later. If this is a concern, pass in
754 copy_item(parent) instead.
756 When want_meta is true, detailed metadata will be included in each
757 result item if it's avaiable, otherwise item.meta will be an
758 integer mode. The metadata size may or may not be provided, but
759 can be computed by item_size() or augment_item_meta(...,
760 include_size=True). Setting want_meta=False is rarely desirable
761 since it can limit the VFS to just the metadata git itself can
762 represent, and so, as an example, fifos and sockets will appear to
763 be regular files (e.g. S_ISREG(item_mode(item)) will be true) .
764 But the option is provided because it may be more efficient when
765 only the path names or the more limited metadata is sufficient.
767 Do not modify any item.meta Metadata instances directly. If
768 needed, make a copy via item.meta.copy() and modify that instead.
771 return _resolve_path(repo, path, parent=parent, want_meta=want_meta,