1 """Virtual File System interface to bup repository content.
3 This module provides a path-based interface to the content of a bup
6 The VFS is structured like this:
9 /SAVE-NAME/SAVE-DATE/...
12 Each path is represented by an item that has least an item.meta which
13 may be either a Metadata object, or an integer mode. Functions like
14 item_mode() and item_size() will return the mode and size in either
15 case. Any item.meta Metadata instances must not be modified directly.
16 Make a copy to modify via item.meta.copy() if needed.
18 The want_meta argument is advisory for calls that accept it, and it
19 may not be honored. Callers must be able to handle an item.meta value
20 that is either an instance of Metadata or an integer mode, perhaps
21 via item_mode() or augment_item_meta().
23 Setting want_meta=False is rarely desirable since it can limit the VFS
24 to only the metadata that git itself can represent, and so for
25 example, fifos and sockets will appear to be regular files
26 (e.g. S_ISREG(item_mode(item)) will be true). But the option is still
27 provided because it may be more efficient when just the path names or
28 the more limited metadata is sufficient.
30 Any given metadata object's size may be None, in which case the size
31 can be computed via item_size() or augment_item_meta(...,
34 When traversing a directory using functions like contents(), the meta
35 value for any directories other than '.' will be a default directory
36 mode, not a Metadata object. This is because the actual metadata for
37 a directory is stored inside the directory.
39 At the moment tagged commits (e.g. /.tag/some-commit) are represented
40 as an item that is indistinguishable from a normal directory, so you
41 cannot assume that the oid of an item satisfying
42 S_ISDIR(item_mode(item)) refers to a tree.
46 from __future__ import print_function
47 from collections import namedtuple
48 from errno import ELOOP, ENOENT, ENOTDIR
49 from itertools import chain, dropwhile, groupby, izip, tee
50 from stat import S_IFDIR, S_IFLNK, S_IFREG, S_ISDIR, S_ISLNK, S_ISREG
51 from time import localtime, strftime
52 import exceptions, re, sys
54 from bup import client, git, metadata
55 from bup.git import BUP_CHUNKED, cp, get_commit_items, parse_commit, tree_decode
56 from bup.helpers import debug2, last
57 from bup.metadata import Metadata
58 from bup.repo import LocalRepo, RemoteRepo
61 class IOError(exceptions.IOError):
62 def __init__(self, errno, message):
63 exceptions.IOError.__init__(self, errno, message)
66 def __init__(self, message, terminus=None):
67 IOError.__init__(self, ELOOP, message)
68 self.terminus = terminus
70 default_file_mode = S_IFREG | 0o644
71 default_dir_mode = S_IFDIR | 0o755
72 default_symlink_mode = S_IFLNK | 0o755
74 def _default_mode_for_gitmode(gitmode):
76 return default_file_mode
78 return default_dir_mode
80 return default_symlink_mode
81 raise Exception('unexpected git mode ' + oct(gitmode))
83 def _normal_or_chunked_file_size(repo, oid):
84 """Return the size of the normal or chunked file indicated by oid."""
85 # FIXME: --batch-format CatPipe?
86 it = repo.cat(oid.encode('hex'))
87 _, obj_t, size = next(it)
89 while obj_t == 'tree':
90 mode, name, last_oid = last(tree_decode(''.join(it)))
92 it = repo.cat(last_oid.encode('hex'))
93 _, obj_t, size = next(it)
94 return ofs + sum(len(b) for b in it)
96 def _tree_chunks(repo, tree, startofs):
97 "Tree should be a sequence of (name, mode, hash) as per tree_decode()."
99 # name is the chunk's hex offset in the original file
100 tree = dropwhile(lambda (_1, name, _2): int(name, 16) < startofs, tree)
101 for mode, name, oid in tree:
103 skipmore = startofs - ofs
106 it = repo.cat(oid.encode('hex'))
107 _, obj_t, size = next(it)
110 assert obj_t == 'tree'
111 for b in _tree_chunks(repo, tree_decode(data), skipmore):
114 assert obj_t == 'blob'
115 yield data[skipmore:]
118 def __init__(self, repo, oid, startofs):
119 it = repo.cat(oid.encode('hex'))
120 _, obj_t, size = next(it)
121 isdir = obj_t == 'tree'
124 self.it = _tree_chunks(repo, tree_decode(data), startofs)
128 self.blob = data[startofs:]
131 def next(self, size):
133 while len(out) < size:
134 if self.it and not self.blob:
136 self.blob = self.it.next()
137 except StopIteration:
140 want = size - len(out)
141 out += self.blob[:want]
142 self.blob = self.blob[want:]
145 debug2('next(%d) returned %d\n' % (size, len(out)))
149 class _FileReader(object):
150 def __init__(self, repo, oid, known_size=None):
155 self._size = known_size
157 def _compute_size(self):
159 self._size = _normal_or_chunked_file_size(self._repo, self.oid)
164 raise IOError(errno.EINVAL, 'Invalid argument')
165 if ofs > self._compute_size():
166 raise IOError(errno.EINVAL, 'Invalid argument')
172 def read(self, count=-1):
174 count = self._compute_size() - self.ofs
175 if not self.reader or self.reader.ofs != self.ofs:
176 self.reader = _ChunkReader(self._repo, self.oid, self.ofs)
178 buf = self.reader.next(count)
181 raise # our offsets will be all screwed up otherwise
190 def __exit__(self, type, value, traceback):
194 _multiple_slashes_rx = re.compile(r'//+')
196 def _decompose_path(path):
197 """Return a reversed list of path elements, omitting any occurrences
198 of "." and ignoring any leading or trailing slash."""
199 path = re.sub(_multiple_slashes_rx, '/', path)
200 if path.startswith('/'):
202 if path.endswith('/'):
204 result = [x for x in path.split('/') if x != '.']
209 Item = namedtuple('Item', ('meta', 'oid'))
210 Chunky = namedtuple('Chunky', ('meta', 'oid'))
211 Root = namedtuple('Root', ('meta'))
212 Tags = namedtuple('Tags', ('meta'))
213 RevList = namedtuple('RevList', ('meta', 'oid'))
215 _root = Root(meta=default_dir_mode)
216 _tags = Tags(meta=default_dir_mode)
219 """Return a completely independent copy of item, such that
220 modifications will not affect the original.
223 meta = getattr(item, 'meta', None)
226 return(item._replace(meta=meta.copy()))
229 """Return the integer mode (stat st_mode) for item."""
231 if isinstance(m, Metadata):
235 def _read_dir_meta(bupm):
236 # This is because save writes unmodified Metadata() entries for
237 # fake parents -- test-save-strip-graft.sh demonstrates.
238 m = Metadata.read(bupm)
240 return default_dir_mode
241 assert m.mode is not None
246 def _tree_data_and_bupm(repo, oid):
247 """Return (tree_bytes, bupm_oid) where bupm_oid will be None if the
248 tree has no metadata (i.e. older bup save, or non-bup tree).
251 assert len(oid) == 20
252 it = repo.cat(oid.encode('hex'))
253 _, item_t, size = next(it)
255 if item_t == 'commit':
256 commit = parse_commit(data)
257 it = repo.cat(commit.tree)
258 _, item_t, size = next(it)
260 assert item_t == 'tree'
261 elif item_t != 'tree':
262 raise Exception('%r is not a tree or commit' % oid.encode('hex'))
263 for _, mangled_name, sub_oid in tree_decode(data):
264 if mangled_name == '.bupm':
266 if mangled_name > '.bupm':
270 def _find_dir_item_metadata(repo, item):
271 """Return the metadata for the tree or commit item, or None if the
272 tree has no metadata (i.e. older bup save, or non-bup tree).
275 tree_data, bupm_oid = _tree_data_and_bupm(repo, item.oid)
277 with _FileReader(repo, bupm_oid) as meta_stream:
278 return _read_dir_meta(meta_stream)
281 def _readlink(repo, oid):
282 return ''.join(repo.join(oid.encode('hex')))
284 def readlink(repo, item):
285 """Return the link target of item, which must be a symlink. Reads the
286 target from the repository if necessary."""
288 assert S_ISLNK(item_mode(item))
289 if isinstance(item.meta, Metadata):
290 target = item.meta.symlink_target
293 return _readlink(repo, item.oid)
295 def _compute_item_size(repo, item):
296 mode = item_mode(item)
298 size = _normal_or_chunked_file_size(repo, item.oid)
301 return len(_readlink(repo, item.oid))
304 def item_size(repo, item):
305 """Return the size of item, computing it if necessary."""
307 if isinstance(m, Metadata) and m.size is not None:
309 return _compute_item_size(repo, item)
311 def fopen(repo, item):
312 """Return an open reader for the given file item."""
314 assert S_ISREG(item_mode(item))
315 return _FileReader(repo, item.oid)
317 def augment_item_meta(repo, item, include_size=False):
318 """Ensure item has a Metadata instance for item.meta. If item.meta is
319 currently a mode, replace it with a compatible "fake" Metadata
320 instance. If include_size is true, ensure item.meta.size is
321 correct, computing it if needed. If item.meta is a Metadata
322 instance, this call may modify it in place or replace it.
325 # If we actually had parallelism, we'd need locking...
328 if isinstance(m, Metadata):
329 if include_size and m.size is None:
330 m.size = _compute_item_size(repo, item)
331 return item._replace(meta=m)
336 meta.uid = meta.gid = meta.atime = meta.mtime = meta.ctime = 0
338 target = _readlink(repo, item.oid)
339 meta.symlink_target = target
340 meta.size = len(target)
342 meta.size = _compute_item_size(repo, item)
343 return item._replace(meta=meta)
345 def _commit_meta_from_auth_sec(author_sec):
347 m.mode = default_dir_mode
348 m.uid = m.gid = m.size = 0
349 m.atime = m.mtime = m.ctime = author_sec * 10**9
352 def _commit_meta_from_oidx(repo, oidx):
354 _, typ, size = next(it)
355 assert typ == 'commit'
356 author_sec = parse_commit(''.join(it)).author_sec
357 return _commit_meta_from_auth_sec(author_sec)
359 def parse_rev_auth_secs(f):
360 tree, author_secs = f.readline().split(None, 2)
361 return tree, int(author_secs)
363 def root_items(repo, names=None):
364 """Yield (name, item) for the items in '/' in the VFS. Return
365 everything if names is logically false, otherwise return only
366 items with a name in the collection.
369 # FIXME: what about non-leaf refs like 'refs/heads/foo/bar/baz?
375 # FIXME: maybe eventually support repo.clone() or something
376 # and pass in two repos, so we can drop the tuple() and stream
377 # in parallel (i.e. meta vs refs).
378 for name, oid in tuple(repo.refs([], limit_to_heads=True)):
379 assert(name.startswith('refs/heads/'))
381 m = _commit_meta_from_oidx(repo, oid.encode('hex'))
382 yield name, RevList(meta=m, oid=oid)
390 if ref in ('.', '.tag'):
393 oidx, typ, size = next(it)
397 assert typ == 'commit'
398 commit = parse_commit(''.join(it))
399 yield ref, RevList(meta=_commit_meta_from_auth_sec(commit.author_sec),
400 oid=oidx.decode('hex'))
402 def ordered_tree_entries(tree_data, bupm=None):
403 """Yields (name, mangled_name, kind, gitmode, oid) for each item in
404 tree, sorted by name.
407 # Sadly, the .bupm entries currently aren't in git tree order,
408 # i.e. they don't account for the fact that git sorts trees
409 # (including our chunked trees) as if their names ended with "/",
410 # so "fo" sorts after "fo." iff fo is a directory. This makes
411 # streaming impossible when we need the metadata.
412 def result_from_tree_entry(tree_entry):
413 gitmode, mangled_name, oid = tree_entry
414 name, kind = git.demangle_name(mangled_name, gitmode)
415 return name, mangled_name, kind, gitmode, oid
417 tree_ents = (result_from_tree_entry(x) for x in tree_decode(tree_data))
419 tree_ents = sorted(tree_ents, key=lambda x: x[0])
420 for ent in tree_ents:
423 def tree_items(oid, tree_data, names=frozenset(), bupm=None):
425 def tree_item(ent_oid, kind, gitmode):
426 if kind == BUP_CHUNKED:
427 meta = Metadata.read(bupm) if bupm else default_file_mode
428 return Chunky(oid=ent_oid, meta=meta)
431 # No metadata here (accessable via '.' inside ent_oid).
432 return Item(meta=default_dir_mode, oid=ent_oid)
434 return Item(oid=ent_oid,
435 meta=(Metadata.read(bupm) if bupm \
436 else _default_mode_for_gitmode(gitmode)))
438 assert len(oid) == 20
440 dot_meta = _read_dir_meta(bupm) if bupm else default_dir_mode
441 yield '.', Item(oid=oid, meta=dot_meta)
442 tree_entries = ordered_tree_entries(tree_data, bupm)
443 for name, mangled_name, kind, gitmode, ent_oid in tree_entries:
444 if mangled_name == '.bupm':
447 yield name, tree_item(ent_oid, kind, gitmode)
450 # Assumes the tree is properly formed, i.e. there are no
451 # duplicates, and entries will be in git tree order.
452 if type(names) not in (frozenset, set):
453 names = frozenset(names)
454 remaining = len(names)
456 # Account for the bupm sort order issue (cf. ordered_tree_entries above)
457 last_name = max(names) if bupm else max(names) + '/'
460 dot_meta = _read_dir_meta(bupm) if bupm else default_dir_mode
461 yield '.', Item(oid=oid, meta=dot_meta)
466 tree_entries = ordered_tree_entries(tree_data, bupm)
467 for name, mangled_name, kind, gitmode, ent_oid in tree_entries:
468 if mangled_name == '.bupm':
471 if name not in names:
473 break # given bupm sort order, we're finished
474 if (kind == BUP_CHUNKED or not S_ISDIR(gitmode)) and bupm:
477 yield name, tree_item(ent_oid, kind, gitmode)
482 def tree_items_with_meta(repo, oid, tree_data, names):
483 # For now, the .bupm order doesn't quite match git's, and we don't
484 # load the tree data incrementally anyway, so we just work in RAM
486 assert len(oid) == 20
488 for _, mangled_name, sub_oid in tree_decode(tree_data):
489 if mangled_name == '.bupm':
490 bupm = _FileReader(repo, sub_oid)
492 if mangled_name > '.bupm':
494 for item in tree_items(oid, tree_data, names, bupm):
497 _save_name_rx = re.compile(r'^\d\d\d\d-\d\d-\d\d-\d{6}(-\d+)?$')
499 def _reverse_suffix_duplicates(strs):
500 """Yields the elements of strs, with any runs of duplicate values
501 suffixed with -N suffixes, where the zero padded integer N
502 decreases to 0 by 1 (e.g. 10, 09, ..., 00).
505 for name, duplicates in groupby(strs):
506 ndup = len(tuple(duplicates))
510 ndig = len(str(ndup - 1))
511 fmt = '%s-' + '%0' + str(ndig) + 'd'
512 for i in xrange(ndup - 1, -1, -1):
513 yield fmt % (name, i)
515 def _name_for_rev(rev):
516 commit, (tree_oidx, utc) = rev
517 assert len(commit) == 40
518 return strftime('%Y-%m-%d-%H%M%S', localtime(utc))
520 def _item_for_rev(rev):
521 commit, (tree_oidx, utc) = rev
522 assert len(tree_oidx) == 40
523 return Item(meta=default_dir_mode, oid=tree_oidx.decode('hex'))
525 def revlist_items(repo, oid, names):
526 assert len(oid) == 20
527 oidx = oid.encode('hex')
528 names = frozenset(name for name in (names or tuple()) \
529 if _save_name_rx.match(name) or name in ('.', 'latest'))
531 # Do this before we open the rev_list iterator so we're not nesting
532 if (not names) or ('.' in names):
533 yield '.', RevList(oid=oid, meta=_commit_meta_from_oidx(repo, oidx))
535 revs = repo.rev_list((oidx,), format='%T %at', parse=parse_rev_auth_secs)
536 first_rev = next(revs, None)
537 revs = chain((first_rev,), revs)
538 rev_items, rev_names = tee(revs)
539 revs = None # Don't disturb the tees
540 rev_names = _reverse_suffix_duplicates(_name_for_rev(x) for x in rev_names)
541 rev_items = (_item_for_rev(x) for x in rev_items)
544 for item in rev_items:
545 yield next(rev_names), item
546 yield 'latest', _item_for_rev(first_rev)
549 # Revs are in reverse chronological order by default
550 last_name = min(names)
551 for item in rev_items:
552 name = next(rev_names) # Might have -N dup suffix
555 if not name in names:
559 # FIXME: need real short circuit...
560 for _ in rev_items: pass
561 for _ in rev_names: pass
563 if 'latest' in names:
564 yield 'latest', _item_for_rev(first_rev)
566 def tags_items(repo, names):
570 assert len(oid) == 20
571 oidx = oid.encode('hex')
573 _, typ, size = next(it)
575 tree_oid = parse_commit(''.join(it)).tree.decode('hex')
576 assert len(tree_oid) == 20
577 # FIXME: more efficient/bulk?
578 return RevList(meta=_commit_meta_from_oidx(repo, oidx), oid=oid)
581 return Item(meta=default_file_mode, oid=oid)
583 return Item(meta=default_dir_mode, oid=oid)
584 raise Exception('unexpected tag type ' + typ + ' for tag ' + name)
588 # We have to pull these all into ram because tag_item calls cat()
589 for name, oid in tuple(repo.refs(names, limit_to_tags=True)):
590 assert(name.startswith('refs/tags/'))
592 yield name, tag_item(oid)
595 # Assumes no duplicate refs
596 if type(names) not in (frozenset, set):
597 names = frozenset(names)
598 remaining = len(names)
599 last_name = max(names)
606 for name, oid in repo.refs(names, limit_to_tags=True):
607 assert(name.startswith('refs/tags/'))
611 if name not in names:
613 yield name, tag_item(oid)
618 def contents(repo, item, names=None, want_meta=True):
619 """Yields information about the items contained in item. Yields
620 (name, item) for each name in names, if the name exists, in an
621 unspecified order. If there are no names, then yields (name,
622 item) for all items, including, a first item named '.'
623 representing the container itself.
625 Note that want_meta is advisory. For any given item, item.meta
626 might be a Metadata instance or a mode, and if the former,
627 meta.size might be None. Missing sizes can be computed via via
628 item_size() or augment_item_meta(..., include_size=True).
630 Do not modify any item.meta Metadata instances directly. If
631 needed, make a copy via item.meta.copy() and modify that instead.
634 # Q: are we comfortable promising '.' first when no names?
636 assert S_ISDIR(item_mode(item))
639 it = repo.cat(item.oid.encode('hex'))
640 _, obj_type, size = next(it)
642 if obj_type == 'tree':
644 item_gen = tree_items_with_meta(repo, item.oid, data, names)
646 item_gen = tree_items(item.oid, data, names)
647 elif obj_type == 'commit':
649 item_gen = tree_items_with_meta(repo, item.oid, tree_data, names)
651 item_gen = tree_items(item.oid, tree_data, names)
654 raise Exception('unexpected git ' + obj_type)
655 elif item_t == RevList:
656 item_gen = revlist_items(repo, item.oid, names)
658 item_gen = root_items(repo, names)
660 item_gen = tags_items(repo, names)
662 raise Exception('unexpected VFS item ' + str(item))
666 def _resolve_path(repo, path, parent=None, want_meta=True, deref=False):
670 future = _decompose_path(path)
672 if path.startswith('/'):
675 if future == ['']: # path was effectively '/'
677 if not past and not parent:
684 segment = future.pop()
686 if len(past) > 1: # .. from / is /
689 parent_name, parent_item = past[-1]
690 wanted = (segment,) if not want_meta else ('.', segment)
691 items = tuple(contents(repo, parent_item, names=wanted,
692 want_meta=want_meta))
694 item = items[0][1] if items else None
695 else: # First item will be '.' and have the metadata
696 item = items[1][1] if len(items) == 2 else None
697 dot, dot_item = items[0]
699 past[-1] = parent_name, parent_item
701 past.append((segment, None),)
703 mode = item_mode(item)
704 if not S_ISLNK(mode):
705 if not S_ISDIR(mode):
707 past.append((segment, item),)
710 if want_meta and type(item) == Item:
711 dir_meta = _find_dir_item_metadata(repo, item)
713 item = item._replace(meta=dir_meta)
715 past.append((segment, item),)
717 past.append((segment, item))
719 if not future and not deref:
720 past.append((segment, item),)
722 target = readlink(repo, item)
723 target_future = _decompose_path(target)
724 if target.startswith('/'):
725 future = target_future
727 if target_future == ['']: # path was effectively '/'
730 future.extend(target_future)
733 raise Loop('too many symlinks encountered while resolving %r%s'
735 'relative to %r' % parent if parent else ''))
737 def lresolve(repo, path, parent=None, want_meta=True):
738 """Perform exactly the same function as resolve(), except if the
739 final path element is a symbolic link, don't follow it, just
740 return it in the result."""
741 return _resolve_path(repo, path, parent=parent, want_meta=want_meta,
745 def resolve(repo, path, parent=None, want_meta=True):
746 """Follow the path in the virtual filesystem and return a tuple
747 representing the location, if any, denoted by the path. Each
748 element in the result tuple will be (name, info), where info will
749 be a VFS item that can be passed to functions like item_mode().
751 If a path segment that does not exist is encountered during
752 resolution, the result will represent the location of the missing
753 item, and that item in the result will be None.
755 Any symlinks along the path, including at the end, will be
756 resolved. A Loop exception will be raised if too many symlinks
757 are traversed whiile following the path. raised if too many
758 symlinks are traversed while following the path. That exception
759 is effectively like a normal ELOOP IOError exception, but will
760 include a terminus element describing the location of the failure,
761 which will be a tuple of (name, info) elements.
763 Currently, a path ending in '/' will still resolve if it exists,
764 even if not a directory. The parent, if specified, must be a
765 (name, item) tuple, and will provide the starting point for the
766 resolution of the path. Currently, the path must be relative when
767 a parent is provided. The result may include parent directly, so
768 it must not be modified later. If this is a concern, pass in
769 copy_item(parent) instead.
771 When want_meta is true, detailed metadata will be included in each
772 result item if it's avaiable, otherwise item.meta will be an
773 integer mode. The metadata size may or may not be provided, but
774 can be computed by item_size() or augment_item_meta(...,
775 include_size=True). Setting want_meta=False is rarely desirable
776 since it can limit the VFS to just the metadata git itself can
777 represent, and so, as an example, fifos and sockets will appear to
778 be regular files (e.g. S_ISREG(item_mode(item)) will be true) .
779 But the option is provided because it may be more efficient when
780 only the path names or the more limited metadata is sufficient.
782 Do not modify any item.meta Metadata instances directly. If
783 needed, make a copy via item.meta.copy() and modify that instead.
786 return _resolve_path(repo, path, parent=parent, want_meta=want_meta,