From: Avery Pennarun Date: Fri, 23 Apr 2010 23:18:10 +0000 (-0400) Subject: vfs: take advantage of bup chunking to make file seeking faster. X-Git-Tag: bup-0.14^0 X-Git-Url: https://arthur.barton.de/gitweb/?a=commitdiff_plain;h=0e3565afb1cc11683eae5107d80d7d7ef66ed34c;p=bup.git vfs: take advantage of bup chunking to make file seeking faster. If you have a huge file, you can now seek around inside it (eg. in 'bup fuse') without having to read its entire contents. Calculating the file size is also really fast now. This makes a bup fuse-mounted filesystem much more useful for real-time access. For example, I was able to connect to an sqlite3 database and have it work at a reasonable speed. (Obviously, since 'bup fuse' is written in python and doesn't currently support threading, the speed could still be improved, but at least it's no longer godawful.) --- diff --git a/cmd/fuse-cmd.py b/cmd/fuse-cmd.py index 5d6fd96..6488097 100755 --- a/cmd/fuse-cmd.py +++ b/cmd/fuse-cmd.py @@ -93,7 +93,9 @@ class BupFs(fuse.Fuse): def read(self, path, size, offset): log('--read(%r)\n' % path) n = cache_get(self.top, path) - return n.readbytes(offset, size) + o = n.open() + o.seek(offset) + return o.read(size) if not hasattr(fuse, '__version__'): diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py index 91566da..c8f688a 100644 --- a/lib/bup/vfs.py +++ b/lib/bup/vfs.py @@ -23,13 +23,103 @@ class TooManySymlinks(NodeError): pass -class FileReader: - def __init__(self, node): - self.n = node +def _treeget(hash): + it = cp().get(hash.encode('hex')) + type = it.next() + assert(type == 'tree') + return git._treeparse(''.join(it)) + + +def _tree_decode(hash): + tree = [(int(name,16),stat.S_ISDIR(int(mode,8)),sha) + for (mode,name,sha) + in _treeget(hash)] + assert(tree == list(sorted(tree))) + return tree + + +def _chunk_len(hash): + return sum(len(b) for b in cp().join(hash.encode('hex'))) + + +def _last_chunk_info(hash): + tree = _tree_decode(hash) + assert(tree) + (ofs,isdir,sha) = tree[-1] + if isdir: + (subofs, sublen) = _last_chunk_info(sha) + return (ofs+subofs, sublen) + else: + return (ofs, _chunk_len(sha)) + + +def _total_size(hash): + (lastofs, lastsize) = _last_chunk_info(hash) + return lastofs + lastsize + + +def _chunkiter(hash, startofs): + assert(startofs >= 0) + tree = _tree_decode(hash) + + # skip elements before startofs + for i in xrange(len(tree)): + if i+1 >= len(tree) or tree[i+1][0] > startofs: + break + first = i + + # iterate through what's left + for i in xrange(first, len(tree)): + (ofs,isdir,sha) = tree[i] + skipmore = startofs-ofs + if skipmore < 0: + skipmore = 0 + if isdir: + for b in _chunkiter(sha, skipmore): + yield b + else: + yield ''.join(cp().join(sha.encode('hex')))[skipmore:] + + +class _ChunkReader: + def __init__(self, hash, isdir, startofs): + if isdir: + self.it = _chunkiter(hash, startofs) + self.blob = None + else: + self.it = None + self.blob = cp().get(hash.encode('hex')) + + def next(self, size): + out = '' + while len(out) < size: + if self.it and not self.blob: + try: + self.blob = self.it.next() + except StopIteration: + self.it = None + if self.blob: + want = size - len(out) + out += self.blob[:want] + self.blob = self.blob[want:] + if not self.it: + break + log('next(%d) returned %d\n' % (size, len(out))) + return out + + +class _FileReader: + def __init__(self, hash, size, isdir): + self.hash = hash self.ofs = 0 - self.size = self.n.size() + self.size = size + self.isdir = isdir + self.reader = None def seek(self, ofs): + if self.ofs == ofs: + return + self.reader = None if ofs > self.size: self.ofs = self.size elif ofs < 0: @@ -43,7 +133,13 @@ class FileReader: def read(self, count = -1): if count < 0: count = self.size - self.ofs - buf = self.n.readbytes(self.ofs, count) + if not self.reader: + self.reader = _ChunkReader(self.hash, self.isdir, self.ofs) + try: + buf = self.reader.next(count) + except: + self.reader = None + raise # our offsets will be all screwed up otherwise self.ofs += len(buf) return buf @@ -131,48 +227,45 @@ class Node: def open(self): raise NotFile('%s is not a regular file' % self.name) - - def readbytes(self, ofs, count): - raise NotFile('%s is not a regular file' % self.name) - - def read(self, num = -1): - if num < 0: - num = self.size() - return self.readbytes(0, num) - - + + class File(Node): - def __init__(self, parent, name, mode, hash): + def __init__(self, parent, name, mode, hash, bupmode): Node.__init__(self, parent, name, mode, hash) + self.bupmode = bupmode self._cached_size = None + self._filereader = None - def _content(self): - return cp().join(self.hash.encode('hex')) - def open(self): - return FileReader(self) + # You'd think FUSE might call this only once each time a file is + # opened, but no; it's really more of a refcount, and it's called + # once per read(). Thus, it's important to cache the filereader + # object here so we're not constantly re-seeking. + if not self._filereader: + self._filereader = _FileReader(self.hash, self.size(), + self.bupmode == git.BUP_CHUNKED) + return self._filereader def size(self): - # FIXME inefficient. If a file is chunked, we could just check - # the offset + size of the very last chunk. if self._cached_size == None: - self._cached_size = sum(len(blob) for blob in self._content()) + log('<<<