vfs: take advantage of bup chunking to make file seeking faster. (tag: bup-0.14)
author    Avery Pennarun <apenwarr@gmail.com>
          Fri, 23 Apr 2010 23:18:10 +0000 (19:18 -0400)
committer Avery Pennarun <apenwarr@gmail.com>
          Fri, 23 Apr 2010 23:27:51 +0000 (19:27 -0400)
If you have a huge file, you can now seek around inside it (e.g. in 'bup
fuse') without having to read its entire contents.  Calculating the file
size is also really fast now.
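
To illustrate the idea: a chunked file's git tree maps byte offsets to
chunk blobs, so finding the chunk that holds a given offset is a search
over metadata, with no file data read at all.  A minimal, hypothetical
sketch (the (offset, sha) pairs here stand in for the (ofs, isdir, sha)
triples that _tree_decode() in the diff below produces; find_chunk and
its parameters are illustrative names, not part of the patch):

    import bisect

    def find_chunk(tree, pos):
        # tree: [(start_offset, sha), ...], sorted by start_offset
        starts = [ofs for (ofs, _) in tree]
        i = bisect.bisect_right(starts, pos) - 1  # last chunk starting at or before pos
        assert pos >= 0 and i >= 0
        ofs, sha = tree[i]
        return sha, pos - ofs  # which blob to fetch, and how far to skip into it

    # find_chunk([(0, 'a'), (4096, 'b'), (9000, 'c')], 5000) == ('b', 904)

(The actual code below walks the tree linearly and recurses into
subtrees, since chunk trees can be nested.)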

This makes a bup fuse-mounted filesystem much more useful for real-time
access.  For example, I was able to connect to an sqlite3 database and have
it work at a reasonable speed.  (Obviously, since 'bup fuse' is written in
python and doesn't currently support threading, the speed could still be
improved, but at least it's no longer godawful.)

cmd/fuse-cmd.py
lib/bup/vfs.py

index 5d6fd96684f9500808549380523760ec9c2f682a..64880975d861eb71be80ce34e15a6ea8b853a718 100755 (executable)
@@ -93,7 +93,9 @@ class BupFs(fuse.Fuse):
     def read(self, path, size, offset):
         log('--read(%r)\n' % path)
         n = cache_get(self.top, path)
-        return n.readbytes(offset, size)
+        o = n.open()
+        o.seek(offset)
+        return o.read(size)
 
 
 if not hasattr(fuse, '__version__'):
index 91566da1dcc29a5fad934a055259694094b13c8e..c8f688a9c33f096311dabbcbda1bee2a595328ec 100644 (file)
@@ -23,13 +23,103 @@ class TooManySymlinks(NodeError):
     pass
 
 
-class FileReader:
-    def __init__(self, node):
-        self.n = node
+def _treeget(hash):
+    it = cp().get(hash.encode('hex'))
+    type = it.next()
+    assert(type == 'tree')
+    return git._treeparse(''.join(it))
+
+
+def _tree_decode(hash):
+    tree = [(int(name,16),stat.S_ISDIR(int(mode,8)),sha)
+            for (mode,name,sha)
+            in _treeget(hash)]
+    assert(tree == list(sorted(tree)))
+    return tree
+
+
+def _chunk_len(hash):
+    return sum(len(b) for b in cp().join(hash.encode('hex')))
+
+
+def _last_chunk_info(hash):
+    tree = _tree_decode(hash)
+    assert(tree)
+    (ofs,isdir,sha) = tree[-1]
+    if isdir:
+        (subofs, sublen) = _last_chunk_info(sha)
+        return (ofs+subofs, sublen)
+    else:
+        return (ofs, _chunk_len(sha))
+
+
+def _total_size(hash):
+    (lastofs, lastsize) = _last_chunk_info(hash)
+    return lastofs + lastsize
+
+
+def _chunkiter(hash, startofs):
+    assert(startofs >= 0)
+    tree = _tree_decode(hash)
+
+    # skip elements before startofs
+    for i in xrange(len(tree)):
+        if i+1 >= len(tree) or tree[i+1][0] > startofs:
+            break
+    first = i
+
+    # iterate through what's left
+    for i in xrange(first, len(tree)):
+        (ofs,isdir,sha) = tree[i]
+        skipmore = startofs-ofs
+        if skipmore < 0:
+            skipmore = 0
+        if isdir:
+            for b in _chunkiter(sha, skipmore):
+                yield b
+        else:
+            yield ''.join(cp().join(sha.encode('hex')))[skipmore:]
+
+
+class _ChunkReader:
+    def __init__(self, hash, isdir, startofs):
+        if isdir:
+            self.it = _chunkiter(hash, startofs)
+            self.blob = None
+        else:
+            self.it = None
+            self.blob = ''.join(cp().join(hash.encode('hex')))[startofs:]
+
+    def next(self, size):
+        out = ''
+        while len(out) < size:
+            if self.it and not self.blob:
+                try:
+                    self.blob = self.it.next()
+                except StopIteration:
+                    self.it = None
+            if self.blob:
+                want = size - len(out)
+                out += self.blob[:want]
+                self.blob = self.blob[want:]
+            if not self.it:
+                break
+        log('next(%d) returned %d\n' % (size, len(out)))
+        return out
+
+
+class _FileReader:
+    def __init__(self, hash, size, isdir):
+        self.hash = hash
         self.ofs = 0
-        self.size = self.n.size()
+        self.size = size
+        self.isdir = isdir
+        self.reader = None
 
     def seek(self, ofs):
+        if self.ofs == ofs:
+            return
+        self.reader = None
         if ofs > self.size:
             self.ofs = self.size
         elif ofs < 0:
@@ -43,7 +133,13 @@ class FileReader:
     def read(self, count = -1):
         if count < 0:
             count = self.size - self.ofs
-        buf = self.n.readbytes(self.ofs, count)
+        if not self.reader:
+            self.reader = _ChunkReader(self.hash, self.isdir, self.ofs)
+        try:
+            buf = self.reader.next(count)
+        except:
+            self.reader = None
+            raise  # our offsets will be all screwed up otherwise
         self.ofs += len(buf)
         return buf
 
@@ -131,48 +227,45 @@ class Node:
 
     def open(self):
         raise NotFile('%s is not a regular file' % self.name)
-    
-    def readbytes(self, ofs, count):
-        raise NotFile('%s is not a regular file' % self.name)
-    
-    def read(self, num = -1):
-        if num < 0:
-            num = self.size()
-        return self.readbytes(0, num)
-    
-    
+
+
 class File(Node):
-    def __init__(self, parent, name, mode, hash):
+    def __init__(self, parent, name, mode, hash, bupmode):
         Node.__init__(self, parent, name, mode, hash)
+        self.bupmode = bupmode
         self._cached_size = None
+        self._filereader = None
         
-    def _content(self):
-        return cp().join(self.hash.encode('hex'))
-
     def open(self):
-        return FileReader(self)
+        # You'd think FUSE might call this only once each time a file is
+        # opened, but no; it's really more of a refcount, and it's called
+        # once per read().  Thus, it's important to cache the filereader
+        # object here so we're not constantly re-seeking.
+        if not self._filereader:
+            self._filereader = _FileReader(self.hash, self.size(),
+                                           self.bupmode == git.BUP_CHUNKED)
+        return self._filereader
     
     def size(self):
-        # FIXME inefficient.  If a file is chunked, we could just check
-        # the offset + size of the very last chunk.
         if self._cached_size == None:
-            self._cached_size = sum(len(blob) for blob in self._content())
+            log('<<<<File.size() is calculating\n')
+            if self.bupmode == git.BUP_CHUNKED:
+                self._cached_size = _total_size(self.hash)
+            else:
+                self._cached_size = _chunk_len(self.hash)
         return self._cached_size
-    
-    def readbytes(self, ofs, count):
-        # FIXME inefficient.  If a file is chunked, we could choose to
-        # read only the required chunks rather than joining the whole thing.
-        buf = ''.join(self._content())
-        return buf[ofs:ofs+count]
-    
+
 
 _symrefs = 0
 class Symlink(File):
-    def __init__(self, parent, name, hash):
-        File.__init__(self, parent, name, 0120000, hash)
+    def __init__(self, parent, name, hash, bupmode):
+        File.__init__(self, parent, name, 0120000, hash, bupmode)
+
+    def size(self):
+        return len(self.readlink())
 
     def readlink(self):
-        return self.read(1024)
+        return ''.join(cp().join(self.hash.encode('hex')))
 
     def dereference(self):
         global _symrefs
@@ -191,10 +284,10 @@ class Symlink(File):
 
 class FakeSymlink(Symlink):
     def __init__(self, parent, name, toname):
-        Symlink.__init__(self, parent, name, EMPTY_SHA)
+        Symlink.__init__(self, parent, name, EMPTY_SHA, git.BUP_NORMAL)
         self.toname = toname
         
-    def _content(self):
+    def readlink(self):
         return self.toname
     
 
@@ -217,9 +310,9 @@ class Dir(Node):
             if stat.S_ISDIR(mode):
                 self._subs[name] = Dir(self, name, mode, sha)
             elif stat.S_ISLNK(mode):
-                self._subs[name] = Symlink(self, name, sha)
+                self._subs[name] = Symlink(self, name, sha, bupmode)
             else:
-                self._subs[name] = File(self, name, mode, sha)
+                self._subs[name] = File(self, name, mode, sha, bupmode)
                 
 
 class CommitList(Node):
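
For reference, the call pattern this commit enables (mirroring the
read() change in cmd/fuse-cmd.py above) is plain seek-then-read; here
`node` is assumed to be any vfs File node and is not a name from this
diff:

    f = node.open()        # returns the cached _FileReader
    f.seek(1024 * 1024)    # adjusts an offset only; reads no file data
    data = f.read(4096)    # fetches just the chunks overlapping this range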