From aeb8ce7d7974a4e6f472ae546d3baab90e1e8aa1 Mon Sep 17 00:00:00 2001
From: Rob Browning
Date: Mon, 3 Dec 2018 12:58:03 -0600
Subject: [PATCH] Don't return invalid data for offset reads (observed via fuse)

Fix a serious bug in the vfs that could cause it to return invalid
data for chunked file read()s that didn't start at the beginning of
the file.

The issue was first observed via fuse, which makes sense given that it
streams a file in chunks that (currently) each come from independent,
increasing seek-offset FileReaders.

The previous dropwhile() invocation in the _tree_chunks generator,
used to skip past chunks that were completely before the offset, was
simple but wrong, and would skip too far.  Replace it with
_skip_chunks_before_offset().

Add randomized tests of both simple streaming reads and seek-offset
reads, which catch the problem, cover additional cases, and should
prevent regressions.

Thanks to voldial for reporting the problem and providing an easy way
to reproduce it.

Signed-off-by: Rob Browning
Tested-by: Rob Browning
---
 lib/bup/t/tvfs.py | 98 +++++++++++++++++++++++++++++++++++++++++++++++
 lib/bup/vfs.py    | 17 +++++++-
 2 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/lib/bup/t/tvfs.py b/lib/bup/t/tvfs.py
index d8670a7..3000992 100644
--- a/lib/bup/t/tvfs.py
+++ b/lib/bup/t/tvfs.py
@@ -4,12 +4,14 @@ from collections import namedtuple
 from errno import ELOOP, ENOTDIR
 from io import BytesIO
 from os import environ, symlink
+from random import Random, randint
 from stat import S_IFDIR, S_IFREG, S_ISDIR, S_ISREG
 from sys import stderr
 from time import localtime, strftime
 
 from wvtest import *
 
+from bup._helpers import write_random
 from bup import git, metadata, vfs
 from bup.git import BUP_CHUNKED
 from bup.helpers import exc, exo, shstr
@@ -500,6 +502,102 @@ def test_resolve():
         wvpasseq(4, len(res))
         wvpasseq(expected, res)
 
+def write_sized_random_content(parent_dir, size, seed):
+    verbose = 0
+    with open('%s/%d' % (parent_dir, size), 'wb') as f:
+        write_random(f.fileno(), size, seed, verbose)
+
+def validate_vfs_streaming_read(repo, item, expected_path, read_sizes):
+    for read_size in read_sizes:
+        with open(expected_path, 'rb') as expected:
+            with vfs.fopen(repo, item) as actual:
+                ex_buf = expected.read(read_size)
+                act_buf = actual.read(read_size)
+                while ex_buf and act_buf:
+                    wvpassge(read_size, len(ex_buf))
+                    wvpassge(read_size, len(act_buf))
+                    wvpasseq(len(ex_buf), len(act_buf))
+                    wvpass(ex_buf == act_buf)
+                    ex_buf = expected.read(read_size)
+                    act_buf = actual.read(read_size)
+                wvpasseq('', ex_buf)
+                wvpasseq('', act_buf)
+
+def validate_vfs_seeking_read(repo, item, expected_path, read_sizes):
+    def read_act(act_pos):
+        with vfs.fopen(repo, item) as actual:
+            actual.seek(act_pos)
+            wvpasseq(act_pos, actual.tell())
+            act_buf = actual.read(read_size)
+            act_pos += len(act_buf)
+            wvpasseq(act_pos, actual.tell())
+            return act_pos, act_buf
+
+    for read_size in read_sizes:
+        with open(expected_path, 'rb') as expected:
+            ex_buf = expected.read(read_size)
+            act_buf = None
+            act_pos = 0
+            while ex_buf:
+                act_pos, act_buf = read_act(act_pos)
+                wvpassge(read_size, len(ex_buf))
+                wvpassge(read_size, len(act_buf))
+                wvpasseq(len(ex_buf), len(act_buf))
+                wvpass(ex_buf == act_buf)
+                if not act_buf:
+                    break
+                ex_buf = expected.read(read_size)
+            else:  # hit expected eof first
+                act_pos, act_buf = read_act(act_pos)
+            wvpasseq('', ex_buf)
+            wvpasseq('', act_buf)
+
+@wvtest
+def test_read_and_seek():
+    # Write a set of randomly sized files containing random data whose
+    # names are their sizes, and then verify that what we get back
+    # from the vfs when seeking and reading with various block sizes
+    # matches the original content.
+    with no_lingering_errors():
+        with test_tempdir('bup-tvfs-read-') as tmpdir:
+            resolve = vfs.resolve
+            bup_dir = tmpdir + '/bup'
+            environ['GIT_DIR'] = bup_dir
+            environ['BUP_DIR'] = bup_dir
+            git.repodir = bup_dir
+            repo = LocalRepo()
+            data_path = tmpdir + '/src'
+            os.mkdir(data_path)
+            seed = randint(-(1 << 31), (1 << 31) - 1)
+            rand = Random()
+            rand.seed(seed)
+            print('test_read seed:', seed, file=sys.stderr)
+            max_size = 2 * 1024 * 1024
+            sizes = set((rand.randint(1, max_size) for _ in xrange(5)))
+            sizes.add(1)
+            sizes.add(max_size)
+            for size in sizes:
+                write_sized_random_content(data_path, size, seed)
+            ex((bup_path, 'init'))
+            ex((bup_path, 'index', '-v', data_path))
+            ex((bup_path, 'save', '-d', '100000', '-tvvn', 'test', '--strip',
+                data_path))
+            read_sizes = set((rand.randint(1, max_size) for _ in xrange(10)))
+            read_sizes.add(1)
+            read_sizes.add(max_size)
+            print('test_read src sizes:', sizes, file=sys.stderr)
+            print('test_read read sizes:', read_sizes, file=sys.stderr)
+            for size in sizes:
+                res = resolve(repo, '/test/latest/' + str(size))
+                _, item = res[-1]
+                wvpasseq(size, vfs.item_size(repo, res[-1][1]))
+                validate_vfs_streaming_read(repo, item,
+                                            '%s/%d' % (data_path, size),
+                                            read_sizes)
+                validate_vfs_seeking_read(repo, item,
+                                          '%s/%d' % (data_path, size),
+                                          read_sizes)
+
 @wvtest
 def test_resolve_loop():
     with no_lingering_errors():
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index b84e4c8..99ca85d 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -94,12 +94,25 @@ def _normal_or_chunked_file_size(repo, oid):
     _, obj_t, size = next(it)
     return ofs + sum(len(b) for b in it)
 
+def _skip_chunks_before_offset(tree, offset):
+    prev_ent = next(tree, None)
+    if not prev_ent:
+        return tree
+    ent = None
+    for ent in tree:
+        ent_ofs = int(ent[1], 16)
+        if ent_ofs > offset:
+            return chain([prev_ent, ent], tree)
+        if ent_ofs == offset:
+            return chain([ent], tree)
+        prev_ent = ent
+    return [prev_ent]
+
 def _tree_chunks(repo, tree, startofs):
     "Tree should be a sequence of (name, mode, hash) as per tree_decode()."
     assert(startofs >= 0)
     # name is the chunk's hex offset in the original file
-    tree = dropwhile(lambda (_1, name, _2): int(name, 16) < startofs, tree)
-    for mode, name, oid in tree:
+    for mode, name, oid in _skip_chunks_before_offset(tree, startofs):
         ofs = int(name, 16)
         skipmore = startofs - ofs
         if skipmore < 0:
-- 
2.39.2
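
Illustration (editor's note, not part of the commit): a minimal,
standalone sketch of the bug the patch fixes.  Chunked-file tree
entries are named by each chunk's hex start offset, so a read starting
at startofs must begin with the last chunk whose offset is <= startofs;
dropwhile() discarded that chunk too unless some chunk started exactly
at startofs.  The chunk offsets, modes, and placeholder oids below are
invented for the example; only the skipping logic mirrors the patch's
_skip_chunks_before_offset().

    from itertools import chain, dropwhile

    # (mode, name, oid) entries as from tree_decode(); name is the
    # chunk's hex start offset.  Sizes and oids here are hypothetical.
    chunks = [(0o100644, '0', 'oid0'),     # bytes 0x0000-0x0fff
              (0o100644, '1000', 'oid1'),  # bytes 0x1000-0x1fff
              (0o100644, '2000', 'oid2')]  # bytes 0x2000-...

    startofs = 0x1800  # falls inside the chunk named '1000'

    # Old approach: drops every chunk whose offset is < startofs,
    # including the chunk that *contains* startofs, so the read would
    # wrongly begin with the bytes stored at 0x2000.
    old = list(dropwhile(lambda ent: int(ent[1], 16) < startofs,
                         iter(chunks)))
    assert [ent[1] for ent in old] == ['2000']  # '1000' was lost

    # Fixed approach (same logic as the patch): remember the previous
    # entry so the chunk spanning startofs survives the skip.
    def skip_chunks_before_offset(tree, offset):
        prev_ent = next(tree, None)
        if not prev_ent:
            return tree
        ent = None
        for ent in tree:
            ent_ofs = int(ent[1], 16)
            if ent_ofs > offset:
                return chain([prev_ent, ent], tree)
            if ent_ofs == offset:
                return chain([ent], tree)
            prev_ent = ent
        return [prev_ent]

    new = list(skip_chunks_before_offset(iter(chunks), startofs))
    assert [ent[1] for ent in new] == ['1000', '2000']  # chunk kept

With the containing chunk preserved, the skipmore = startofs - ofs
computed in _tree_chunks() stays >= 0 for the first chunk it sees, so
it can trim the leading bytes rather than returning data from the
wrong offset.  (The sketch indexes the tuples instead of using the
removed lambda's Python 2 tuple-parameter unpacking, so it runs under
Python 2 or 3.)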