From aeb8ce7d7974a4e6f472ae546d3baab90e1e8aa1 Mon Sep 17 00:00:00 2001
From: Rob Browning
Date: Mon, 3 Dec 2018 12:58:03 -0600
Subject: [PATCH] Don't return invalid data for offset reads (observed via fuse)

Fix a serious bug in the vfs that could cause it to return invalid
data for chunked file read()s that didn't start at the beginning of
the file.

The issue was first observed via fuse, which makes sense given that it
streams a file in chunks that (currently) each come from independent,
increasing seek-offset FileReaders.

The previous dropwhile() invocation in the _tree_chunks generator,
used to skip past chunks that were completely before the offset, was
simple but wrong, and would skip too far.  Replace it with
_skip_chunks_before_offset().

Add randomized tests of both simple streaming reads and seek-offset
reads, which catch the problem, cover additional cases, and should
prevent regressions.

Thanks to voldial for reporting the problem and providing an easy way
to reproduce it.

Signed-off-by: Rob Browning
Tested-by: Rob Browning
---
 lib/bup/t/tvfs.py | 98 +++++++++++++++++++++++++++++++++++++++++++++++
 lib/bup/vfs.py    | 17 +++++++-
 2 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/lib/bup/t/tvfs.py b/lib/bup/t/tvfs.py
index d8670a7..3000992 100644
--- a/lib/bup/t/tvfs.py
+++ b/lib/bup/t/tvfs.py
@@ -4,12 +4,14 @@ from collections import namedtuple
 from errno import ELOOP, ENOTDIR
 from io import BytesIO
 from os import environ, symlink
+from random import Random, randint
 from stat import S_IFDIR, S_IFREG, S_ISDIR, S_ISREG
 from sys import stderr
 from time import localtime, strftime
 
 from wvtest import *
 
+from bup._helpers import write_random
 from bup import git, metadata, vfs
 from bup.git import BUP_CHUNKED
 from bup.helpers import exc, exo, shstr
@@ -500,6 +502,102 @@ def test_resolve():
         wvpasseq(4, len(res))
         wvpasseq(expected, res)
 
+def write_sized_random_content(parent_dir, size, seed):
+    verbose = 0
+    with open('%s/%d' % (parent_dir, size), 'wb') as f:
+        write_random(f.fileno(), size, seed, verbose)
+
+def validate_vfs_streaming_read(repo, item, expected_path, read_sizes):
+    for read_size in read_sizes:
+        with open(expected_path, 'rb') as expected:
+            with vfs.fopen(repo, item) as actual:
+                ex_buf = expected.read(read_size)
+                act_buf = actual.read(read_size)
+                while ex_buf and act_buf:
+                    wvpassge(read_size, len(ex_buf))
+                    wvpassge(read_size, len(act_buf))
+                    wvpasseq(len(ex_buf), len(act_buf))
+                    wvpass(ex_buf == act_buf)
+                    ex_buf = expected.read(read_size)
+                    act_buf = actual.read(read_size)
+                wvpasseq('', ex_buf)
+                wvpasseq('', act_buf)
+
+def validate_vfs_seeking_read(repo, item, expected_path, read_sizes):
+    def read_act(act_pos):
+        with vfs.fopen(repo, item) as actual:
+            actual.seek(act_pos)
+            wvpasseq(act_pos, actual.tell())
+            act_buf = actual.read(read_size)
+            act_pos += len(act_buf)
+            wvpasseq(act_pos, actual.tell())
+            return act_pos, act_buf
+
+    for read_size in read_sizes:
+        with open(expected_path, 'rb') as expected:
+            ex_buf = expected.read(read_size)
+            act_buf = None
+            act_pos = 0
+            while ex_buf:
+                act_pos, act_buf = read_act(act_pos)
+                wvpassge(read_size, len(ex_buf))
+                wvpassge(read_size, len(act_buf))
+                wvpasseq(len(ex_buf), len(act_buf))
+                wvpass(ex_buf == act_buf)
+                if not act_buf:
+                    break
+                ex_buf = expected.read(read_size)
+            else:  # hit expected eof first
+                act_pos, act_buf = read_act(act_pos)
+            wvpasseq('', ex_buf)
+            wvpasseq('', act_buf)
+
+@wvtest
+def test_read_and_seek():
+    # Write a set of randomly sized files containing random data whose
+    # names are their sizes, and then verify that what we get back
+    # from the vfs when seeking and reading with various block sizes
+    # matches the original content.
+    with no_lingering_errors():
+        with test_tempdir('bup-tvfs-read-') as tmpdir:
+            resolve = vfs.resolve
+            bup_dir = tmpdir + '/bup'
+            environ['GIT_DIR'] = bup_dir
+            environ['BUP_DIR'] = bup_dir
+            git.repodir = bup_dir
+            repo = LocalRepo()
+            data_path = tmpdir + '/src'
+            os.mkdir(data_path)
+            seed = randint(-(1 << 31), (1 << 31) - 1)
+            rand = Random()
+            rand.seed(seed)
+            print('test_read seed:', seed, file=sys.stderr)
+            max_size = 2 * 1024 * 1024
+            sizes = set((rand.randint(1, max_size) for _ in xrange(5)))
+            sizes.add(1)
+            sizes.add(max_size)
+            for size in sizes:
+                write_sized_random_content(data_path, size, seed)
+            ex((bup_path, 'init'))
+            ex((bup_path, 'index', '-v', data_path))
+            ex((bup_path, 'save', '-d', '100000', '-tvvn', 'test', '--strip',
+                data_path))
+            read_sizes = set((rand.randint(1, max_size) for _ in xrange(10)))
+            read_sizes.add(1)
+            read_sizes.add(max_size)
+            print('test_read src sizes:', sizes, file=sys.stderr)
+            print('test_read read sizes:', read_sizes, file=sys.stderr)
+            for size in sizes:
+                res = resolve(repo, '/test/latest/' + str(size))
+                _, item = res[-1]
+                wvpasseq(size, vfs.item_size(repo, res[-1][1]))
+                validate_vfs_streaming_read(repo, item,
+                                            '%s/%d' % (data_path, size),
+                                            read_sizes)
+                validate_vfs_seeking_read(repo, item,
+                                          '%s/%d' % (data_path, size),
+                                          read_sizes)
+
 @wvtest
 def test_resolve_loop():
     with no_lingering_errors():
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index b84e4c8..99ca85d 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -94,12 +94,25 @@ def _normal_or_chunked_file_size(repo, oid):
     _, obj_t, size = next(it)
     return ofs + sum(len(b) for b in it)
 
+def _skip_chunks_before_offset(tree, offset):
+    prev_ent = next(tree, None)
+    if not prev_ent:
+        return tree
+    ent = None
+    for ent in tree:
+        ent_ofs = int(ent[1], 16)
+        if ent_ofs > offset:
+            return chain([prev_ent, ent], tree)
+        if ent_ofs == offset:
+            return chain([ent], tree)
+        prev_ent = ent
+    return [prev_ent]
+
 def _tree_chunks(repo, tree, startofs):
     "Tree should be a sequence of (name, mode, hash) as per tree_decode()."
     assert(startofs >= 0)
     # name is the chunk's hex offset in the original file
-    tree = dropwhile(lambda (_1, name, _2): int(name, 16) < startofs, tree)
-    for mode, name, oid in tree:
+    for mode, name, oid in _skip_chunks_before_offset(tree, startofs):
         ofs = int(name, 16)
         skipmore = startofs - ofs
         if skipmore < 0:
-- 
2.39.2
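
Illustration (editor's note, not part of the commit): a minimal,
standalone sketch of the bug the patch fixes.  Chunked-file tree
entries are named by each chunk's hex start offset, so a read starting
at startofs must begin with the last chunk whose offset is <= startofs;
dropwhile() discarded that chunk too unless some chunk started exactly
at startofs.  The chunk offsets, modes, and placeholder oids below are
invented for the example; only the skipping logic mirrors the patch's
_skip_chunks_before_offset().

    from itertools import chain, dropwhile

    # (mode, name, oid) entries as from tree_decode(); name is the
    # chunk's hex start offset.  Sizes and oids here are hypothetical.
    chunks = [(0o100644, '0', 'oid0'),     # bytes 0x0000-0x0fff
              (0o100644, '1000', 'oid1'),  # bytes 0x1000-0x1fff
              (0o100644, '2000', 'oid2')]  # bytes 0x2000-...

    startofs = 0x1800  # falls inside the chunk named '1000'

    # Old approach: drops every chunk whose offset is < startofs,
    # including the chunk that *contains* startofs, so the read would
    # wrongly begin with the bytes stored at 0x2000.
    old = list(dropwhile(lambda ent: int(ent[1], 16) < startofs,
                         iter(chunks)))
    assert [ent[1] for ent in old] == ['2000']  # '1000' was lost

    # Fixed approach (same logic as the patch): remember the previous
    # entry so the chunk spanning startofs survives the skip.
    def skip_chunks_before_offset(tree, offset):
        prev_ent = next(tree, None)
        if not prev_ent:
            return tree
        ent = None
        for ent in tree:
            ent_ofs = int(ent[1], 16)
            if ent_ofs > offset:
                return chain([prev_ent, ent], tree)
            if ent_ofs == offset:
                return chain([ent], tree)
            prev_ent = ent
        return [prev_ent]

    new = list(skip_chunks_before_offset(iter(chunks), startofs))
    assert [ent[1] for ent in new] == ['1000', '2000']  # chunk kept

With the containing chunk preserved, the skipmore = startofs - ofs
computed in _tree_chunks() stays >= 0 for the first chunk it sees, so
it can trim the leading bytes rather than returning data from the
wrong offset.  (The sketch indexes the tuples instead of using the
removed lambda's Python 2 tuple-parameter unpacking, so it runs under
Python 2 or 3.)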