From 6c14d90cecf074255ce522032b107abf961e5e3d Mon Sep 17 00:00:00 2001
From: Rob Browning
Date: Wed, 15 Jan 2014 18:39:09 -0600
Subject: [PATCH] Add support for "restore --sparse"

Thanks to Patrick Rouleau for spotting a serious potential short-write
problem in an earlier revision of this work, and suggesting other
improvements.

Signed-off-by: Rob Browning
Tested-by: Rob Browning
---
 Documentation/bup-restore.md |   5 ++
 Makefile                     |   2 +
 cmd/restore-cmd.py           |  38 ++++++---
 lib/bup/_helpers.c           | 165 +++++++++++++++++++++++++++++++++--
 t/test-sparse-files.sh       | 113 ++++++++++++++++++++++++++
 5 files changed, 306 insertions(+), 17 deletions(-)
 create mode 100755 t/test-sparse-files.sh

diff --git a/Documentation/bup-restore.md b/Documentation/bup-restore.md
index 9f33133..dacfc7d 100644
--- a/Documentation/bup-restore.md
+++ b/Documentation/bup-restore.md
@@ -121,6 +121,11 @@ See the EXAMPLES section for a demonstration.
 :   read --exclude-rx patterns from *filename*, one pattern per-line
     (may be repeated).  Ignore completely empty lines.
 
+\--sparse
+:   write output data sparsely when reasonable.  Currently, reasonable
+    just means "at least whenever there are 512 or more consecutive
+    zeroes".
+
 \--map-user *old*=*new*
 :   for every path, restore the *old* (saved) user name as *new*.
     Specifying "" for *new* will clear the user.  For example
diff --git a/Makefile b/Makefile
index 97f9338..e522355 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 OS:=$(shell uname | sed 's/[-_].*//')
 CFLAGS := -Wall -O2 -Werror -Wno-unknown-pragmas $(PYINCLUDE) $(CFLAGS)
+CFLAGS := -D_FILE_OFFSET_BITS=64 $(CFLAGS)
 SOEXT:=.so
 
 ifeq ($(OS),CYGWIN)
@@ -101,6 +102,7 @@ runtests-cmdline: all
 	TMPDIR="$(test_tmp)" t/test-restore-map-owner.sh
 	TMPDIR="$(test_tmp)" t/test-restore-single-file.sh
 	TMPDIR="$(test_tmp)" t/test-rm-between-index-and-save.sh
+	TMPDIR="$(test_tmp)" t/test-sparse-files.sh
 	TMPDIR="$(test_tmp)" t/test-command-without-init-fails.sh
 	TMPDIR="$(test_tmp)" t/test-redundant-saves.sh
 	TMPDIR="$(test_tmp)" t/test-save-creates-no-unrefs.sh
diff --git a/cmd/restore-cmd.py b/cmd/restore-cmd.py
index 535d83c..d527489 100755
--- a/cmd/restore-cmd.py
+++ b/cmd/restore-cmd.py
@@ -2,6 +2,7 @@
 import copy, errno, sys, stat, re
 from bup import options, git, metadata, vfs
 from bup.helpers import *
+from bup._helpers import write_sparsely
 
 optspec = """
 bup restore [-C outdir]
@@ -10,6 +11,7 @@ C,outdir=   change to given outdir before extracting files
 numeric-ids restore numeric IDs (user, group, etc.) rather than names
 exclude-rx= skip paths matching the unanchored regex (may be repeated)
 exclude-rx-from= skip --exclude-rx patterns in file (may be repeated)
+sparse      create sparse files
 v,verbose   increase log output (can be used more than once)
 map-user=   given OLD=NEW, restore OLD user as NEW user
 map-group=  given OLD=NEW, restore OLD group as NEW group
@@ -164,6 +166,23 @@ def write_file_content(fullname, n):
         outf.close()
 
 
+def write_file_content_sparsely(fullname, n):
+    """Restore the content of node n to fullname, leaving holes instead
+    of writing zeros at least wherever there are 512 or more
+    consecutive zero bytes."""
+    outfd = os.open(fullname, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0600)
+    try:
+        trailing_zeros = 0
+        for b in chunkyreader(n.open()):
+            trailing_zeros = write_sparsely(outfd, b, 512, trailing_zeros)
+        # Zeros ending the final chunk were counted but never written;
+        # seek past them and truncate there so the file gets its full size.
+        pos = os.lseek(outfd, trailing_zeros, os.SEEK_END)
+        os.ftruncate(outfd, pos)
+    finally:
+        os.close(outfd)
+
+
 def find_dir_item_metadata_by_name(dir, name):
     """Find metadata in dir (a node) for an item with the given name,
     or for the directory itself if the name is ''."""
@@ -188,7 +207,7 @@ def find_dir_item_metadata_by_name(dir, name):
             meta_stream.close()
 
 
-def do_root(n, owner_map, restore_root_meta = True):
+def do_root(n, sparse, owner_map, restore_root_meta = True):
     # Very similar to do_node(), except that this function doesn't
     # create a path for n's destination directory (and so ignores
     # n.fullname).  It assumes the destination is '.', and restores
@@ -211,21 +230,22 @@ def do_root(n, owner_map, restore_root_meta = True):
             # Don't get metadata if this is a dir -- handled in sub do_node().
             if meta_stream and not stat.S_ISDIR(sub.mode):
                 m = metadata.Metadata.read(meta_stream)
-            do_node(n, sub, owner_map, meta = m)
+            do_node(n, sub, sparse, owner_map, meta = m)
         if root_meta and restore_root_meta:
             apply_metadata(root_meta, '.', opt.numeric_ids, owner_map)
     finally:
         if meta_stream:
             meta_stream.close()
 
 
-def do_node(top, n, owner_map, meta = None):
+def do_node(top, n, sparse, owner_map, meta = None):
     # Create n.fullname(), relative to the current directory, and
     # restore all of its metadata, when available.  The meta argument
     # will be None for dirs, or when there is no .bupm (i.e. no
     # metadata).
     global total_restored, opt
     meta_stream = None
+    write_content = sparse and write_file_content_sparsely or write_file_content
     try:
         fullname = n.fullname(stop_at=top)
         # Match behavior of index --exclude-rx with respect to paths.
@@ -251,9 +271,9 @@ def do_node(top, n, owner_map, meta = None):
             create_path(n, fullname, meta)
             if meta:
                 if stat.S_ISREG(meta.mode):
-                    write_file_content(fullname, n)
+                    write_content(fullname, n)
             elif stat.S_ISREG(n.mode):
-                write_file_content(fullname, n)
+                write_content(fullname, n)
 
         total_restored += 1
         plog('Restoring: %d\r' % total_restored)
@@ -262,7 +282,7 @@ def do_node(top, n, owner_map, meta = None):
             # Don't get metadata if this is a dir -- handled in sub do_node().
             if meta_stream and not stat.S_ISDIR(sub.mode):
                 m = metadata.Metadata.read(meta_stream)
-            do_node(top, sub, owner_map, meta = m)
+            do_node(top, sub, sparse, owner_map, meta = m)
         if meta and not created_hardlink:
             apply_metadata(meta, fullname, opt.numeric_ids, owner_map)
     finally:
@@ -312,7 +332,7 @@ for d in extra:
         if not isdir:
             add_error('%r: not a directory' % d)
         else:
-            do_root(n, owner_map, restore_root_meta = (name == '.'))
+            do_root(n, opt.sparse, owner_map, restore_root_meta = (name == '.'))
     else:
         # Source is /foo/what/ever -- extract ./ever to cwd.
         if isinstance(n, vfs.FakeSymlink):
@@ -323,10 +343,10 @@ for d in extra:
             target = n.dereference()
             mkdirp(n.name)
             os.chdir(n.name)
-            do_root(target, owner_map)
+            do_root(target, opt.sparse, owner_map)
         else: # Not a directory or fake symlink.
             meta = find_dir_item_metadata_by_name(n.parent, n.name)
-            do_node(n.parent, n, owner_map, meta = meta)
+            do_node(n.parent, n, opt.sparse, owner_map, meta = meta)
 
 if not opt.quiet:
     progress('Restoring: %d, done.\n' % total_restored)
diff --git a/lib/bup/_helpers.c b/lib/bup/_helpers.c
index 9082dca..2a5dc46 100644
--- a/lib/bup/_helpers.c
+++ b/lib/bup/_helpers.c
@@ -70,6 +70,13 @@ static uint64_t htonll(uint64_t value)
 #endif
 
 
+#define INTEGRAL_ASSIGNMENT_FITS(dest, src)                             \
+    ({                                                                  \
+        *(dest) = (src);                                                \
+        *(dest) == (src) && (*(dest) < 1) == ((src) < 1);               \
+    })
+
+
 // At the moment any code that calls INTGER_TO_PY() will have to
 // disable -Wtautological-compare for clang.  See below.
 
@@ -222,6 +229,154 @@ static void unpythonize_argv(void)
 #endif // not __WIN32__ or __CYGWIN__
 
 
+// Return the number of consecutive zero bytes at the start of buf,
+// examining at most len bytes.
+static unsigned long long count_leading_zeros(const unsigned char * const buf,
+                                              unsigned long long len)
+{
+    const unsigned char *cur = buf;
+    while(len-- && *cur == 0)
+        cur++;
+    return cur - buf;
+}
+
+
+// Write all count bytes of buf to fd, continuing after any short
+// writes.  Return 0 on success, or -1 (with errno set by write()) on
+// failure.
+static int write_all(int fd, const void *buf, const size_t count)
+{
+    // Arithmetic on void * is a GNU extension, so step through bytes.
+    const char * const bytes = buf;
+    size_t written = 0;
+    while (written < count)
+    {
+        const ssize_t rc = write(fd, bytes + written, count - written);
+        if (rc == -1)
+            return -1;
+        written += rc;
+    }
+    return 0;
+}
+
+
+// Store x + y in *dest and return 1, or return 0 (leaving *dest
+// untouched) if the sum would overflow.
+static int uadd(unsigned long long *dest,
+                const unsigned long long x,
+                const unsigned long long y)
+{
+    const unsigned long long result = x + y;
+    if (result < x || result < y)
+        return 0;
+    *dest = result;
+    return 1;
+}
+
+
+// write_sparsely(fd, buf, min_sparse_len, prev_sparse_len)
+//
+// Write buf to fd, seeking over (rather than writing) any run of at
+// least min_sparse_len zeros.  When prev_sparse_len is non-zero, also
+// seek over that many zeros deferred by the previous call, together
+// with any zeros that start buf.  Zeros that end buf are never
+// written; their count is returned so the caller can pass it back as
+// prev_sparse_len, or extend the file by that much after the final
+// call.
+static PyObject *bup_write_sparsely(PyObject *self, PyObject *args)
+{
+    int fd;
+    unsigned char *buf = NULL;
+    Py_ssize_t sbuf_len;
+    PyObject *py_min_sparse_len, *py_prev_sparse_len;
+    if (!PyArg_ParseTuple(args, "it#OO",
+                          &fd, &buf, &sbuf_len,
+                          &py_min_sparse_len, &py_prev_sparse_len))
+        return NULL;
+    unsigned long long min_sparse_len, prev_sparse_len, buf_len;
+    if (!bup_ullong_from_py(&min_sparse_len, py_min_sparse_len, "min_sparse_len"))
+        return NULL;
+    if (!bup_ullong_from_py(&prev_sparse_len, py_prev_sparse_len, "prev_sparse_len"))
+        return NULL;
+    if (sbuf_len < 0)
+        return PyErr_Format(PyExc_ValueError, "negative buffer length");
+    if (!INTEGRAL_ASSIGNMENT_FITS(&buf_len, sbuf_len))
+        return PyErr_Format(PyExc_OverflowError, "buffer length too large");
+
+    // For now, there are some cases where we just give up if the
+    // values are too large, but we could try to break up the relevant
+    // operations into chunks.
+
+    // Deal with preceding zeros.  Just make them sparse, along with
+    // any leading zeros in buf, even if the region's not >= min,
+    // since the alternative is a potentially extra small write.
+    if (prev_sparse_len)
+    {
+        const unsigned long long zeros = count_leading_zeros(buf, buf_len);
+        unsigned long long new_sparse_len = 0;
+        if (!uadd(&new_sparse_len, prev_sparse_len, zeros))
+            return PyErr_Format(PyExc_OverflowError, "sparse region too large");
+        if (zeros == buf_len)
+            return PyLong_FromUnsignedLongLong(new_sparse_len);
+
+        off_t new_off;
+        if (!INTEGRAL_ASSIGNMENT_FITS(&new_off, new_sparse_len))
+            return PyErr_Format(PyExc_OverflowError,
+                                "sparse region too large for seek");
+        const off_t off = lseek(fd, new_off, SEEK_CUR);
+        if (off == -1)
+            return PyErr_SetFromErrno(PyExc_IOError);
+        buf += zeros;
+        buf_len -= zeros;
+    }
+
+    // Invariant: the bytes in [block_start, cur) have been examined but
+    // not yet written, and fd is positioned where block_start belongs.
+    int rc;
+    unsigned long long unexamined = buf_len;
+    unsigned char *block_start = buf, *cur = buf;
+    while(unexamined)
+    {
+        const unsigned long long zeros = count_leading_zeros(cur, unexamined);
+        assert(zeros <= unexamined);
+        unexamined -= zeros;
+        if (unexamined == 0)  // Runs off the end.
+        {
+            rc = write_all(fd, block_start, cur - block_start);
+            if (rc)
+                return PyErr_SetFromErrno(PyExc_IOError);
+            return PyLong_FromUnsignedLongLong(zeros);
+        }
+        if (zeros >= min_sparse_len)
+        {
+            // Flush the data pending ahead of this zero run before
+            // seeking past it; otherwise that data would be dropped
+            // and everything after it would land at the wrong offset.
+            rc = write_all(fd, block_start, cur - block_start);
+            if (rc)
+                return PyErr_SetFromErrno(PyExc_IOError);
+            off_t new_off;
+            if (!INTEGRAL_ASSIGNMENT_FITS(&new_off, zeros))
+                return PyErr_Format(PyExc_ValueError,
+                                    "zero count overflows off_t");
+            off_t off = lseek(fd, new_off, SEEK_CUR);
+            if (off == -1)
+                return PyErr_SetFromErrno(PyExc_IOError);
+            block_start = cur + zeros;
+        }
+        cur += zeros;
+        while (unexamined && *cur != 0)
+        {
+            cur++; unexamined--;
+        }
+    }
+    rc = write_all(fd, block_start, cur - block_start);
+    if (rc)
+        return PyErr_SetFromErrno(PyExc_IOError);
+    return PyInt_FromLong(0);
+}
+
+
 static PyObject *selftest(PyObject *self, PyObject *args)
 {
     if (!PyArg_ParseTuple(args, ""))
@@ -916,14 +1071,6 @@ static PyObject *bup_set_linux_file_attr(PyObject *self, PyObject *args)
 #endif
 #endif
 
-
-#define INTEGRAL_ASSIGNMENT_FITS(dest, src)                             \
-    ({                                                                  \
-        *(dest) = (src);                                                \
-        *(dest) == (src) && (*(dest) < 1) == ((src) < 1);               \
-    })
-
-
 #define ASSIGN_PYLONG_TO_INTEGRAL(dest, pylong, overflow)               \
     ({                                                                  \
         int result = 0;                                                 \
@@ -1170,6 +1317,8 @@ static PyObject *bup_fstat(PyObject *self, PyObject *args)
 
 
 static PyMethodDef helper_methods[] = {
+    { "write_sparsely", bup_write_sparsely, METH_VARARGS,
+      "Write buf excepting zeros at the end. Return trailing zero count." },
     { "selftest", selftest, METH_VARARGS,
 	"Check that the rolling checksum rolls correctly (for unit tests)." },
     { "blobbits", blobbits, METH_VARARGS,
diff --git a/t/test-sparse-files.sh b/t/test-sparse-files.sh
new file mode 100755
index 0000000..884c6b9
--- /dev/null
+++ b/t/test-sparse-files.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+. ./wvtest-bup.sh
+
+set -o pipefail
+
+readonly mb=1048576
+readonly top="$(WVPASS pwd)" || exit $?
+readonly tmpdir="$(WVPASS wvmktempdir)" || exit $?
+
+export BUP_DIR="$tmpdir/bup"
+export GIT_DIR="$tmpdir/bup"
+
+bup() { "$top/bup" "$@"; }
+
+WVPASS cd "$tmpdir"
+
+WVPASS dd if=/dev/zero of=test-sparse seek=$((1024 * 256)) bs=1 count=1
+restore_size=$(WVPASS du -k -s test-sparse | WVPASS cut -f1) || exit $?
+if ! [ "$restore_size" -lt 256 ]; then
+    WVSTART "no sparse support detected -- skipping tests"
+    exit 0
+fi
+
+WVPASS bup init
+WVPASS mkdir src
+
+WVPASS dd if=/dev/zero of=src/foo seek=$mb bs=1 count=1
+WVPASS bup index src
+WVPASS bup save -n src src
+
+WVSTART "sparse file restore (all sparse)"
+WVPASS bup restore -C restore "src/latest/$(pwd)/"
+restore_size=$(WVPASS du -k -s restore | WVPASS cut -f1) || exit $?
+WVPASS [ "$restore_size" -gt 1000 ]
+WVPASS "$top/t/compare-trees" -c src/ restore/src/
+
+WVSTART "sparse file restore --no-sparse (all sparse)"
+WVPASS rm -r restore
+WVPASS bup restore --no-sparse -C restore "src/latest/$(pwd)/"
+restore_size=$(WVPASS du -k -s restore | WVPASS cut -f1) || exit $?
+WVPASS [ "$restore_size" -gt 1000 ]
+WVPASS "$top/t/compare-trees" -c src/ restore/src/
+
+WVSTART "sparse file restore --sparse (all sparse)"
+WVPASS rm -r restore
+WVPASS bup restore --sparse -C restore "src/latest/$(pwd)/"
+restore_size=$(WVPASS du -k -s restore | WVPASS cut -f1) || exit $?
+WVPASS [ "$restore_size" -lt 100 ]
+WVPASS "$top/t/compare-trees" -c src/ restore/src/
+
+WVSTART "sparse file restore --sparse (sparse end)"
+WVPASS echo "start" > src/foo
+WVPASS dd if=/dev/zero of=src/foo seek=$mb bs=1 count=1 conv=notrunc
+WVPASS bup index src
+WVPASS bup save -n src src
+WVPASS rm -r restore
+WVPASS bup restore --sparse -C restore "src/latest/$(pwd)/"
+restore_size=$(WVPASS du -k -s restore | WVPASS cut -f1) || exit $?
+WVPASS [ "$restore_size" -lt 100 ]
+WVPASS "$top/t/compare-trees" -c src/ restore/src/
+
+WVSTART "sparse file restore --sparse (sparse middle)"
+WVPASS echo "end" >> src/foo
+WVPASS bup index src
+WVPASS bup save -n src src
+WVPASS rm -r restore
+WVPASS bup restore --sparse -C restore "src/latest/$(pwd)/"
+restore_size=$(WVPASS du -k -s restore | WVPASS cut -f1) || exit $?
+WVPASS [ "$restore_size" -lt 100 ]
+WVPASS "$top/t/compare-trees" -c src/ restore/src/
+
+WVSTART "sparse file restore --sparse (sparse start)"
+WVPASS dd if=/dev/zero of=src/foo seek=$mb bs=1 count=1
+WVPASS echo "end" >> src/foo
+WVPASS bup index src
+WVPASS bup save -n src src
+WVPASS rm -r restore
+WVPASS bup restore --sparse -C restore "src/latest/$(pwd)/"
+restore_size=$(WVPASS du -k -s restore | WVPASS cut -f1) || exit $?
+WVPASS [ "$restore_size" -lt 100 ]
+WVPASS "$top/t/compare-trees" -c src/ restore/src/
+
+WVSTART "sparse file restore --sparse (sparse start and end)"
+WVPASS dd if=/dev/zero of=src/foo seek=$mb bs=1 count=1
+WVPASS echo "middle" >> src/foo
+WVPASS dd if=/dev/zero of=src/foo seek=$((2 * mb)) bs=1 count=1 conv=notrunc
+WVPASS bup index src
+WVPASS bup save -n src src
+WVPASS rm -r restore
+WVPASS bup restore --sparse -C restore "src/latest/$(pwd)/"
+restore_size=$(WVPASS du -k -s restore | WVPASS cut -f1) || exit $?
+WVPASS [ "$restore_size" -lt 100 ]
+WVPASS "$top/t/compare-trees" -c src/ restore/src/
+
+WVSTART "sparse file restore --sparse (sparse middle of one chunk)"
+WVPASS echo "start" > src/foo
+WVPASS dd if=/dev/zero of=src/foo seek=4096 bs=1 count=1 conv=notrunc
+WVPASS echo "end" >> src/foo
+WVPASS bup index src
+WVPASS bup save -n src src
+WVPASS rm -r restore
+WVPASS bup restore --sparse -C restore "src/latest/$(pwd)/"
+WVPASS "$top/t/compare-trees" -c src/ restore/src/
+
+WVSTART "sparse file restore --sparse (random)"
+WVPASS bup random 512k > src/foo
+WVPASS bup index src
+WVPASS bup save -n src src
+WVPASS rm -r restore
+WVPASS bup restore --sparse -C restore "src/latest/$(pwd)/"
+WVPASS "$top/t/compare-trees" -c src/ restore/src/
+
+WVPASS rm -rf "$tmpdir"
-- 
2.39.2