arthur.barton.de Git - bup.git/commitdiff
Add "bup rm", but require --unsafe invocation
author Rob Browning <rlb@defaultvalue.org>
Tue, 29 Dec 2015 20:53:54 +0000 (14:53 -0600)
committer Rob Browning <rlb@defaultvalue.org>
Sun, 14 Feb 2016 00:10:09 +0000 (18:10 -0600)
Allow the removal of branches, and the removal of saves from specified
branches.  This command only removes the references, so until "bup gc"
is available, all of the related data will still be in the repository,
though possibly difficult to reach, unless otherwise tagged.

This command is potentially dangerous, so until we've had broader
testing, require all invocations to specify --unsafe, and make it clear
in the documentation that this command isn't considered stable.

Thanks to Nix for reporting an earlier mistake in the manpage.

Signed-off-by: Rob Browning <rlb@defaultvalue.org>
Tested-by: Rob Browning <rlb@defaultvalue.org>
Documentation/bup-rm.md [new file with mode: 0644]
Makefile
cmd/rm-cmd.py [new file with mode: 0755]
lib/bup/git.py
t/test-rm.sh [new file with mode: 0755]

diff --git a/Documentation/bup-rm.md b/Documentation/bup-rm.md
new file mode 100644 (file)
index 0000000..a0382e1
--- /dev/null
@@ -0,0 +1,50 @@
+% bup-rm(1) Bup %BUP_VERSION%
+% Rob Browning <rlb@defaultvalue.org>
+% %BUP_DATE%
+
+# NAME
+
+bup-rm - remove references to archive content (CAUTION: EXPERIMENTAL)
+
+# SYNOPSIS
+
+bup rm \--unsafe [-#|--verbose] <*branch*|*save*...>
+
+# DESCRIPTION
+
+`bup rm` removes the indicated *branch*es (backup sets) and *save*s.
+By itself, this command does not delete any actual data (nor recover
+any storage space), but it may make it very difficult or impossible to
+refer to the deleted items, unless there are other references to them
+(e.g. tags).
+
+A subsequent garbage collection, either by the forthcoming `bup gc`
+command, or by a normal `git gc`, may permanently delete data that is
+no longer reachable from the remaining branches or tags, and reclaim
+the related storage space.
+
+NOTE: This is one of the few bup commands that modifies your archive
+in intentionally destructive ways.
+
+# OPTIONS
+
+-v, \--verbose
+:   increase verbosity (can be used more than once).
+
+-*#*, \--compress=*#*
+:   set the compression level to # (a value from 0-9, where
+    9 is the highest and 0 is no compression).  The default
+    is 6.  Note that `bup rm` may only write new commits.
+
+# EXAMPLES
+
+    # Delete the backup set (branch) foo and a save in bar.
+    $ bup rm --unsafe /foo /bar/2014-10-21-214720
+
+# SEE ALSO
+
+`bup-save`(1), `bup-fsck`(1), and `bup-tag`(1)
+
+# BUP
+
+Part of the `bup`(1) suite.
index 410613f793983211facb0938fe97f7f4b7496e36..30e9aeb28c73be1301405df267d430e23d3534cb 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -145,6 +145,7 @@ runtests-python: all t/tmp
            | tee -a t/tmp/test-log/$$$$.log
 
 cmdline_tests := \
+  t/test-rm.sh \
   t/test-main.sh \
   t/test-list-idx.sh \
   t/test-index.sh \
diff --git a/cmd/rm-cmd.py b/cmd/rm-cmd.py
new file mode 100755 (executable)
index 0000000..63c7445
--- /dev/null
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+
+import sys
+
+from bup import client, git, options, vfs
+from bup.git import get_commit_items
+from bup.helpers import add_error, handle_ctrl_c, log, saved_errors
+
+optspec = """
+bup rm <branch|save...>
+--
+#,compress=  set compression level to # (0-9, 9 is highest) [6]
+v,verbose    increase verbosity (can be specified multiple times)
+unsafe       use the command even though it may be DANGEROUS
+"""
+
+def append_commit(hash, parent, cp, writer):
+    ci = get_commit_items(hash, cp)
+    tree = ci.tree.decode('hex')
+    author = '%s <%s>' % (ci.author_name, ci.author_mail)
+    committer = '%s <%s>' % (ci.committer_name, ci.committer_mail)
+    c = writer.new_commit(tree, parent,
+                          author, ci.author_sec, ci.author_offset,
+                          committer, ci.committer_sec, ci.committer_offset,
+                          ci.message)
+    return c, tree
+
+
+def filter_branch(tip_commit_hex, exclude, writer):
+    # May return None if everything is excluded.
+    commits = [c for _, c in git.rev_list(tip_commit_hex)]
+    commits.reverse()
+    last_c, tree = None, None
+    # Rather than assert that we always find an exclusion here, we'll
+    # just let the StopIteration signal the error.
+    first_exclusion = next(i for i, c in enumerate(commits) if exclude(c))
+    if first_exclusion != 0:
+        last_c = commits[first_exclusion - 1]
+        tree = get_commit_items(last_c.encode('hex'),
+                                git.cp()).tree.decode('hex')
+        commits = commits[first_exclusion:]
+    for c in commits:
+        if exclude(c):
+            continue
+        last_c, tree = append_commit(c.encode('hex'), last_c, git.cp(), writer)
+    return last_c
+
+
+def rm_saves(saves, writer):
+    assert(saves)
+    branch_node = saves[0].parent
+    for save in saves: # Be certain they're all on the same branch
+        assert(save.parent == branch_node)
+    rm_commits = frozenset([x.dereference().hash for x in saves])
+    orig_tip = branch_node.hash
+    new_tip = filter_branch(orig_tip.encode('hex'),
+                            lambda x: x in rm_commits,
+                            writer)
+    assert(orig_tip)
+    assert(new_tip != orig_tip)
+    return orig_tip, new_tip
+
+
+def dead_items(vfs_top, paths):
+    """Return an optimized set of removals, reporting errors via
+    add_error, and if there are any errors, return None, None."""
+    dead_branches = {}
+    dead_saves = {}
+    # Scan for bad requests, and opportunities to optimize
+    for path in paths:
+        try:
+            n = vfs_top.lresolve(path)
+        except vfs.NodeError as e:
+            add_error('unable to resolve %s: %s' % (path, e))
+        else:
+            if isinstance(n, vfs.BranchList): # rm /foo
+                branchname = n.name
+                dead_branches[branchname] = n
+                dead_saves.pop(branchname, None) # rm /foo obviates rm /foo/bar
+            elif isinstance(n, vfs.FakeSymlink) and isinstance(n.parent,
+                                                               vfs.BranchList):
+                if n.name == 'latest':
+                    add_error("error: cannot delete 'latest' symlink")
+                else:
+                    branchname = n.parent.name
+                    if branchname not in dead_branches:
+                        dead_saves.setdefault(branchname, []).append(n)
+            else:
+                add_error("don't know how to remove %r yet" % n.fullname())
+    if saved_errors:
+        return None, None
+    return dead_branches, dead_saves
+
+
+handle_ctrl_c()
+
+o = options.Options(optspec)
+opt, flags, extra = o.parse(sys.argv[1:])
+
+if not opt.unsafe:
+    o.fatal('refusing to run dangerous, experimental command without --unsafe')
+
+if len(extra) < 1:
+    o.fatal('no paths specified')
+
+paths = extra
+
+git.check_repo_or_die()
+top = vfs.RefList(None)
+
+dead_branches, dead_saves = dead_items(top, paths)
+if saved_errors:
+    log('not proceeding with any removals\n')
+    sys.exit(1)
+
+updated_refs = {}  # ref_name -> (original_ref, tip_commit(bin))
+writer = None
+
+if dead_saves:
+    writer = git.PackWriter(compression_level=opt.compress)
+
+for branch, saves in dead_saves.iteritems():
+    assert(saves)
+    updated_refs['refs/heads/' + branch] = rm_saves(saves, writer)
+
+for branch, node in dead_branches.iteritems():
+    ref = 'refs/heads/' + branch
+    assert(not ref in updated_refs)
+    updated_refs[ref] = (node.hash, None)
+
+if writer:
+    # Must close before we can update the ref(s) below.
+    writer.close()
+
+# Only update the refs here, at the very end, so that if something
+# goes wrong above, the old refs will be undisturbed.  Make an attempt
+# to update each ref.
+for ref_name, info in updated_refs.iteritems():
+    orig_ref, new_ref = info
+    try:
+        if not new_ref:
+            git.delete_ref(ref_name, orig_ref.encode('hex'))
+        else:
+            git.update_ref(ref_name, new_ref, orig_ref)
+            if opt.verbose:
+                new_hex = new_ref.encode('hex')
+                if orig_ref:
+                    orig_hex = orig_ref.encode('hex')
+                    log('updated %r (%s -> %s)\n'
+                        % (ref_name, orig_hex, new_hex))
+                else:
+                    log('updated %r (%s)\n' % (ref_name, new_hex))
+    except (git.GitError, client.ClientError) as ex:
+        if new_ref:
+            add_error('while trying to update %r (%s -> %s): %s'
+                      % (ref_name, orig_ref, new_ref, ex))
+        else:
+            add_error('while trying to delete %r (%s): %s'
+                      % (ref_name, orig_ref, ex))
+
+if saved_errors:
+    log('warning: %d errors encountered\n' % len(saved_errors))
+    sys.exit(1)
index 73071fa49671aefe8cab6f3c9856980d65a7df87..315d8f3fa295aff53193a9315be5bb1aeecbeb94 100644 (file)
@@ -10,7 +10,8 @@ from itertools import islice
 from bup import _helpers, path, midx, bloom, xstat
 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
                          fdatasync,
-                         hostname, log, merge_iter, mmap_read, mmap_readwrite,
+                         hostname, localtime, log, merge_iter,
+                         mmap_read, mmap_readwrite,
                          progress, qprogress, unlink, username, userfullname,
                          utc_offset_str)
 
@@ -951,10 +952,11 @@ def update_ref(refname, newval, oldval, repo_dir=None):
     _git_wait('git update-ref', p)
 
 
-def delete_ref(refname):
-    """Delete a repository reference."""
+def delete_ref(refname, oldvalue=None):
+    """Delete a repository reference (see git update-ref(1)).
+
+    refname must start with 'refs/'.  If oldvalue is given (and
+    true), it is passed to "git update-ref -d" as an additional
+    argument after refname; otherwise only refname is passed.
+
+    """
     assert(refname.startswith('refs/'))
+    oldvalue = [] if not oldvalue else [oldvalue]
+    p = subprocess.Popen(['git', 'update-ref', '-d', refname] + oldvalue,
                          preexec_fn = _gitenv())
     _git_wait('git update-ref', p)
 
diff --git a/t/test-rm.sh b/t/test-rm.sh
new file mode 100755 (executable)
index 0000000..80824fb
--- /dev/null
@@ -0,0 +1,234 @@
+#!/usr/bin/env bash
+. ./wvtest-bup.sh || exit $?
+. ./t/lib.sh || exit $?
+
+set -o pipefail
+
+# Perhaps this should check the rsync version instead, and not sure if
+# it's just darwin, or all of these.
+#
+# rsx is the platform-specific tail appended to the itemized-change
+# flags in the compare-trees output expected by the tests below
+# (presumably the width of that field differs between rsync
+# versions/platforms -- TODO confirm).
+case "$(uname)" in
+    CYGWIN*|NetBSD)
+        rsx=''
+        ;;
+    Darwin)
+        rsx=.
+        ;;
+    *)
+        rsx=...
+        ;;
+esac
+
+# Likewise, $deleting and $plusx are spliced into the expected
+# "deleting" and new-file lines below.  Note that plusx is left unset
+# on Darwin, so it expands to nothing there.
+if test "$(uname)" = Darwin; then
+    deleting=deleting
+else
+    deleting="deleting  "
+    plusx=++
+fi
+
+top="$(WVPASS pwd)" || exit $?
+tmpdir="$(WVPASS wvmktempdir)" || exit $?
+
+export BUP_DIR="$tmpdir/bup"
+export GIT_DIR="$tmpdir/bup"
+
+
+bup() { "$top/bup" "$@"; }
+compare-trees() { "$top/t/compare-trees" "$@"; }
+
+
+WVPASS bup init
+WVPASS cd "$tmpdir"
+
+
+WVSTART "rm /foo (lone branch)"
+WVPASS mkdir src src/foo
+WVPASS echo twisty-maze > src/1
+WVPASS bup index src
+WVPASS bup save -n src src
+WVPASS "$top"/t/sync-tree bup/ bup-baseline/
+# FIXME: test -n
+WVPASS bup tick # Make sure we always get the timestamp changes below
+WVPASS bup rm --unsafe /src
+WVPASSEQ "$(compare-trees bup/ bup-baseline/)" \
+"*$deleting logs/refs/heads/src
+*$deleting refs/heads/src
+.d..t...${rsx} logs/refs/heads/
+.d..t...${rsx} refs/heads/"
+
+
+WVSTART "rm /foo (one of many)"
+WVPASS rm -rf bup
+WVPASS mv bup-baseline bup
+WVPASS echo twisty-maze > src/2
+WVPASS bup index src
+WVPASS bup save -n src-2 src
+WVPASS echo twisty-maze > src/3
+WVPASS bup index src
+WVPASS bup save -n src-3 src
+WVPASS "$top"/t/sync-tree bup/ bup-baseline/
+WVPASS bup tick # Make sure we always get the timestamp changes below
+WVPASS bup rm --unsafe /src
+WVPASSEQ "$(compare-trees bup/ bup-baseline/)" \
+"*$deleting logs/refs/heads/src
+*$deleting refs/heads/src
+.d..t...${rsx} logs/refs/heads/
+.d..t...${rsx} refs/heads/"
+
+
+WVSTART "rm /foo /bar (multiple of many)"
+WVPASS rm -rf bup
+WVPASS mv bup-baseline bup
+WVPASS echo twisty-maze > src/4
+WVPASS bup index src
+WVPASS bup save -n src-4 src
+WVPASS echo twisty-maze > src/5
+WVPASS bup index src
+WVPASS bup save -n src-5 src
+WVPASS "$top"/t/sync-tree bup/ bup-baseline/
+WVPASS bup tick # Make sure we always get the timestamp changes below
+WVPASS bup rm --unsafe /src-2 /src-4
+WVPASSEQ "$(compare-trees bup/ bup-baseline/)" \
+"*$deleting logs/refs/heads/src-4
+*$deleting logs/refs/heads/src-2
+*$deleting refs/heads/src-4
+*$deleting refs/heads/src-2
+.d..t...${rsx} logs/refs/heads/
+.d..t...${rsx} refs/heads/"
+
+
+WVSTART "rm /foo /bar (all)"
+WVPASS rm -rf bup
+WVPASS mv bup-baseline bup
+WVPASS "$top"/t/sync-tree bup/ bup-baseline/
+WVPASS bup tick # Make sure we always get the timestamp changes below
+WVPASS bup rm --unsafe /src /src-2 /src-3 /src-4 /src-5
+WVPASSEQ "$(compare-trees bup/ bup-baseline/)" \
+"*$deleting logs/refs/heads/src-5
+*$deleting logs/refs/heads/src-4
+*$deleting logs/refs/heads/src-3
+*$deleting logs/refs/heads/src-2
+*$deleting logs/refs/heads/src
+*$deleting refs/heads/src-5
+*$deleting refs/heads/src-4
+*$deleting refs/heads/src-3
+*$deleting refs/heads/src-2
+*$deleting refs/heads/src
+.d..t...${rsx} logs/refs/heads/
+.d..t...${rsx} refs/heads/"
+
+
+WVSTART "rm /foo/bar (lone save - equivalent to rm /foo)"
+WVPASS rm -rf bup bup-baseline src
+WVPASS bup init
+WVPASS mkdir src
+WVPASS echo twisty-maze > src/1
+WVPASS bup index src
+WVPASS bup save -n src src
+save1="$(WVPASS bup ls src | head -n 1)" || exit $?
+WVPASS "$top"/t/sync-tree bup/ bup-baseline/
+WVPASS bup tick # Make sure we always get the timestamp changes below
+WVFAIL bup rm --unsafe /src/latest
+WVPASS bup rm --unsafe /src/"$save1"
+WVPASSEQ "$(compare-trees bup/ bup-baseline/)" \
+"*$deleting logs/refs/heads/src
+*$deleting refs/heads/src
+.d..t...${rsx} logs/refs/heads/
+.d..t...${rsx} refs/heads/"
+
+
+verify-changes-caused-by-rewriting-save()
+(
+    local before="$1"
+    local after="$2"
+    local tmpdir="$(WVPASS wvmktempdir)" || exit $?
+    (WVPASS cd "$before" && WVPASS find . | WVPASS sort) > "$tmpdir/before"
+    (WVPASS cd "$after" && WVPASS find . | WVPASS sort) > "$tmpdir/after"
+    new_paths="$(WVPASS comm -13 "$tmpdir/before" "$tmpdir/after")" || exit $?
+    new_idx="$(echo "$new_paths" | WVPASS grep -E '^\./objects/pack/pack-.*\.idx$' | cut -b 3-)"
+    new_pack="$(echo "$new_paths" | WVPASS grep -E '^\./objects/pack/pack-.*\.pack$' | cut -b 3-)"
+    WVPASSEQ "$(compare-trees "$after/" "$before/")" \
+">fcst...${rsx} logs/refs/heads/src
+.d..t...${rsx} objects/
+.d..t...${rsx} objects/pack/
+>fcst...${rsx} objects/pack/bup.bloom
+>f+++++++${plusx} $new_idx
+>f+++++++${plusx} $new_pack
+.d..t...${rsx} refs/heads/
+>fc.t...${rsx} refs/heads/src"
+    WVPASS rm -rf "$tmpdir"
+)
+
+commit-hash-n()
+{
+    local n="$1" repo="$2" branch="$3"
+    GIT_DIR="$repo" WVPASS git rev-list --reverse "$branch" \
+        | WVPASS awk "FNR == $n"
+}
+
+rm-safe-cinfo()
+{
+    local n="$1" repo="$2" branch="$3" hash
+    hash="$(commit-hash-n "$n" "$repo" "$branch")" || exit $?
+    local fmt='Tree: %T%n'
+    fmt="${fmt}Author: %an <%ae> %ai%n"
+    fmt="${fmt}Committer: %cn <%ce> %ci%n"
+    fmt="${fmt}%n%s%n%b"
+    GIT_DIR="$repo" WVPASS git log -n1 --pretty=format:"$fmt" "$hash"
+}
+
+
+WVSTART 'rm /foo/BAR (setup)'
+WVPASS rm -rf bup bup-baseline src
+WVPASS bup init
+WVPASS mkdir src
+WVPASS echo twisty-maze > src/1
+WVPASS bup index src
+WVPASS bup save -n src src
+WVPASS echo twisty-maze > src/2
+WVPASS bup index src
+WVPASS bup tick
+WVPASS bup save -n src src
+WVPASS echo twisty-maze > src/3
+WVPASS bup index src
+WVPASS bup tick
+WVPASS bup save -n src src
+WVPASS mv bup bup-baseline
+WVPASS bup tick # Make sure we always get the timestamp changes below
+
+
+WVSTART "rm /foo/BAR (first of many)"
+WVPASS "$top"/t/sync-tree bup-baseline/ bup/
+victim="$(WVPASS bup ls src | head -n 1)" || exit $?
+WVPASS bup rm --unsafe /src/"$victim"
+verify-changes-caused-by-rewriting-save bup-baseline bup
+WVPASSEQ 2 $(git rev-list src | wc -l)
+WVPASSEQ "$(rm-safe-cinfo 1 bup src)" "$(rm-safe-cinfo 2 bup-baseline src)"
+WVPASSEQ "$(rm-safe-cinfo 2 bup src)" "$(rm-safe-cinfo 3 bup-baseline src)"
+
+
+WVSTART "rm /foo/BAR (one of many)"
+WVPASS "$top"/t/sync-tree bup-baseline/ bup/
+victim="$(WVPASS bup ls src | tail -n +2 | head -n 1)" || exit $?
+WVPASS bup rm --unsafe /src/"$victim"
+verify-changes-caused-by-rewriting-save bup-baseline bup
+WVPASSEQ 2 $(git rev-list src | wc -l)
+WVPASSEQ "$(commit-hash-n 1 bup src)" "$(commit-hash-n 1 bup-baseline src)"
+WVPASSEQ "$(rm-safe-cinfo 2 bup src)" "$(rm-safe-cinfo 3 bup-baseline src)"
+
+
+WVSTART "rm /foo/BAR (last of many)"
+WVPASS "$top"/t/sync-tree bup-baseline/ bup/
+victim="$(WVPASS bup ls src | tail -n 2 | head -n 1)" || exit $?
+WVPASS bup rm --unsafe -vv /src/"$victim"
+WVPASSEQ "$(compare-trees bup/ bup-baseline/)" \
+">fcst...${rsx} logs/refs/heads/src
+.d..t...${rsx} refs/heads/
+>fc.t...${rsx} refs/heads/src"
+WVPASSEQ 2 $(git rev-list src | wc -l)
+WVPASSEQ "$(commit-hash-n 1 bup src)" "$(commit-hash-n 1 bup-baseline src)"
+WVPASSEQ "$(commit-hash-n 2 bup src)" "$(commit-hash-n 2 bup-baseline src)"
+
+
+# FIXME: test that committer changes when rewriting, when appropriate.
+
+WVPASS rm -rf "$tmpdir"