From b7e10498d1834aa330b44faf682b6c5bdc8d950a Mon Sep 17 00:00:00 2001 From: Rob Browning Date: Sun, 4 Jan 2015 18:47:42 -0600 Subject: [PATCH 1/1] git.py: add walk_object Add walk_object(), which can recursively traverse a git reference (tree, commit, blob, etc.). This will be used by "bup gc". Thanks to Aidan Hobson Sayers for suggesting improvements to an earlier version of the code. Signed-off-by: Rob Browning --- lib/bup/git.py | 91 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/lib/bup/git.py b/lib/bup/git.py index 315d8f3..ba616d5 100644 --- a/lib/bup/git.py +++ b/lib/bup/git.py @@ -7,7 +7,7 @@ import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob from collections import namedtuple from itertools import islice -from bup import _helpers, path, midx, bloom, xstat +from bup import _helpers, hashsplit, path, midx, bloom, xstat from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2, fdatasync, hostname, localtime, log, merge_iter, @@ -1230,3 +1230,92 @@ def tags(repo_dir = None): tags[c] = [] tags[c].append(name) # more than one tag can point at 'c' return tags + + +WalkItem = namedtuple('WalkItem', ['id', 'type', 'mode', + 'path', 'chunk_path', 'data']) +# The path is the mangled path, and if an item represents a fragment +# of a chunked file, the chunk_path will be the chunked subtree path +# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a +# chunked file will have a chunk_path of ['']. So some chunk subtree +# of the file '/foo/bar/baz' might look like this: +# +# item.path = ['foo', 'bar', 'baz.bup'] +# item.chunk_path = ['', '2d3115e', '016b097'] +# item.type = 'tree' +# ... 
def _walk_object(cat_pipe, id,
                 parent_path, chunk_path,
                 mode=None,
                 stop_at=None,
                 include_data=None):
    """Yield a WalkItem for id and for every object reachable from it.

    Objects are produced in pre-order: an object is yielded before
    anything it refers to.  For a commit, its tree is walked first and
    then each parent, in the order parse_commit() reports them.  For a
    tree, the entries are walked in the order tree_decode() produces
    them.

    cat_pipe     -- object whose get(id) returns an iterator that yields
                    the object type ('blob', 'commit' or 'tree') first,
                    followed by the object's data in pieces.
    id           -- id of the object to start from, in whatever form
                    cat_pipe.get() accepts (tree entries are passed on
                    hex encoded -- presumably the same is expected here).
    parent_path  -- list of path components leading to id.
    chunk_path   -- list of chunk subtree components leading to id; a
                    non-empty list means id is inside a chunked file.
    mode         -- mode to report for id itself.
    stop_at      -- optional predicate; when stop_at(some_id) is true,
                    that object is neither yielded nor descended into.
    include_data -- when true each WalkItem carries the object's data,
                    otherwise WalkItem.data is always None.

    Raises Exception if cat_pipe reports any other object type.
    """
    # Keep the pending work on an explicit stack rather than recursing.
    # A recursive generator adds one interpreter frame per commit
    # parent / tree level, so a long linear history would exceed the
    # interpreter's recursion limit.
    pending = [(id, parent_path, chunk_path, mode)]
    while pending:
        oid, path, chunks, obj_mode = pending.pop()

        if stop_at and stop_at(oid):
            continue

        item_it = cat_pipe.get(oid)  # FIXME: use include_data
        typ = next(item_it)

        if typ not in ('blob', 'commit', 'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None

        if typ == 'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for _ignored in item_it:
                pass
            data = None
        else:
            # Commits and trees must always be read: their contents say
            # what to visit next.
            data = ''.join(item_it)

        yield WalkItem(id=oid, type=typ,
                       chunk_path=chunks, path=path,
                       mode=obj_mode,
                       data=(data if include_data else None))

        if typ == 'commit':
            commit_items = parse_commit(data)
            # The stack is LIFO, so push in the reverse of the order we
            # want to visit: parents back to front, then the tree last
            # so that it is popped (and fully walked) first.
            for pid in reversed(list(commit_items.parents)):
                # A parent commit is reported with the same mode as
                # this commit.
                pending.append((pid, path, chunks, obj_mode))
            pending.append((commit_items.tree, path, chunks,
                            hashsplit.GIT_MODE_TREE))
        elif typ == 'tree':
            children = []
            for ent_mode, name, ent_id in tree_decode(data):
                _demangled, bup_type = demangle_name(name, ent_mode)
                if chunks:
                    # Already inside a chunked file: the file's path is
                    # fixed, only the chunk subtree path grows.
                    sub_path = path
                    sub_chunks = chunks + [name]
                else:
                    sub_path = path + [name]
                    if bup_type == BUP_CHUNKED:
                        # Top level of a chunked file.
                        sub_chunks = ['']
                    else:
                        sub_chunks = chunks
                children.append((ent_id.encode('hex'),
                                 sub_path, sub_chunks, ent_mode))
            # Reverse so the first entry ends up on top of the stack.
            pending.extend(reversed(children))


def walk_object(cat_pipe, id,
                stop_at=None,
                include_data=None):
    """Yield everything reachable from id via cat_pipe as a WalkItem,
    stopping whenever stop_at(id) returns true.

    The walk starts with an empty path and an empty chunk_path.  An
    object for which stop_at() is true is not yielded and nothing
    below it is visited.  Unless include_data is true, the data field
    of every WalkItem is None.
    """
    return _walk_object(cat_pipe, id, [], [],
                        stop_at=stop_at,
                        include_data=include_data)