]> arthur.barton.de Git - bup.git/commitdiff
git.py: add walk_object
authorRob Browning <rlb@defaultvalue.org>
Mon, 5 Jan 2015 00:47:42 +0000 (18:47 -0600)
committerRob Browning <rlb@defaultvalue.org>
Sun, 14 Feb 2016 00:10:09 +0000 (18:10 -0600)
Add walk_object(), which can recursively traverse a git reference (tree,
commit, blob, etc.).  This will be used by "bup gc".

Thanks to Aidan Hobson Sayers for suggesting improvements to an earlier
version of the code.

Signed-off-by: Rob Browning <rlb@defaultvalue.org>
lib/bup/git.py

index 315d8f3fa295aff53193a9315be5bb1aeecbeb94..ba616d51f88f9892e9ee34adf9b0187c051ca53d 100644 (file)
@@ -7,7 +7,7 @@ import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
 from collections import namedtuple
 from itertools import islice
 
-from bup import _helpers, path, midx, bloom, xstat
+from bup import _helpers, hashsplit, path, midx, bloom, xstat
 from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
                          fdatasync,
                          hostname, localtime, log, merge_iter,
@@ -1230,3 +1230,92 @@ def tags(repo_dir = None):
             tags[c] = []
         tags[c].append(name)  # more than one tag can point at 'c'
     return tags
+
+
+WalkItem = namedtuple('WalkItem', ['id', 'type', 'mode',
+                                   'path', 'chunk_path', 'data'])
+# The path is the mangled path, and if an item represents a fragment
+# of a chunked file, the chunk_path will be the chunked subtree path
+# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
+# chunked file will have a chunk_path of [''].  So some chunk subtree
+# of the file '/foo/bar/baz' might look like this:
+#
+#   item.path = ['foo', 'bar', 'baz.bup']
+#   item.chunk_path = ['', '2d3115e', '016b097']
+#   item.type = 'tree'
+#   ...
+
+
+def _walk_object(cat_pipe, id,
+                 parent_path, chunk_path,
+                 mode=None,
+                 stop_at=None,
+                 include_data=None):
+
+    if stop_at and stop_at(id):
+        return
+
+    item_it = cat_pipe.get(id)  # FIXME: use include_data
+    type = item_it.next()
+
+    if type not in ('blob', 'commit', 'tree'):
+        raise Exception('unexpected repository object type %r' % type)
+
+    # FIXME: set the mode based on the type when the mode is None
+
+    if type == 'blob' and not include_data:
+        # Dump data until we can ask cat_pipe not to fetch it
+        for ignored in item_it:
+            pass
+        data = None
+    else:
+        data = ''.join(item_it)
+
+    yield  WalkItem(id=id, type=type,
+                    chunk_path=chunk_path, path=parent_path,
+                    mode=mode,
+                    data=(data if include_data else None))
+
+    if type == 'commit':
+        commit_items = parse_commit(data)
+        tree_id = commit_items.tree
+        for x in _walk_object(cat_pipe, tree_id, parent_path, chunk_path,
+                              mode=hashsplit.GIT_MODE_TREE,
+                              stop_at=stop_at,
+                              include_data=include_data):
+            yield x
+        parents = commit_items.parents
+        for pid in parents:
+            for x in _walk_object(cat_pipe, pid, parent_path, chunk_path,
+                                  mode=mode, # Same mode as this child
+                                  stop_at=stop_at,
+                                  include_data=include_data):
+                yield x
+    elif type == 'tree':
+        for mode, name, ent_id in tree_decode(data):
+            demangled, bup_type = demangle_name(name, mode)
+            if chunk_path:
+                sub_path = parent_path
+                sub_chunk_path = chunk_path + [name]
+            else:
+                sub_path = parent_path + [name]
+                if bup_type == BUP_CHUNKED:
+                    sub_chunk_path = ['']
+                else:
+                    sub_chunk_path = chunk_path
+            for x in _walk_object(cat_pipe, ent_id.encode('hex'),
+                                  sub_path, sub_chunk_path,
+                                  mode=mode,
+                                  stop_at=stop_at,
+                                  include_data=include_data):
+                yield x
+
+
+def walk_object(cat_pipe, id,
+                stop_at=None,
+                include_data=None):
+    """Yield everything reachable from id via cat_pipe as a WalkItem,
+    stopping whenever stop_at(id) returns true."""
+    return _walk_object(cat_pipe, id, [], [],
+                        stop_at=stop_at,
+                        include_data=include_data)