]> arthur.barton.de Git - bup.git/commitdiff
vfs2: add trivial random eviction commit cache
authorRob Browning <rlb@defaultvalue.org>
Mon, 13 Nov 2017 04:44:03 +0000 (22:44 -0600)
committerRob Browning <rlb@defaultvalue.org>
Sun, 24 Dec 2017 19:24:42 +0000 (13:24 -0600)
Porting rm and gc to vfs2 has put us at the point where we need at
least some caching so that we're not invoking git an endless number of
times when trying to "bup rm" 1000 saves (i.e. via test-prune-older if
nothing else).  Start with a very primitive, fixed-size commit cache
with random eviction.

It's not clear that the cache should always be enabled.  While it may
help commands like rm, prune-older, fuse, web, ftp, etc., it may often
just be extra baggage for for commands like restore, ls, cat-file, ...

Signed-off-by: Rob Browning <rlb@defaultvalue.org>
Tested-by: Rob Browning <rlb@defaultvalue.org>
lib/bup/t/tvfs.py
lib/bup/vfs2.py

index 7dbc6e8f555aa071a51ba29cae142463c8de2cb5..1a18b6fd650eb6fea9e40e0a9d6fcf35e49040a2 100644 (file)
@@ -22,6 +22,12 @@ bup_tmp = os.path.realpath('../../../t/tmp')
 bup_path = top_dir + '/bup'
 start_dir = os.getcwd()
 
+## The clear_cache() calls below are to make sure that the test starts
+## from a known state since at the moment the cache entry for a given
+## item (like a commit) can change.  For example, its meta value might
+## be promoted from a mode to a Metadata instance once the tree it
+## refers to is traversed.
+
 def ex(cmd, **kwargs):
     print(shstr(cmd), file=stderr)
     return exc(cmd, **kwargs)
@@ -234,15 +240,18 @@ def test_resolve():
                                  tip_oidx))[0].strip()
             tip_tree_oid = tip_tree_oidx.decode('hex')
             tip_tree = tree_dict(repo, tip_tree_oid)
-            test_revlist = vfs.RevList(meta=S_IFDIR | 0o755, oid=tip_oid)
             test_revlist_w_meta = vfs.RevList(meta=tip_tree['.'].meta,
                                               oid=tip_oid)
             expected_latest_item = vfs.Commit(meta=S_IFDIR | 0o755,
                                               oid=tip_tree_oid,
                                               coid=tip_oid)
+            expected_latest_item_w_meta = vfs.Commit(meta=tip_tree['.'].meta,
+                                                     oid=tip_tree_oid,
+                                                     coid=tip_oid)
             expected_test_tag_item = expected_latest_item
 
             wvstart('resolve: /')
+            vfs.clear_cache()
             res = resolve(repo, '/')
             wvpasseq(1, len(res))
             wvpasseq((('', vfs._root),), res)
@@ -250,7 +259,7 @@ def test_resolve():
             root_content = frozenset(vfs.contents(repo, root_item))
             wvpasseq(frozenset([('.', root_item),
                                 ('.tag', vfs._tags),
-                                ('test', test_revlist)]),
+                                ('test', test_revlist_w_meta)]),
                      root_content)
             for path in ('//', '/.', '/./', '/..', '/../',
                          '/test/latest/dir/../../..',
@@ -264,10 +273,12 @@ def test_resolve():
                          '/test//.//.//latest/dir/../../..'
                          '/test//./latest/dir/../../..'):
                 wvstart('resolve: ' + path)
+                vfs.clear_cache()
                 res = resolve(repo, path)
                 wvpasseq((('', vfs._root),), res)
 
             wvstart('resolve: /.tag')
+            vfs.clear_cache()
             res = resolve(repo, '/.tag')
             wvpasseq(2, len(res))
             wvpasseq((('', vfs._root), ('.tag', vfs._tags)),
@@ -279,24 +290,27 @@ def test_resolve():
                      tag_content)
 
             wvstart('resolve: /test')
+            vfs.clear_cache()
             res = resolve(repo, '/test')
             wvpasseq(2, len(res))
-            wvpasseq((('', vfs._root), ('test', test_revlist)), res)
+            wvpasseq((('', vfs._root), ('test', test_revlist_w_meta)), res)
             ignore, test_item = res[1]
             test_content = frozenset(vfs.contents(repo, test_item))
+            # latest has metadata here due to caching
             wvpasseq(frozenset([('.', test_revlist_w_meta),
-                                (save_time_str, expected_latest_item),
-                                ('latest', expected_latest_item)]),
+                                (save_time_str, expected_latest_item_w_meta),
+                                ('latest', expected_latest_item_w_meta)]),
                      test_content)
 
             wvstart('resolve: /test/latest')
+            vfs.clear_cache()
             res = resolve(repo, '/test/latest')
             wvpasseq(3, len(res))
             expected_latest_item_w_meta = vfs.Commit(meta=tip_tree['.'].meta,
                                                      oid=tip_tree_oid,
                                                      coid=tip_oid)
             expected = (('', vfs._root),
-                        ('test', test_revlist),
+                        ('test', test_revlist_w_meta),
                         ('latest', expected_latest_item_w_meta))
             wvpasseq(expected, res)
             ignore, latest_item = res[2]
@@ -312,59 +326,65 @@ def test_resolve():
             wvpasseq(expected, latest_content)
 
             wvstart('resolve: /test/latest/file')
+            vfs.clear_cache()
             res = resolve(repo, '/test/latest/file')
             wvpasseq(4, len(res))
             expected_file_item_w_meta = vfs.Item(meta=tip_tree['file'].meta,
                                                  oid=tip_tree['file'].oid)
             expected = (('', vfs._root),
-                        ('test', test_revlist),
+                        ('test', test_revlist_w_meta),
                         ('latest', expected_latest_item_w_meta),
                         ('file', expected_file_item_w_meta))
             wvpasseq(expected, res)
 
             wvstart('resolve: /test/latest/bad-symlink')
+            vfs.clear_cache()
             res = resolve(repo, '/test/latest/bad-symlink')
             wvpasseq(4, len(res))
             expected = (('', vfs._root),
-                        ('test', test_revlist),
+                        ('test', test_revlist_w_meta),
                         ('latest', expected_latest_item_w_meta),
                         ('not-there', None))
             wvpasseq(expected, res)
 
             wvstart('lresolve: /test/latest/bad-symlink')
+            vfs.clear_cache()
             res = lresolve(repo, '/test/latest/bad-symlink')
             wvpasseq(4, len(res))
             bad_symlink_value = tip_tree['bad-symlink']
             expected_bad_symlink_item_w_meta = vfs.Item(meta=bad_symlink_value.meta,
                                                         oid=bad_symlink_value.oid)
             expected = (('', vfs._root),
-                        ('test', test_revlist),
+                        ('test', test_revlist_w_meta),
                         ('latest', expected_latest_item_w_meta),
                         ('bad-symlink', expected_bad_symlink_item_w_meta))
             wvpasseq(expected, res)
 
             wvstart('resolve: /test/latest/file-symlink')
+            vfs.clear_cache()
             res = resolve(repo, '/test/latest/file-symlink')
             wvpasseq(4, len(res))
             expected = (('', vfs._root),
-                        ('test', test_revlist),
+                        ('test', test_revlist_w_meta),
                         ('latest', expected_latest_item_w_meta),
                         ('file', expected_file_item_w_meta))
             wvpasseq(expected, res)
 
             wvstart('lresolve: /test/latest/file-symlink')
+            vfs.clear_cache()
             res = lresolve(repo, '/test/latest/file-symlink')
             wvpasseq(4, len(res))
             file_symlink_value = tip_tree['file-symlink']
             expected_file_symlink_item_w_meta = vfs.Item(meta=file_symlink_value.meta,
                                                          oid=file_symlink_value.oid)
             expected = (('', vfs._root),
-                        ('test', test_revlist),
+                        ('test', test_revlist_w_meta),
                         ('latest', expected_latest_item_w_meta),
                         ('file-symlink', expected_file_symlink_item_w_meta))
             wvpasseq(expected, res)
 
             wvstart('resolve: /test/latest/missing')
+            vfs.clear_cache()
             res = resolve(repo, '/test/latest/missing')
             wvpasseq(4, len(res))
             name, item = res[-1]
@@ -379,6 +399,7 @@ def test_resolve():
                          '/test/latest/file/../..',
                          '/test/latest/file/foo'):
                 wvstart('resolve: ' + path)
+                vfs.clear_cache()
                 try:
                     resolve(repo, path)
                 except vfs.IOError as res_ex:
@@ -393,6 +414,7 @@ def test_resolve():
                          '/test/latest/file-symlink/../.',
                          '/test/latest/file-symlink/../..'):
                 wvstart('lresolve: ' + path)
+                vfs.clear_cache()
                 try:
                     lresolve(repo, path)
                 except vfs.IOError as res_ex:
@@ -401,6 +423,7 @@ def test_resolve():
                              [name for name, item in res_ex.terminus])
 
             wvstart('resolve: non-directory parent')
+            vfs.clear_cache()
             file_res = resolve(repo, '/test/latest/file')
             try:
                 resolve(repo, 'foo', parent=file_res)
@@ -409,13 +432,14 @@ def test_resolve():
                 wvpasseq(None, res_ex.terminus)
 
             wvstart('lresolve: /test/latest/dir-symlink')
+            vfs.clear_cache()
             res = lresolve(repo, '/test/latest/dir-symlink')
             wvpasseq(4, len(res))
             dir_symlink_value = tip_tree['dir-symlink']
             expected_dir_symlink_item_w_meta = vfs.Item(meta=dir_symlink_value.meta,
                                                          oid=dir_symlink_value.oid)
             expected = (('', vfs._root),
-                        ('test', test_revlist),
+                        ('test', test_revlist_w_meta),
                         ('latest', expected_latest_item_w_meta),
                         ('dir-symlink', expected_dir_symlink_item_w_meta))
             wvpasseq(expected, res)
@@ -424,7 +448,7 @@ def test_resolve():
             expected_dir_item = vfs.Item(oid=dir_value.oid,
                                          meta=tree_dict(repo, dir_value.oid)['.'].meta)
             expected = (('', vfs._root),
-                        ('test', test_revlist),
+                        ('test', test_revlist_w_meta),
                         ('latest', expected_latest_item_w_meta),
                         ('dir', expected_dir_item))
             for resname, resolver in (('resolve', resolve),
@@ -432,10 +456,12 @@ def test_resolve():
                 for path in ('/test/latest/dir-symlink/',
                              '/test/latest/dir-symlink/.'):
                     wvstart(resname + ': ' + path)
+                    vfs.clear_cache()
                     res = resolver(repo, path)
                     wvpasseq(4, len(res))
                     wvpasseq(expected, res)
             wvstart('resolve: /test/latest/dir-symlink')
+            vfs.clear_cache()
             res = resolve(repo, path)
             wvpasseq(4, len(res))
             wvpasseq(expected, res)
index 421b6cb32db7733b9983e79228b4e7646d4902d1..a147ff94add8a2fe81b55e530b509c072f2edd3b 100644 (file)
@@ -231,6 +231,77 @@ _root = Root(meta=default_dir_mode)
 _tags = Tags(meta=default_dir_mode)
 
 
+### vfs cache
+
+### A general purpose shared cache with (currently) cheap random
+### eviction.  There is currently no weighting so a single commit item
+### is just as likely to be evicted as an entire "rev-list".  See
+### is_valid_cache_key for a description of the expected content.
+
+_cache = {}
+_cache_keys = []
+_cache_max_items = 30000
+
+def clear_cache():
+    global _cache, _cache_keys
+    _cache = {}
+    _cache_keys = []
+
+def is_valid_cache_key(x):
+    """Return logically true if x looks like it could be a valid cache key
+    (with respect to structure).  Current valid cache entries:
+      commit_oid -> commit
+      commit_oid + ':r' -> rev-list
+         i.e. rev-list -> {'.', commit, '2012...', next_commit, ...}
+    """
+    # Suspect we may eventually add "(container_oid, name) -> ...", and others.
+    x_t = type(x)
+    if x_t is bytes:
+        if len(x) == 20:
+            return True
+        if len(x) == 22 and x.endswith(b':r'):
+            return True
+
+def cache_get(key):
+    global _cache
+    assert is_valid_cache_key(key)
+    return _cache.get(key)
+
+def cache_notice(key, value):
+    global _cache, _cache_keys, _cache_max_items
+    assert is_valid_cache_key(key)
+    if key in _cache:
+        return
+    _cache[key] = value
+    if len(_cache) < _cache_max_items:
+        return
+    victim_i = random.randrange(0, len(_cache_keys))
+    victim = _cache_keys[victim_i]
+    _cache_keys[victim_i] = key
+    _cache.pop(victim)
+
+
+def cache_get_commit_item(oid, need_meta=True):
+    """Return the requested tree item if it can be found in the cache.
+    When need_meta is true don't return a cached item that only has a
+    mode."""
+    # tree might be stored independently, or as '.' with its entries.
+    item = cache_get(oid)
+    if item:
+        if not need_meta:
+            return item
+        if isinstance(item.meta, Metadata):
+            return item
+    entries = cache_get(oid + b':r')
+    if entries:
+        return entries['.']
+
+def cache_get_revlist_item(oid, need_meta=True):
+    commit = cache_get_commit_item(oid, need_meta=need_meta)
+    if commit:
+        return RevList(oid=oid, meta=commit.meta)
+
+
 def copy_item(item):
     """Return a completely independent copy of item, such that
     modifications will not affect the original.
@@ -341,28 +412,25 @@ def _commit_item_from_data(oid, data):
                   coid=oid)
 
 def _commit_item_from_oid(repo, oid, require_meta):
+    commit = cache_get_commit_item(oid, need_meta=require_meta)
+    if commit and ((not require_meta) or isinstance(commit.meta, Metadata)):
+        return commit
     it = repo.cat(oid.encode('hex'))
     _, typ, size = next(it)
     assert typ == 'commit'
     commit = _commit_item_from_data(oid, ''.join(it))
     if require_meta:
-        meta = _find_treeish_oid_metadata(repo, commit.tree)
+        meta = _find_treeish_oid_metadata(repo, commit.oid)
         if meta:
             commit = commit._replace(meta=meta)
+    cache_notice(oid, commit)
     return commit
 
 def _revlist_item_from_oid(repo, oid, require_meta):
-    if require_meta:
-        meta = _find_treeish_oid_metadata(repo, oid) or default_dir_mode
-    else:
-        meta = default_dir_mode
-    return RevList(oid=oid, meta=meta)
-
-def parse_rev_auth_secs(f):
-    tree, author_secs = f.readline().split(None, 2)
-    return tree, int(author_secs)
+    commit = _commit_item_from_oid(repo, oid, require_meta)
+    return RevList(oid=oid, meta=commit.meta)
 
-def root_items(repo, names=None):
+def root_items(repo, names=None, want_meta=True):
     """Yield (name, item) for the items in '/' in the VFS.  Return
     everything if names is logically false, otherwise return only
     items with a name in the collection.
@@ -379,7 +447,7 @@ def root_items(repo, names=None):
         # in parallel (i.e. meta vs refs).
         for name, oid in tuple(repo.refs([], limit_to_heads=True)):
             assert(name.startswith('refs/heads/'))
-            yield name[11:], _revlist_item_from_oid(repo, oid, False)
+            yield name[11:], _revlist_item_from_oid(repo, oid, want_meta)
         return
 
     if '.' in names:
@@ -396,7 +464,7 @@ def root_items(repo, names=None):
             continue
         assert typ == 'commit'
         commit = parse_commit(''.join(it))
-        yield ref, _revlist_item_from_oid(repo, oidx.decode('hex'), False)
+        yield ref, _revlist_item_from_oid(repo, oidx.decode('hex'), want_meta)
 
 def ordered_tree_entries(tree_data, bupm=None):
     """Yields (name, mangled_name, kind, gitmode, oid) for each item in
@@ -511,59 +579,78 @@ def _reverse_suffix_duplicates(strs):
             for i in xrange(ndup - 1, -1, -1):
                 yield fmt % (name, i)
 
+def parse_rev(f):
+    items = f.readline().split(None)
+    assert len(items) == 2
+    tree, auth_sec = items
+    return tree.decode('hex'), int(auth_sec)
+
 def _name_for_rev(rev):
-    commit, (tree_oidx, utc) = rev
-    assert len(commit) == 40
+    commit_oidx, (tree_oid, utc) = rev
     return strftime('%Y-%m-%d-%H%M%S', localtime(utc))
 
 def _item_for_rev(rev):
-    commit, (tree_oidx, utc) = rev
-    assert len(commit) == 40
-    assert len(tree_oidx) == 40
-    return Commit(meta=default_dir_mode,
-                  oid=tree_oidx.decode('hex'),
-                  coid=commit.decode('hex'))
+    commit_oidx, (tree_oid, utc) = rev
+    coid = commit_oidx.decode('hex')
+    item = cache_get_commit_item(coid, need_meta=False)
+    if item:
+        return item
+    item = Commit(meta=default_dir_mode, oid=tree_oid, coid=coid)
+    cache_notice(item.coid, item)
+    return item
 
-def revlist_items(repo, oid, names):
-    assert len(oid) == 20
-    oidx = oid.encode('hex')
-    names = frozenset(name for name in (names or tuple()) \
-                      if _save_name_rx.match(name) or name in ('.', 'latest'))
-    # Do this before we open the rev_list iterator so we're not nesting
-    if (not names) or ('.' in names):
-        yield '.', _revlist_item_from_oid(repo, oid, True)
+def cache_commit(repo, oid):
+    """Build, cache, and return a "name -> commit_item" dict of the entire
+    commit rev-list.
 
-    revs = repo.rev_list((oidx,), format='%T %at', parse=parse_rev_auth_secs)
+    """
+    # For now, always cache with full metadata
+    entries = {}
+    entries['.'] = _revlist_item_from_oid(repo, oid, True)
+    revs = repo.rev_list((oid.encode('hex'),), format='%T %at',
+                         parse=parse_rev)
     rev_items, rev_names = tee(revs)
     revs = None  # Don't disturb the tees
     rev_names = _reverse_suffix_duplicates(_name_for_rev(x) for x in rev_names)
     rev_items = (_item_for_rev(x) for x in rev_items)
-    first_commit = None
+    latest = None
+    for item in rev_items:
+        latest = latest or item
+        name = next(rev_names)
+        entries[name] = item
+    entries['latest'] = latest
+    cache_notice(latest.coid + b':r', entries)
+    return entries
+
+def revlist_items(repo, oid, names):
+    assert len(oid) == 20
+
+    # Special case '.' instead of caching the whole history since it's
+    # the only way to get the metadata for the commit.
+    if names and all(x == '.' for x in names):
+        yield '.', _revlist_item_from_oid(repo, oid, True)
+        return
+
+    # For now, don't worry about the possibility of the contents being
+    # "too big" for the cache.
+    entries = cache_get(oid + b':r')
+    if not entries:
+        entries = cache_commit(repo, oid)
 
     if not names:
-        for item in rev_items:
-            first_commit = first_commit or item
-            yield next(rev_names), item
-        yield 'latest', first_commit
+        for name in sorted(entries.keys()):
+            yield name, entries[name]
         return
 
-    # Revs are in reverse chronological order by default
-    last_name = min(names)
-    for item in rev_items:
-        first_commit = first_commit or item
-        name = next(rev_names)  # Might have -N dup suffix
-        if name < last_name:
-            break
-        if not name in names:
-            continue
-        yield name, item
+    names = frozenset(name for name in names
+                      if _save_name_rx.match(name) or name in ('.', 'latest'))
 
-    # FIXME: need real short circuit...
-    for _ in rev_items: pass
-    for _ in rev_names: pass
-        
-    if 'latest' in names:
-        yield 'latest', first_commit
+    if '.' in names:
+        yield '.', entries['.']
+    for name in (n for n in names if n != '.'):
+        commit = entries.get(name)
+        if commit:
+            yield name, commit
 
 def tags_items(repo, names):
     global _tags
@@ -574,7 +661,8 @@ def tags_items(repo, names):
         it = repo.cat(oidx)
         _, typ, size = next(it)
         if typ == 'commit':
-            return _commit_item_from_data(oid, ''.join(it))
+            return cache_get_commit_item(oid, need_meta=False) \
+                or _commit_item_from_data(oid, ''.join(it))
         for _ in it: pass
         if typ == 'blob':
             return Item(meta=default_file_mode, oid=oid)
@@ -661,7 +749,7 @@ def contents(repo, item, names=None, want_meta=True):
     elif item_t == RevList:
         item_gen = revlist_items(repo, item.oid, names)
     elif item_t == Root:
-        item_gen = root_items(repo, names)
+        item_gen = root_items(repo, names, want_meta)
     elif item_t == Tags:
         item_gen = tags_items(repo, names)
     else: