From a4226ce05b0b6f7bda71bfb2d275fcf5f26a3320 Mon Sep 17 00:00:00 2001
From: Rob Browning
Date: Sat, 10 Sep 2016 11:29:50 -0500
Subject: [PATCH] gc: move core code to bup.gc module

Signed-off-by: Rob Browning
Tested-by: Rob Browning
---
 cmd/gc-cmd.py | 251 ++------------------------------------------------
 lib/bup/gc.py | 245 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 253 insertions(+), 243 deletions(-)
 create mode 100644 lib/bup/gc.py

diff --git a/cmd/gc-cmd.py b/cmd/gc-cmd.py
index 90874c3..e0782ca 100755
--- a/cmd/gc-cmd.py
+++ b/cmd/gc-cmd.py
@@ -4,46 +4,12 @@ bup_python="$(dirname "$0")/bup-python" || exit $?
 exec "$bup_python" "$0" ${1+"$@"}
 """
 # end of bup preamble
 
-import glob, os, stat, subprocess, sys, tempfile
-from bup import bloom, git, midx, options, vfs
-from bup.git import MissingObject, walk_object
-from bup.helpers import handle_ctrl_c, log, progress, qprogress, saved_errors
-from os.path import basename
-
-# This command uses a Bloom filter to track the live objects during
-# the mark phase.  This means that the collection is probabilistic; it
-# may retain some (known) percentage of garbage, but it can also work
-# within a reasonable, fixed RAM budget for any particular percentage
-# and repository size.
-#
-# The collection proceeds as follows:
-#
-#   - Scan all live objects by walking all of the refs, and insert
-#     every hash encountered into a new Bloom "liveness" filter.
-#     Compute the size of the liveness filter based on the total
-#     number of objects in the repository.  This is the "mark phase".
-#
-#   - Clear the data that's dependent on the repository's object set,
-#     i.e. the reflog, the normal Bloom filter, and the midxes.
-#
-#   - Traverse all of the pack files, consulting the liveness filter
-#     to decide which objects to keep.
-#
-#     For each pack file, rewrite it iff it probably contains more
-#     than (currently) 10% garbage (computed by an initial traversal
-#     of the packfile in consultation with the liveness filter).  To
-#     rewrite, traverse the packfile (again) and write each hash that
-#     tests positive against the liveness filter to a packwriter.
-#
-#     During the traversal of all of the packfiles, delete redundant,
-#     old packfiles only after the packwriter has finished the pack
-#     that contains all of their live objects.
-#
-# The current code unconditionally tracks the set of tree hashes seen
-# during the mark phase, and skips any that have already been visited.
-# This should decrease the IO load at the cost of increased RAM use.
+import sys
 
-# FIXME: add a bloom filter tuning parameter?
+from bup import git, options
+from bup.gc import bup_gc
+from bup.helpers import die_if_errors, handle_ctrl_c, log
 
 optspec = """
@@ -55,179 +21,6 @@ threshold=  only rewrite a packfile if it's over this percent garbage [10]
 unsafe      use the command even though it may be DANGEROUS
 """
-
-class Nonlocal:
-    pass
-
-
-def count_objects(dir):
-    # For now we'll just use open_idx(), but we could probably be much
-    # more efficient since all we need is a single integer (the last
-    # fanout entry) from each index.
-    global opt
-    object_count = 0
-    indexes = glob.glob(os.path.join(dir, '*.idx'))
-    for i, idx_name in enumerate(indexes):
-        if opt.verbose:
-            log('found %d objects (%d/%d %s)\r'
-                % (object_count, i + 1, len(indexes),
-                   os.path.basename(idx_name)))
-        idx = git.open_idx(idx_name)
-        object_count += len(idx)
-    return object_count
-
-
-def report_live_item(n, total, ref_name, ref_id, item):
-    global opt
-    status = 'scanned %02.2f%%' % (n * 100.0 / total)
-    hex_id = ref_id.encode('hex')
-    dirslash = '/' if item.type == 'tree' else ''
-    chunk_path = item.chunk_path
-
-    if chunk_path:
-        if opt.verbose < 4:
-            return
-        ps = '/'.join(item.path)
-        chunk_ps = '/'.join(chunk_path)
-        log('%s %s:%s/%s%s\n' % (status, hex_id, ps, chunk_ps, dirslash))
-        return
-
-    # Top commit, for example has none.
-    demangled = git.demangle_name(item.path[-1], item.mode)[0] if item.path \
-                else None
-
-    # Don't print mangled paths unless the verbosity is over 3.
-    if demangled:
-        ps = '/'.join(item.path[:-1] + [demangled])
-        if opt.verbose == 1:
-            qprogress('%s %s:%s%s\r' % (status, hex_id, ps, dirslash))
-        elif (opt.verbose > 1 and item.type == 'tree') \
-             or (opt.verbose > 2 and item.type == 'blob'):
-            log('%s %s:%s%s\n' % (status, hex_id, ps, dirslash))
-    elif opt.verbose > 3:
-        ps = '/'.join(item.path)
-        log('%s %s:%s%s\n' % (status, hex_id, ps, dirslash))
-
-
-def find_live_objects(existing_count, cat_pipe, opt):
-    prune_visited_trees = True # In case we want a command line option later
-    pack_dir = git.repo('objects/pack')
-    ffd, bloom_filename = tempfile.mkstemp('.bloom', 'tmp-gc-', pack_dir)
-    os.close(ffd)
-    # FIXME: allow selection of k?
-    # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
-    live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
-    stop_at, trees_visited = None, None
-    if prune_visited_trees:
-        trees_visited = set()
-        stop_at = lambda (x): x.decode('hex') in trees_visited
-    approx_live_count = 0
-    for ref_name, ref_id in git.list_refs():
-        for item in walk_object(cat_pipe, ref_id.encode('hex'),
-                                stop_at=stop_at,
-                                include_data=None):
-            # FIXME: batch ids
-            if opt.verbose:
-                report_live_item(approx_live_count, existing_count,
-                                 ref_name, ref_id, item)
-            bin_id = item.id.decode('hex')
-            if trees_visited is not None and item.type == 'tree':
-                trees_visited.add(bin_id)
-            if opt.verbose:
-                if not live_objs.exists(bin_id):
-                    live_objs.add(bin_id)
-                    approx_live_count += 1
-            else:
-                live_objs.add(bin_id)
-    trees_visited = None
-    if opt.verbose:
-        log('expecting to retain about %.2f%% unnecessary objects\n'
-            % live_objs.pfalse_positive())
-    return live_objs
-
-
-def sweep(live_objects, existing_count, cat_pipe, opt):
-    # Traverse all the packs, saving the (probably) live data.
-
-    ns = Nonlocal()
-    ns.stale_files = []
-    def remove_stale_files(new_pack_prefix):
-        if opt.verbose and new_pack_prefix:
-            log('created ' + basename(new_pack_prefix) + '\n')
-        for p in ns.stale_files:
-            if opt.verbose:
-                log('removing ' + basename(p) + '\n')
-            os.unlink(p)
-        ns.stale_files = []
-
-    writer = git.PackWriter(objcache_maker=None,
-                            compression_level=opt.compress,
-                            run_midx=False,
-                            on_pack_finish=remove_stale_files)
-
-    # FIXME: sanity check .idx names vs .pack names?
-    collect_count = 0
-    for idx_name in glob.glob(os.path.join(git.repo('objects/pack'), '*.idx')):
-        if opt.verbose:
-            qprogress('preserving live data (%d%% complete)\r'
-                      % ((float(collect_count) / existing_count) * 100))
-        idx = git.open_idx(idx_name)
-
-        idx_live_count = 0
-        for i in xrange(0, len(idx)):
-            sha = idx.shatable[i * 20 : (i + 1) * 20]
-            if live_objects.exists(sha):
-                idx_live_count += 1
-
-        collect_count += idx_live_count
-        if idx_live_count == 0:
-            if opt.verbose:
-                log('deleting %s\n'
-                    % git.repo_rel(basename(idx_name)))
-            ns.stale_files.append(idx_name)
-            ns.stale_files.append(idx_name[:-3] + 'pack')
-            continue
-
-        live_frac = idx_live_count / float(len(idx))
-        if live_frac > ((100 - opt.threshold) / 100.0):
-            if opt.verbose:
-                log('keeping %s (%d%% live)\n' % (git.repo_rel(basename(idx_name)),
-                                                  live_frac * 100))
-            continue
-
-        if opt.verbose:
-            log('rewriting %s (%.2f%% live)\n' % (basename(idx_name),
-                                                  live_frac * 100))
-        for i in xrange(0, len(idx)):
-            sha = idx.shatable[i * 20 : (i + 1) * 20]
-            if live_objects.exists(sha):
-                item_it = cat_pipe.get(sha.encode('hex'))
-                type = item_it.next()
-                writer.write(sha, type, ''.join(item_it))
-
-        ns.stale_files.append(idx_name)
-        ns.stale_files.append(idx_name[:-3] + 'pack')
-
-    if opt.verbose:
-        progress('preserving live data (%d%% complete)\n'
-                 % ((float(collect_count) / existing_count) * 100))
-
-    # Nothing should have recreated midx/bloom yet.
-    pack_dir = git.repo('objects/pack')
-    assert(not os.path.exists(os.path.join(pack_dir, 'bup.bloom')))
-    assert(not glob.glob(os.path.join(pack_dir, '*.midx')))
-
-    # try/catch should call writer.abort()?
-    # This will finally run midx.
-    writer.close() # Can only change refs (if needed) after this.
-    remove_stale_files(None) # In case we didn't write to the writer.
-
-    if opt.verbose:
-        log('discarded %d%% of objects\n'
-            % ((existing_count - count_objects(pack_dir))
-               / float(existing_count) * 100))
-
-
 # FIXME: server mode?
 # FIXME: make sure client handles server-side changes reasonably
@@ -252,36 +45,8 @@ if opt.threshold:
 
 git.check_repo_or_die()
 
-cat_pipe = vfs.cp()
-existing_count = count_objects(git.repo('objects/pack'))
-if opt.verbose:
-    log('found %d objects\n' % existing_count)
-if not existing_count:
-    if opt.verbose:
-        log('nothing to collect\n')
-else:
-    try:
-        live_objects = find_live_objects(existing_count, cat_pipe, opt)
-    except MissingObject as ex:
-        log('bup: missing object %r \n' % ex.id.encode('hex'))
-        sys.exit(1)
-    try:
-        # FIXME: just rename midxes and bloom, and restore them at the end if
-        # we didn't change any packs?
-        if opt.verbose: log('clearing midx files\n')
-        midx.clear_midxes()
-        if opt.verbose: log('clearing bloom filter\n')
-        bloom.clear_bloom(git.repo('objects/pack'))
-        if opt.verbose: log('clearing reflog\n')
-        expirelog_cmd = ['git', 'reflog', 'expire', '--all', '--expire=all']
-        expirelog = subprocess.Popen(expirelog_cmd, preexec_fn = git._gitenv())
-        git._git_wait(' '.join(expirelog_cmd), expirelog)
-        if opt.verbose: log('removing unreachable data\n')
-        sweep(live_objects, existing_count, cat_pipe, opt)
-    finally:
-        live_objects.close()
-        os.unlink(live_objects.name)
+bup_gc(threshold=opt.threshold,
+       compression=opt.compress,
+       verbosity=opt.verbose)
 
-if saved_errors:
-    log('WARNING: %d errors encountered during gc\n' % len(saved_errors))
-    sys.exit(1)
+die_if_errors()
diff --git a/lib/bup/gc.py b/lib/bup/gc.py
new file mode 100644
index 0000000..d85de74
--- /dev/null
+++ b/lib/bup/gc.py
@@ -0,0 +1,245 @@
+import glob, os, subprocess, sys, tempfile
+from bup import bloom, git, midx
+from bup.git import MissingObject, walk_object
+from bup.helpers import log, progress, qprogress
+from os.path import basename
+
+# This garbage collector uses a Bloom filter to track the live objects
+# during the mark phase.  This means that the collection is
+# probabilistic; it may retain some (known) percentage of garbage, but
+# it can also work within a reasonable, fixed RAM budget for any
+# particular percentage and repository size.
+#
+# The collection proceeds as follows:
+#
+#   - Scan all live objects by walking all of the refs, and insert
+#     every hash encountered into a new Bloom "liveness" filter.
+#     Compute the size of the liveness filter based on the total
+#     number of objects in the repository.  This is the "mark phase".
+#
+#   - Clear the data that's dependent on the repository's object set,
+#     i.e. the reflog, the normal Bloom filter, and the midxes.
+#
+#   - Traverse all of the pack files, consulting the liveness filter
+#     to decide which objects to keep.
+#
+#     For each pack file, rewrite it iff it probably contains more
+#     than (currently) 10% garbage (computed by an initial traversal
+#     of the packfile in consultation with the liveness filter).  To
+#     rewrite, traverse the packfile (again) and write each hash that
+#     tests positive against the liveness filter to a packwriter.
+#
+#     During the traversal of all of the packfiles, delete redundant,
+#     old packfiles only after the packwriter has finished the pack
+#     that contains all of their live objects.
+#
+# The current code unconditionally tracks the set of tree hashes seen
+# during the mark phase, and skips any that have already been visited.
+# This should decrease the IO load at the cost of increased RAM use.
+
+# FIXME: add a bloom filter tuning parameter?
+
+
+class Nonlocal:
+    pass
+
+
+def count_objects(dir, verbosity):
+    # For now we'll just use open_idx(), but we could probably be much
+    # more efficient since all we need is a single integer (the last
+    # fanout entry) from each index.
+    object_count = 0
+    indexes = glob.glob(os.path.join(dir, '*.idx'))
+    for i, idx_name in enumerate(indexes):
+        if verbosity:
+            log('found %d objects (%d/%d %s)\r'
+                % (object_count, i + 1, len(indexes), basename(idx_name)))
+        idx = git.open_idx(idx_name)
+        object_count += len(idx)
+    return object_count
+
+
+def report_live_item(n, total, ref_name, ref_id, item, verbosity):
+    status = 'scanned %02.2f%%' % (n * 100.0 / total)
+    hex_id = ref_id.encode('hex')
+    dirslash = '/' if item.type == 'tree' else ''
+    chunk_path = item.chunk_path
+
+    if chunk_path:
+        if verbosity < 4:
+            return
+        ps = '/'.join(item.path)
+        chunk_ps = '/'.join(chunk_path)
+        log('%s %s:%s/%s%s\n' % (status, hex_id, ps, chunk_ps, dirslash))
+        return
+
+    # Top commit, for example has none.
+    demangled = git.demangle_name(item.path[-1], item.mode)[0] if item.path \
+                else None
+
+    # Don't print mangled paths unless the verbosity is over 3.
+    if demangled:
+        ps = '/'.join(item.path[:-1] + [demangled])
+        if verbosity == 1:
+            qprogress('%s %s:%s%s\r' % (status, hex_id, ps, dirslash))
+        elif (verbosity > 1 and item.type == 'tree') \
+             or (verbosity > 2 and item.type == 'blob'):
+            log('%s %s:%s%s\n' % (status, hex_id, ps, dirslash))
+    elif verbosity > 3:
+        ps = '/'.join(item.path)
+        log('%s %s:%s%s\n' % (status, hex_id, ps, dirslash))
+
+
+def find_live_objects(existing_count, cat_pipe, verbosity=0):
+    prune_visited_trees = True # In case we want a command line option later
+    pack_dir = git.repo('objects/pack')
+    ffd, bloom_filename = tempfile.mkstemp('.bloom', 'tmp-gc-', pack_dir)
+    os.close(ffd)
+    # FIXME: allow selection of k?
+    # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
+    live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
+    stop_at, trees_visited = None, None
+    if prune_visited_trees:
+        trees_visited = set()
+        stop_at = lambda (x): x.decode('hex') in trees_visited
+    approx_live_count = 0
+    for ref_name, ref_id in git.list_refs():
+        for item in walk_object(cat_pipe, ref_id.encode('hex'),
+                                stop_at=stop_at,
+                                include_data=None):
+            # FIXME: batch ids
+            if verbosity:
+                report_live_item(approx_live_count, existing_count,
+                                 ref_name, ref_id, item, verbosity)
+            bin_id = item.id.decode('hex')
+            if trees_visited is not None and item.type == 'tree':
+                trees_visited.add(bin_id)
+            if verbosity:
+                if not live_objs.exists(bin_id):
+                    live_objs.add(bin_id)
+                    approx_live_count += 1
+            else:
+                live_objs.add(bin_id)
+    trees_visited = None
+    if verbosity:
+        log('expecting to retain about %.2f%% unnecessary objects\n'
+            % live_objs.pfalse_positive())
+    return live_objs
+
+
+def sweep(live_objects, existing_count, cat_pipe, threshold, compression,
+          verbosity):
+    # Traverse all the packs, saving the (probably) live data.
+
+    ns = Nonlocal()
+    ns.stale_files = []
+    def remove_stale_files(new_pack_prefix):
+        if verbosity and new_pack_prefix:
+            log('created ' + basename(new_pack_prefix) + '\n')
+        for p in ns.stale_files:
+            if verbosity:
+                log('removing ' + basename(p) + '\n')
+            os.unlink(p)
+        ns.stale_files = []
+
+    writer = git.PackWriter(objcache_maker=None,
+                            compression_level=compression,
+                            run_midx=False,
+                            on_pack_finish=remove_stale_files)
+
+    # FIXME: sanity check .idx names vs .pack names?
+    collect_count = 0
+    for idx_name in glob.glob(os.path.join(git.repo('objects/pack'), '*.idx')):
+        if verbosity:
+            qprogress('preserving live data (%d%% complete)\r'
+                      % ((float(collect_count) / existing_count) * 100))
+        idx = git.open_idx(idx_name)
+
+        idx_live_count = 0
+        for i in xrange(0, len(idx)):
+            sha = idx.shatable[i * 20 : (i + 1) * 20]
+            if live_objects.exists(sha):
+                idx_live_count += 1
+
+        collect_count += idx_live_count
+        if idx_live_count == 0:
+            if verbosity:
+                log('deleting %s\n'
+                    % git.repo_rel(basename(idx_name)))
+            ns.stale_files.append(idx_name)
+            ns.stale_files.append(idx_name[:-3] + 'pack')
+            continue
+
+        live_frac = idx_live_count / float(len(idx))
+        if live_frac > ((100 - threshold) / 100.0):
+            if verbosity:
+                log('keeping %s (%d%% live)\n' % (git.repo_rel(basename(idx_name)),
+                                                  live_frac * 100))
+            continue
+
+        if verbosity:
+            log('rewriting %s (%.2f%% live)\n' % (basename(idx_name),
+                                                  live_frac * 100))
+        for i in xrange(0, len(idx)):
+            sha = idx.shatable[i * 20 : (i + 1) * 20]
+            if live_objects.exists(sha):
+                item_it = cat_pipe.get(sha.encode('hex'))
+                type = item_it.next()
+                writer.write(sha, type, ''.join(item_it))
+
+        ns.stale_files.append(idx_name)
+        ns.stale_files.append(idx_name[:-3] + 'pack')
+
+    if verbosity:
+        progress('preserving live data (%d%% complete)\n'
+                 % ((float(collect_count) / existing_count) * 100))
+
+    # Nothing should have recreated midx/bloom yet.
+    pack_dir = git.repo('objects/pack')
+    assert(not os.path.exists(os.path.join(pack_dir, 'bup.bloom')))
+    assert(not glob.glob(os.path.join(pack_dir, '*.midx')))
+
+    # try/catch should call writer.abort()?
+    # This will finally run midx.
+    writer.close() # Can only change refs (if needed) after this.
+    remove_stale_files(None) # In case we didn't write to the writer.
+
+    if verbosity:
+        log('discarded %d%% of objects\n'
+            % ((existing_count - count_objects(pack_dir, verbosity))
+               / float(existing_count) * 100))
+
+
+def bup_gc(threshold=10, compression=1, verbosity=0):
+    cat_pipe = git.cp()
+    existing_count = count_objects(git.repo('objects/pack'), verbosity)
+    if verbosity:
+        log('found %d objects\n' % existing_count)
+    if not existing_count:
+        if verbosity:
+            log('nothing to collect\n')
+    else:
+        try:
+            live_objects = find_live_objects(existing_count, cat_pipe,
+                                             verbosity=verbosity)
+        except MissingObject as ex:
+            log('bup: missing object %r \n' % ex.id.encode('hex'))
+            sys.exit(1)
+        try:
+            # FIXME: just rename midxes and bloom, and restore them at the end if
+            # we didn't change any packs?
+            if verbosity: log('clearing midx files\n')
+            midx.clear_midxes()
+            if verbosity: log('clearing bloom filter\n')
+            bloom.clear_bloom(git.repo('objects/pack'))
+            if verbosity: log('clearing reflog\n')
+            expirelog_cmd = ['git', 'reflog', 'expire', '--all', '--expire=all']
+            expirelog = subprocess.Popen(expirelog_cmd, preexec_fn = git._gitenv())
+            git._git_wait(' '.join(expirelog_cmd), expirelog)
+            if verbosity: log('removing unreachable data\n')
+            sweep(live_objects, existing_count, cat_pipe,
+                  threshold, compression,
+                  verbosity)
+        finally:
+            live_objects.close()
+            os.unlink(live_objects.name)
-- 
2.39.2
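
Note on the mark phase described in the moved comment block (illustrative only, not
part of the patch): the collector walks every ref and records each reachable object
id in a Bloom "liveness" filter sized from the repository's total object count. The
minimal sketch below shows the shape of that walk, assuming a plain Python set as a
stand-in for bup's on-disk Bloom filter (so the probabilistic false-positive
behaviour is not reproduced) and a hypothetical walk(ref_id) helper in place of
bup.git.walk_object.

# Illustrative sketch only -- not bup's API.  A set replaces the Bloom
# "liveness" filter; `walk` is a hypothetical stand-in for walk_object.
def mark_live(refs, walk):
    live = set()
    for ref_id in refs:
        for obj_id in walk(ref_id):  # every object reachable from this ref
            live.add(obj_id)
    return live

# Toy reachability data: two refs sharing one object.
graph = {'r1': ['a', 'b', 'c'], 'r2': ['c', 'd']}
print(sorted(mark_live(graph, lambda r: graph[r])))  # ['a', 'b', 'c', 'd']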
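
The sweep decision is the same in the old command and the new module: a pack with no
live objects is deleted, a pack is kept when live_frac > (100 - threshold) / 100.0,
and anything else is rewritten with only its live objects. With the default threshold
of 10 this means a pack is rewritten only when more than 10% of its objects appear to
be garbage. A small check of that arithmetic, again as a sketch rather than the
patch's code, with packs modeled as plain lists of ids:

# Sketch of the per-pack decision made in sweep(); not the patch's code.
def pack_action(pack_ids, live, threshold=10):
    live_count = sum(1 for oid in pack_ids if oid in live)
    if live_count == 0:
        return 'delete'                  # no live objects: drop .idx/.pack
    live_frac = live_count / float(len(pack_ids))
    if live_frac > (100 - threshold) / 100.0:
        return 'keep'                    # under `threshold` percent garbage
    return 'rewrite'                     # copy only live objects to a new pack

live = {'a', 'b', 'c', 'd'}
print(pack_action(['a', 'b', 'c', 'd'], live))  # 100% live -> 'keep'
print(pack_action(['a', 'x', 'y', 'z'], live))  # 25% live  -> 'rewrite'
print(pack_action(['x', 'y'], live))            # 0% live   -> 'delete'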