lib/bup/gc.py

   1
   2 from __future__ import absolute_import
   3 from binascii import hexlify, unhexlify
   4 from os.path import basename
   5 import glob, os, subprocess, sys, tempfile
   6
   7 from bup import bloom, git, midx
   8 from bup.compat import hexstr, range
   9 from bup.git import MissingObject, walk_object
  10 from bup.helpers import Nonlocal, log, progress, qprogress
  11 from bup.io import path_msg
  12
  13 # This garbage collector uses a Bloom filter to track the live objects
  14 # during the mark phase.  This means that the collection is
  15 # probabilistic; it may retain some (known) percentage of garbage, but
  16 # it can also work within a reasonable, fixed RAM budget for any
  17 # particular percentage and repository size.
  18 #
  19 # The collection proceeds as follows:
  20 #
  21 #   - Scan all live objects by walking all of the refs, and insert
  22 #     every hash encountered into a new Bloom "liveness" filter.
  23 #     Compute the size of the liveness filter based on the total
  24 #     number of objects in the repository.  This is the "mark phase".
  25 #
  26 #   - Clear the data that's dependent on the repository's object set,
  27 #     i.e. the reflog, the normal Bloom filter, and the midxes.
  28 #
  29 #   - Traverse all of the pack files, consulting the liveness filter
  30 #     to decide which objects to keep.
  31 #
  32 #     For each pack file, rewrite it iff it probably contains more
  33 #     than (currently) 10% garbage (computed by an initial traversal
  34 #     of the packfile in consultation with the liveness filter).  To
  35 #     rewrite, traverse the packfile (again) and write each hash that
  36 #     tests positive against the liveness filter to a packwriter.
  37 #
  38 #     During the traversal of all of the packfiles, delete redundant,
  39 #     old packfiles only after the packwriter has finished the pack
  40 #     that contains all of their live objects.
  41 #
  42 # The current code unconditionally tracks the set of tree hashes seen
  43 # during the mark phase, and skips any that have already been visited.
  44 # This should decrease the IO load at the cost of increased RAM use.
  45
  46 # FIXME: add a bloom filter tuning parameter?
  47
  48
  49 def count_objects(dir, verbosity):
  50     # For now we'll just use open_idx(), but we could probably be much
  51     # more efficient since all we need is a single integer (the last
  52     # fanout entry) from each index.
  53     object_count = 0
  54     indexes = glob.glob(os.path.join(dir, b'*.idx'))
  55     for i, idx_name in enumerate(indexes):
  56         if verbosity:
  57             log('found %d objects (%d/%d %s)\r'
  58                 % (object_count, i + 1, len(indexes),
  59                    path_msg(basename(idx_name))))
  60         idx = git.open_idx(idx_name)
  61         object_count += len(idx)
  62     return object_count
  63
  64
  65 def report_live_item(n, total, ref_name, ref_id, item, verbosity):
  66     status = 'scanned %02.2f%%' % (n * 100.0 / total)
  67     hex_id = hexstr(ref_id)
  68     dirslash = b'/' if item.type == b'tree' else b''
  69     chunk_path = item.chunk_path
  70
  71     if chunk_path:
  72         if verbosity < 4:
  73             return
  74         ps = b'/'.join(item.path)
  75         chunk_ps = b'/'.join(chunk_path)
  76         log('%s %s:%s/%s%s\n' % (status, hex_id, path_msg(ps),
  77                                  path_msg(chunk_ps), path_msg(dirslash)))
  78         return
  79
  80     # Top commit, for example has none.
  81     demangled = git.demangle_name(item.path[-1], item.mode)[0] if item.path \
  82                 else None
  83
  84     # Don't print mangled paths unless the verbosity is over 3.
  85     if demangled:
  86         ps = b'/'.join(item.path[:-1] + [demangled])
  87         if verbosity == 1:
  88             qprogress('%s %s:%s%s\r' % (status, hex_id, path_msg(ps),
  89                                         path_msg(dirslash)))
  90         elif (verbosity > 1 and item.type == b'tree') \
  91              or (verbosity > 2 and item.type == b'blob'):
  92             log('%s %s:%s%s\n' % (status, hex_id, path_msg(ps),
  93                                   path_msg(dirslash)))
  94     elif verbosity > 3:
  95         ps = b'/'.join(item.path)
  96         log('%s %s:%s%s\n' % (status, hex_id, path_msg(ps), path_msg(dirslash)))
  97
  98
  99 def find_live_objects(existing_count, cat_pipe, verbosity=0):
 100     prune_visited_trees = True # In case we want a command line option later
 101     pack_dir = git.repo(b'objects/pack')
 102     ffd, bloom_filename = tempfile.mkstemp(b'.bloom', b'tmp-gc-', pack_dir)
 103     os.close(ffd)
 104     # FIXME: allow selection of k?
 105     # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
 106     live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
 107     # live_objs will hold on to the fd until close or exit
 108     os.unlink(bloom_filename)
 109     stop_at, trees_visited = None, None
 110     if prune_visited_trees:
 111         trees_visited = set()
 112         stop_at = lambda x: unhexlify(x) in trees_visited
 113     approx_live_count = 0
 114     for ref_name, ref_id in git.list_refs():
 115         for item in walk_object(cat_pipe.get, hexlify(ref_id), stop_at=stop_at,
 116                                 include_data=None):
 117             # FIXME: batch ids
 118             if verbosity:
 119                 report_live_item(approx_live_count, existing_count,
 120                                  ref_name, ref_id, item, verbosity)
 121             if trees_visited is not None and item.type == b'tree':
 122                 trees_visited.add(item.oid)
 123             if verbosity:
 124                 if not live_objs.exists(item.oid):
 125                     live_objs.add(item.oid)
 126                     approx_live_count += 1
 127             else:
 128                 live_objs.add(item.oid)
 129     trees_visited = None
 130     if verbosity:
 131         log('expecting to retain about %.2f%% unnecessary objects\n'
 132             % live_objs.pfalse_positive())
 133     return live_objs
 134
 135
 136 def sweep(live_objects, existing_count, cat_pipe, threshold, compression,
 137           verbosity):
 138     # Traverse all the packs, saving the (probably) live data.
 139
 140     ns = Nonlocal()
 141     ns.stale_files = []
 142     def remove_stale_files(new_pack_prefix):
 143         if verbosity and new_pack_prefix:
 144             log('created ' + path_msg(basename(new_pack_prefix)) + '\n')
 145         for p in ns.stale_files:
 146             if new_pack_prefix and p.startswith(new_pack_prefix):
 147                 continue  # Don't remove the new pack file
 148             if verbosity:
 149                 log('removing ' + path_msg(basename(p)) + '\n')
 150             os.unlink(p)
 151         if ns.stale_files:  # So git cat-pipe will close them
 152             cat_pipe.restart()
 153         ns.stale_files = []
 154
 155     writer = git.PackWriter(objcache_maker=None,
 156                             compression_level=compression,
 157                             run_midx=False,
 158                             on_pack_finish=remove_stale_files)
 159
 160     # FIXME: sanity check .idx names vs .pack names?
 161     collect_count = 0
 162     for idx_name in glob.glob(os.path.join(git.repo(b'objects/pack'), b'*.idx')):
 163         if verbosity:
 164             qprogress('preserving live data (%d%% complete)\r'
 165                       % ((float(collect_count) / existing_count) * 100))
 166         with git.open_idx(idx_name) as idx:
 167             idx_live_count = 0
 168             for sha in idx:
 169                 if live_objects.exists(sha):
 170                     idx_live_count += 1
 171
 172             collect_count += idx_live_count
 173             if idx_live_count == 0:
 174                 if verbosity:
 175                     log('deleting %s\n'
 176                         % path_msg(git.repo_rel(basename(idx_name))))
 177                 ns.stale_files.append(idx_name)
 178                 ns.stale_files.append(idx_name[:-3] + b'pack')
 179                 continue
 180
 181             live_frac = idx_live_count / float(len(idx))
 182             if live_frac > ((100 - threshold) / 100.0):
 183                 if verbosity:
 184                     log('keeping %s (%d%% live)\n' % (git.repo_rel(basename(idx_name)),
 185                                                     live_frac * 100))
 186                 continue
 187
 188             if verbosity:
 189                 log('rewriting %s (%.2f%% live)\n' % (basename(idx_name),
 190                                                     live_frac * 100))
 191             for sha in idx:
 192                 if live_objects.exists(sha):
 193                     item_it = cat_pipe.get(hexlify(sha))
 194                     _, typ, _ = next(item_it)
 195                     writer.just_write(sha, typ, b''.join(item_it))
 196
 197             ns.stale_files.append(idx_name)
 198             ns.stale_files.append(idx_name[:-3] + b'pack')
 199
 200     if verbosity:
 201         progress('preserving live data (%d%% complete)\n'
 202                  % ((float(collect_count) / existing_count) * 100))
 203
 204     # Nothing should have recreated midx/bloom yet.
 205     pack_dir = git.repo(b'objects/pack')
 206     assert(not os.path.exists(os.path.join(pack_dir, b'bup.bloom')))
 207     assert(not glob.glob(os.path.join(pack_dir, b'*.midx')))
 208
 209     # try/catch should call writer.abort()?
 210     # This will finally run midx.
 211     writer.close()  # Can only change refs (if needed) after this.
 212     remove_stale_files(None)  # In case we didn't write to the writer.
 213
 214     if verbosity:
 215         log('discarded %d%% of objects\n'
 216             % ((existing_count - count_objects(pack_dir, verbosity))
 217                / float(existing_count) * 100))
 218
 219
 220 def bup_gc(threshold=10, compression=1, verbosity=0):
 221     cat_pipe = git.cp()
 222     existing_count = count_objects(git.repo(b'objects/pack'), verbosity)
 223     if verbosity:
 224         log('found %d objects\n' % existing_count)
 225     if not existing_count:
 226         if verbosity:
 227             log('nothing to collect\n')
 228     else:
 229         try:
 230             live_objects = find_live_objects(existing_count, cat_pipe,
 231                                              verbosity=verbosity)
 232         except MissingObject as ex:
 233             log('bup: missing object %r \n' % hexstr(ex.oid))
 234             sys.exit(1)
 235         try:
 236             # FIXME: just rename midxes and bloom, and restore them at the end if
 237             # we didn't change any packs?
 238             packdir = git.repo(b'objects/pack')
 239             if verbosity: log('clearing midx files\n')
 240             midx.clear_midxes(packdir)
 241             if verbosity: log('clearing bloom filter\n')
 242             bloom.clear_bloom(packdir)
 243             if verbosity: log('clearing reflog\n')
 244             expirelog_cmd = [b'git', b'reflog', b'expire', b'--all', b'--expire=all']
 245             expirelog = subprocess.Popen(expirelog_cmd, env=git._gitenv())
 246             git._git_wait(b' '.join(expirelog_cmd), expirelog)
 247             if verbosity: log('removing unreachable data\n')
 248             sweep(live_objects, existing_count, cat_pipe,
 249                   threshold, compression,
 250                   verbosity)
 251         finally:
 252             live_objects.close()