lib/bup/gc.py

   1
   2 from __future__ import absolute_import
   3 import glob, os, subprocess, sys, tempfile
   4 from bup import bloom, git, midx
   5 from bup.compat import range
   6 from bup.git import MissingObject, walk_object
   7 from bup.helpers import Nonlocal, log, progress, qprogress
   8 from os.path import basename
   9
  10 # This garbage collector uses a Bloom filter to track the live objects
  11 # during the mark phase.  This means that the collection is
  12 # probabilistic; it may retain some (known) percentage of garbage, but
  13 # it can also work within a reasonable, fixed RAM budget for any
  14 # particular percentage and repository size.
  15 #
  16 # The collection proceeds as follows:
  17 #
  18 #   - Scan all live objects by walking all of the refs, and insert
  19 #     every hash encountered into a new Bloom "liveness" filter.
  20 #     Compute the size of the liveness filter based on the total
  21 #     number of objects in the repository.  This is the "mark phase".
  22 #
  23 #   - Clear the data that's dependent on the repository's object set,
  24 #     i.e. the reflog, the normal Bloom filter, and the midxes.
  25 #
  26 #   - Traverse all of the pack files, consulting the liveness filter
  27 #     to decide which objects to keep.
  28 #
  29 #     For each pack file, rewrite it iff it probably contains more
  30 #     than (currently) 10% garbage (computed by an initial traversal
  31 #     of the packfile in consultation with the liveness filter).  To
  32 #     rewrite, traverse the packfile (again) and write each hash that
  33 #     tests positive against the liveness filter to a packwriter.
  34 #
  35 #     During the traversal of all of the packfiles, delete redundant,
  36 #     old packfiles only after the packwriter has finished the pack
  37 #     that contains all of their live objects.
  38 #
  39 # The current code unconditionally tracks the set of tree hashes seen
  40 # during the mark phase, and skips any that have already been visited.
  41 # This should decrease the IO load at the cost of increased RAM use.
  42
  43 # FIXME: add a bloom filter tuning parameter?
  44
  45
  46 def count_objects(dir, verbosity):
  47     # For now we'll just use open_idx(), but we could probably be much
  48     # more efficient since all we need is a single integer (the last
  49     # fanout entry) from each index.
  50     object_count = 0
  51     indexes = glob.glob(os.path.join(dir, '*.idx'))
  52     for i, idx_name in enumerate(indexes):
  53         if verbosity:
  54             log('found %d objects (%d/%d %s)\r'
  55                 % (object_count, i + 1, len(indexes), basename(idx_name)))
  56         idx = git.open_idx(idx_name)
  57         object_count += len(idx)
  58     return object_count
  59
  60
  61 def report_live_item(n, total, ref_name, ref_id, item, verbosity):
  62     status = 'scanned %02.2f%%' % (n * 100.0 / total)
  63     hex_id = ref_id.encode('hex')
  64     dirslash = '/' if item.type == 'tree' else ''
  65     chunk_path = item.chunk_path
  66
  67     if chunk_path:
  68         if verbosity < 4:
  69             return
  70         ps = '/'.join(item.path)
  71         chunk_ps = '/'.join(chunk_path)
  72         log('%s %s:%s/%s%s\n' % (status, hex_id, ps, chunk_ps, dirslash))
  73         return
  74
  75     # Top commit, for example has none.
  76     demangled = git.demangle_name(item.path[-1], item.mode)[0] if item.path \
  77                 else None
  78
  79     # Don't print mangled paths unless the verbosity is over 3.
  80     if demangled:
  81         ps = '/'.join(item.path[:-1] + [demangled])
  82         if verbosity == 1:
  83             qprogress('%s %s:%s%s\r' % (status, hex_id, ps, dirslash))
  84         elif (verbosity > 1 and item.type == 'tree') \
  85              or (verbosity > 2 and item.type == 'blob'):
  86             log('%s %s:%s%s\n' % (status, hex_id, ps, dirslash))
  87     elif verbosity > 3:
  88         ps = '/'.join(item.path)
  89         log('%s %s:%s%s\n' % (status, hex_id, ps, dirslash))
  90
  91
  92 def find_live_objects(existing_count, cat_pipe, verbosity=0):
  93     prune_visited_trees = True # In case we want a command line option later
  94     pack_dir = git.repo('objects/pack')
  95     ffd, bloom_filename = tempfile.mkstemp('.bloom', 'tmp-gc-', pack_dir)
  96     os.close(ffd)
  97     # FIXME: allow selection of k?
  98     # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
  99     live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
 100     # live_objs will hold on to the fd until close or exit
 101     os.unlink(bloom_filename)
 102     stop_at, trees_visited = None, None
 103     if prune_visited_trees:
 104         trees_visited = set()
 105         stop_at = lambda x: x.decode('hex') in trees_visited
 106     approx_live_count = 0
 107     for ref_name, ref_id in git.list_refs():
 108         for item in walk_object(cat_pipe.get, ref_id.encode('hex'),
 109                                 stop_at=stop_at,
 110                                 include_data=None):
 111             # FIXME: batch ids
 112             if verbosity:
 113                 report_live_item(approx_live_count, existing_count,
 114                                  ref_name, ref_id, item, verbosity)
 115             if trees_visited is not None and item.type == 'tree':
 116                 trees_visited.add(item.oid)
 117             if verbosity:
 118                 if not live_objs.exists(item.oid):
 119                     live_objs.add(item.oid)
 120                     approx_live_count += 1
 121             else:
 122                 live_objs.add(item.oid)
 123     trees_visited = None
 124     if verbosity:
 125         log('expecting to retain about %.2f%% unnecessary objects\n'
 126             % live_objs.pfalse_positive())
 127     return live_objs
 128
 129
 130 def sweep(live_objects, existing_count, cat_pipe, threshold, compression,
 131           verbosity):
 132     # Traverse all the packs, saving the (probably) live data.
 133
 134     ns = Nonlocal()
 135     ns.stale_files = []
 136     def remove_stale_files(new_pack_prefix):
 137         if verbosity and new_pack_prefix:
 138             log('created ' + basename(new_pack_prefix) + '\n')
 139         for p in ns.stale_files:
 140             if new_pack_prefix and p.startswith(new_pack_prefix):
 141                 continue  # Don't remove the new pack file
 142             if verbosity:
 143                 log('removing ' + basename(p) + '\n')
 144             os.unlink(p)
 145         if ns.stale_files:  # So git cat-pipe will close them
 146             cat_pipe.restart()
 147         ns.stale_files = []
 148
 149     writer = git.PackWriter(objcache_maker=None,
 150                             compression_level=compression,
 151                             run_midx=False,
 152                             on_pack_finish=remove_stale_files)
 153
 154     # FIXME: sanity check .idx names vs .pack names?
 155     collect_count = 0
 156     for idx_name in glob.glob(os.path.join(git.repo('objects/pack'), '*.idx')):
 157         if verbosity:
 158             qprogress('preserving live data (%d%% complete)\r'
 159                       % ((float(collect_count) / existing_count) * 100))
 160         idx = git.open_idx(idx_name)
 161
 162         idx_live_count = 0
 163         for i in range(0, len(idx)):
 164             sha = idx.shatable[i * 20 : (i + 1) * 20]
 165             if live_objects.exists(sha):
 166                 idx_live_count += 1
 167
 168         collect_count += idx_live_count
 169         if idx_live_count == 0:
 170             if verbosity:
 171                 log('deleting %s\n'
 172                     % git.repo_rel(basename(idx_name)))
 173             ns.stale_files.append(idx_name)
 174             ns.stale_files.append(idx_name[:-3] + 'pack')
 175             continue
 176
 177         live_frac = idx_live_count / float(len(idx))
 178         if live_frac > ((100 - threshold) / 100.0):
 179             if verbosity:
 180                 log('keeping %s (%d%% live)\n' % (git.repo_rel(basename(idx_name)),
 181                                                   live_frac * 100))
 182             continue
 183
 184         if verbosity:
 185             log('rewriting %s (%.2f%% live)\n' % (basename(idx_name),
 186                                                   live_frac * 100))
 187         for i in range(0, len(idx)):
 188             sha = idx.shatable[i * 20 : (i + 1) * 20]
 189             if live_objects.exists(sha):
 190                 item_it = cat_pipe.get(sha.encode('hex'))
 191                 _, typ, _ = next(item_it)
 192                 writer.just_write(sha, typ, ''.join(item_it))
 193
 194         ns.stale_files.append(idx_name)
 195         ns.stale_files.append(idx_name[:-3] + 'pack')
 196
 197     if verbosity:
 198         progress('preserving live data (%d%% complete)\n'
 199                  % ((float(collect_count) / existing_count) * 100))
 200
 201     # Nothing should have recreated midx/bloom yet.
 202     pack_dir = git.repo('objects/pack')
 203     assert(not os.path.exists(os.path.join(pack_dir, 'bup.bloom')))
 204     assert(not glob.glob(os.path.join(pack_dir, '*.midx')))
 205
 206     # try/catch should call writer.abort()?
 207     # This will finally run midx.
 208     writer.close()  # Can only change refs (if needed) after this.
 209     remove_stale_files(None)  # In case we didn't write to the writer.
 210
 211     if verbosity:
 212         log('discarded %d%% of objects\n'
 213             % ((existing_count - count_objects(pack_dir, verbosity))
 214                / float(existing_count) * 100))
 215
 216
 217 def bup_gc(threshold=10, compression=1, verbosity=0):
 218     cat_pipe = git.cp()
 219     existing_count = count_objects(git.repo('objects/pack'), verbosity)
 220     if verbosity:
 221         log('found %d objects\n' % existing_count)
 222     if not existing_count:
 223         if verbosity:
 224             log('nothing to collect\n')
 225     else:
 226         try:
 227             live_objects = find_live_objects(existing_count, cat_pipe,
 228                                              verbosity=verbosity)
 229         except MissingObject as ex:
 230             log('bup: missing object %r \n' % ex.oid.encode('hex'))
 231             sys.exit(1)
 232         try:
 233             # FIXME: just rename midxes and bloom, and restore them at the end if
 234             # we didn't change any packs?
 235             packdir = git.repo('objects/pack')
 236             if verbosity: log('clearing midx files\n')
 237             midx.clear_midxes(packdir)
 238             if verbosity: log('clearing bloom filter\n')
 239             bloom.clear_bloom(packdir)
 240             if verbosity: log('clearing reflog\n')
 241             expirelog_cmd = ['git', 'reflog', 'expire', '--all', '--expire=all']
 242             expirelog = subprocess.Popen(expirelog_cmd, preexec_fn = git._gitenv())
 243             git._git_wait(' '.join(expirelog_cmd), expirelog)
 244             if verbosity: log('removing unreachable data\n')
 245             sweep(live_objects, existing_count, cat_pipe,
 246                   threshold, compression,
 247                   verbosity)
 248         finally:
 249             live_objects.close()