2 from __future__ import absolute_import
3 from binascii import hexlify, unhexlify
4 from os.path import basename
5 import glob, os, subprocess, sys, tempfile
7 from bup import bloom, git, midx
8 from bup.compat import hexstr, range
9 from bup.git import MissingObject, walk_object
10 from bup.helpers import Nonlocal, log, progress, qprogress
11 from bup.io import path_msg
13 # This garbage collector uses a Bloom filter to track the live objects
14 # during the mark phase. This means that the collection is
15 # probabilistic; it may retain some (known) percentage of garbage, but
16 # it can also work within a reasonable, fixed RAM budget for any
17 # particular percentage and repository size.
19 # The collection proceeds as follows:
21 # - Scan all live objects by walking all of the refs, and insert
22 # every hash encountered into a new Bloom "liveness" filter.
23 # Compute the size of the liveness filter based on the total
24 # number of objects in the repository. This is the "mark phase".
26 # - Clear the data that's dependent on the repository's object set,
27 # i.e. the reflog, the normal Bloom filter, and the midxes.
29 # - Traverse all of the pack files, consulting the liveness filter
30 # to decide which objects to keep.
32 # For each pack file, rewrite it iff it probably contains more
33 # than (currently) 10% garbage (computed by an initial traversal
34 # of the packfile in consultation with the liveness filter). To
35 # rewrite, traverse the packfile (again) and write each hash that
36 # tests positive against the liveness filter to a packwriter.
38 # During the traversal of all of the packfiles, delete redundant,
39 # old packfiles only after the packwriter has finished the pack
40 # that contains all of their live objects.
42 # The current code unconditionally tracks the set of tree hashes seen
43 # during the mark phase, and skips any that have already been visited.
44 # This should decrease the IO load at the cost of increased RAM use.
46 # FIXME: add a bloom filter tuning parameter?
def count_objects(dir, verbosity):
    """Return the total number of objects listed by the pack indexes in dir.

    dir is the (bytes) path of a pack directory; every ``*.idx`` file in
    it is opened and its length added to the total.  When verbosity is
    true, a running 'found N objects' line is logged before each index
    is opened.  An empty or missing directory yields 0.
    """
    # For now we'll just use open_idx(), but we could probably be much
    # more efficient since all we need is a single integer (the last
    # fanout entry) from each index.
    object_count = 0
    indexes = glob.glob(os.path.join(dir, b'*.idx'))
    for i, idx_name in enumerate(indexes):
        if verbosity:
            log('found %d objects (%d/%d %s)\r'
                % (object_count, i + 1, len(indexes),
                   path_msg(basename(idx_name))))
        # Release each index as soon as it has been counted instead of
        # keeping every one open until the objects are collected;
        # sweep() already uses open_idx() as a context manager.
        with git.open_idx(idx_name) as idx:
            object_count += len(idx)
    return object_count
# Report one item visited during the live-object scan.
#
# Builds a "scanned NN.NN%" status from n / total, the hex form of
# ref_id, and the item's path, then emits the line via log() or
# qprogress() depending on verbosity and item.type.
#
#   n, total   -- progress counters; total is used as a divisor, so it
#                 must be non-zero.
#   ref_name   -- not referenced by any line visible here.
#   ref_id     -- binary ref id, rendered with hexstr().
#   item       -- object providing .type, .path, .chunk_path and .mode.
#   verbosity  -- selects which items are reported and how.
#
# NOTE(review): the embedded line numbering skips in several places
# (e.g. 69 -> 74, 81 -> 84, 92 -> 95), so some guards and continuation
# lines are not visible in this view -- confirm against the full file
# before relying on the exact branch conditions.
65 def report_live_item(n, total, ref_name, ref_id, item, verbosity):
66 status = 'scanned %02.2f%%' % (n * 100.0 / total)
67 hex_id = hexstr(ref_id)
# Trees get a trailing slash in the printed path.
68 dirslash = b'/' if item.type == b'tree' else b''
69 chunk_path = item.chunk_path
# Chunked items are logged as <path>/<chunk path>.
74 ps = b'/'.join(item.path)
75 chunk_ps = b'/'.join(chunk_path)
76 log('%s %s:%s/%s%s\n' % (status, hex_id, path_msg(ps),
77 path_msg(chunk_ps), path_msg(dirslash)))
# Only the last path component is demangled, and only when item.path is
# non-empty; element [0] of demangle_name()'s result is used as the name.
80 # Top commit, for example has none.
81 demangled = git.demangle_name(item.path[-1], item.mode)[0] if item.path \
84 # Don't print mangled paths unless the verbosity is over 3.
86 ps = b'/'.join(item.path[:-1] + [demangled])
# The '\r'-terminated qprogress() form is an overwritable status line;
# its guarding condition is not visible here (TODO confirm).
88 qprogress('%s %s:%s%s\r' % (status, hex_id, path_msg(ps),
# Trees are logged when verbosity > 1, blobs when verbosity > 2.
90 elif (verbosity > 1 and item.type == b'tree') \
91 or (verbosity > 2 and item.type == b'blob'):
92 log('%s %s:%s%s\n' % (status, hex_id, path_msg(ps),
# Here the path is joined as-is, without demangling the last component.
95 ps = b'/'.join(item.path)
96 log('%s %s:%s%s\n' % (status, hex_id, path_msg(ps), path_msg(dirslash)))
# Mark phase: walk every ref returned by git.list_refs() and record the
# oid of each item reached in a Bloom "liveness" filter.
#
#   existing_count -- object count used to size the filter
#                     (passed to bloom.create as expected=).
#   cat_pipe       -- its .get is handed to walk_object() to fetch objects.
#   verbosity      -- forwarded to report_live_item().
#
# The caller assigns the result to live_objects, so this presumably
# returns live_objs; the return statement is not visible in this view
# (TODO confirm).
99 def find_live_objects(existing_count, cat_pipe, verbosity=0):
100 prune_visited_trees = True # In case we want a command line option later
101 pack_dir = git.repo(b'objects/pack')
# The filter is backed by a temporary file created inside the pack
# directory (suffix .bloom, prefix tmp-gc-).
102 ffd, bloom_filename = tempfile.mkstemp(b'.bloom', b'tmp-gc-', pack_dir)
104 # FIXME: allow selection of k?
105 # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
106 live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
# The file's name is removed immediately after the filter is created.
107 # live_objs will hold on to the fd until close or exit
108 os.unlink(bloom_filename)
109 stop_at, trees_visited = None, None
110 if prune_visited_trees:
# stop_at receives a hex id; it is true for trees already recorded in
# trees_visited, so walk_object() can skip re-walking them.
111 trees_visited = set()
112 stop_at = lambda x: unhexlify(x) in trees_visited
113 approx_live_count = 0
114 for ref_name, ref_id in git.list_refs():
115 for item in walk_object(cat_pipe.get, hexlify(ref_id), stop_at=stop_at,
119 report_live_item(approx_live_count, existing_count,
120 ref_name, ref_id, item, verbosity)
121 if trees_visited is not None and item.type == b'tree':
122 trees_visited.add(item.oid)
# The counter only advances for oids the filter does not already report,
# so it is approximate (a Bloom filter can report false positives).
124 if not live_objs.exists(item.oid):
125 live_objs.add(item.oid)
126 approx_live_count += 1
# Alternate path: add without checking or counting.  The condition that
# selects between the two paths is not visible here (TODO confirm).
128 live_objs.add(item.oid)
# pfalse_positive() is printed as a percentage of objects expected to be
# retained unnecessarily.
131 log('expecting to retain about %.2f%% unnecessary objects\n'
132 % live_objs.pfalse_positive())
# Sweep phase: visit every *.idx under objects/pack and decide, using the
# live_objects filter, whether its pack is deleted, kept, or rewritten.
#
#   live_objects   -- filter queried with .exists(sha) for each object.
#   existing_count -- total object count; divisor for the progress and
#                     "discarded" percentages, so it must be non-zero.
#   cat_pipe       -- source of object data for packs being rewritten.
#   threshold      -- a pack is kept as-is when its live fraction exceeds
#                     (100 - threshold) / 100.
#   compression    -- passed to git.PackWriter as compression_level.
#
# NOTE(review): the embedded line numbering skips throughout this
# function (e.g. 136 -> 138, 138 -> 142, 169 -> 172), so the end of the
# signature, state initialisation, loop headers and several guards are
# not visible in this view -- confirm against the full file.
136 def sweep(live_objects, existing_count, cat_pipe, threshold, compression,
138 # Traverse all the packs, saving the (probably) live data.
# Registered below as the PackWriter's on_pack_finish callback and also
# called once directly with None after the writer is closed.  Paths in
# ns.stale_files that start with new_pack_prefix are skipped.
142 def remove_stale_files(new_pack_prefix):
143 if verbosity and new_pack_prefix:
144 log('created ' + path_msg(basename(new_pack_prefix)) + '\n')
145 for p in ns.stale_files:
146 if new_pack_prefix and p.startswith(new_pack_prefix):
147 continue # Don't remove the new pack file
149 log('removing ' + path_msg(basename(p)) + '\n')
151 if ns.stale_files: # So git cat-pipe will close them
155 writer = git.PackWriter(objcache_maker=None,
156 compression_level=compression,
158 on_pack_finish=remove_stale_files)
160 # FIXME: sanity check .idx names vs .pack names?
162 for idx_name in glob.glob(os.path.join(git.repo(b'objects/pack'), b'*.idx')):
164 qprogress('preserving live data (%d%% complete)\r'
165 % ((float(collect_count) / existing_count) * 100))
166 with git.open_idx(idx_name) as idx:
# First pass over the index: count objects the filter reports as live.
169 if live_objects.exists(sha):
172 collect_count += idx_live_count
# No live objects at all: the .idx and its .pack are queued for removal.
173 if idx_live_count == 0:
176 % path_msg(git.repo_rel(basename(idx_name))))
177 ns.stale_files.append(idx_name)
# idx_name ends in 'idx'; swapping the last three bytes names the pack.
178 ns.stale_files.append(idx_name[:-3] + b'pack')
181 live_frac = idx_live_count / float(len(idx))
182 if live_frac > ((100 - threshold) / 100.0):
# NOTE(review): the next two log calls format bytes names with %s
# directly, whereas the path_msg(git.repo_rel(...)) call just above them
# converts first; under Python 3 a bytes value renders as b'...'.
# Presumably both should go through path_msg() as well -- confirm.
184 log('keeping %s (%d%% live)\n' % (git.repo_rel(basename(idx_name)),
189 log('rewriting %s (%.2f%% live)\n' % (basename(idx_name),
# Second pass: copy each live object into the new pack.  The first item
# yielded by cat_pipe.get() is a 3-tuple whose middle element is the
# object type; the remaining items are joined to form the object data.
192 if live_objects.exists(sha):
193 item_it = cat_pipe.get(hexlify(sha))
194 _, typ, _ = next(item_it)
195 writer.just_write(sha, typ, b''.join(item_it))
# The rewritten pack's old .idx and .pack are queued for removal.
197 ns.stale_files.append(idx_name)
198 ns.stale_files.append(idx_name[:-3] + b'pack')
201 progress('preserving live data (%d%% complete)\n'
202 % ((float(collect_count) / existing_count) * 100))
# Sanity checks: neither bup.bloom nor any *.midx file may exist in the
# pack directory at this point.
204 # Nothing should have recreated midx/bloom yet.
205 pack_dir = git.repo(b'objects/pack')
206 assert(not os.path.exists(os.path.join(pack_dir, b'bup.bloom')))
207 assert(not glob.glob(os.path.join(pack_dir, b'*.midx')))
209 # try/catch should call writer.abort()?
210 # This will finally run midx.
211 writer.close() # Can only change refs (if needed) after this.
212 remove_stale_files(None) # In case we didn't write to the writer.
# Re-count the objects now on disk to report the percentage discarded.
215 log('discarded %d%% of objects\n'
216 % ((existing_count - count_objects(pack_dir, verbosity))
217 / float(existing_count) * 100))
220 def bup_gc(threshold=10, compression=1, verbosity=0):
222 existing_count = count_objects(git.repo(b'objects/pack'), verbosity)
224 log('found %d objects\n' % existing_count)
225 if not existing_count:
227 log('nothing to collect\n')
230 live_objects = find_live_objects(existing_count, cat_pipe,
232 except MissingObject as ex:
233 log('bup: missing object %r \n' % hexstr(ex.oid))
236 # FIXME: just rename midxes and bloom, and restore them at the end if
237 # we didn't change any packs?
238 packdir = git.repo(b'objects/pack')
239 if verbosity: log('clearing midx files\n')
240 midx.clear_midxes(packdir)
241 if verbosity: log('clearing bloom filter\n')
242 bloom.clear_bloom(packdir)
243 if verbosity: log('clearing reflog\n')
244 expirelog_cmd = [b'git', b'reflog', b'expire', b'--all', b'--expire=all']
245 expirelog = subprocess.Popen(expirelog_cmd, env=git._gitenv())
246 git._git_wait(b' '.join(expirelog_cmd), expirelog)
247 if verbosity: log('removing unreachable data\n')
248 sweep(live_objects, existing_count, cat_pipe,
249 threshold, compression,