2 from __future__ import absolute_import
3 import glob, os, subprocess, sys, tempfile
4 from bup import bloom, git, midx
5 from bup.compat import range
6 from bup.git import MissingObject, walk_object
7 from bup.helpers import Nonlocal, log, progress, qprogress
8 from os.path import basename
10 # This garbage collector uses a Bloom filter to track the live objects
11 # during the mark phase. This means that the collection is
12 # probabilistic; it may retain some (known) percentage of garbage, but
13 # it can also work within a reasonable, fixed RAM budget for any
14 # particular percentage and repository size.
16 # The collection proceeds as follows:
18 # - Scan all live objects by walking all of the refs, and insert
19 # every hash encountered into a new Bloom "liveness" filter.
20 # Compute the size of the liveness filter based on the total
21 # number of objects in the repository. This is the "mark phase".
23 # - Clear the data that's dependent on the repository's object set,
24 # i.e. the reflog, the normal Bloom filter, and the midxes.
26 # - Traverse all of the pack files, consulting the liveness filter
27 # to decide which objects to keep.
29 # For each pack file, rewrite it iff it probably contains more
30 # than (currently) 10% garbage (computed by an initial traversal
31 # of the packfile in consultation with the liveness filter). To
32 # rewrite, traverse the packfile (again) and write each hash that
33 # tests positive against the liveness filter to a packwriter.
35 # During the traversal of all of the packfiles, delete redundant,
36 # old packfiles only after the packwriter has finished the pack
37 # that contains all of their live objects.
39 # The current code unconditionally tracks the set of tree hashes seen
40 # during the mark phase, and skips any that have already been visited.
41 # This should decrease the IO load at the cost of increased RAM use.
43 # FIXME: add a bloom filter tuning parameter?
def count_objects(dir, verbosity):
    """Return the total number of objects listed by the ``*.idx`` files in dir.

    dir is the directory to scan; the callers in this file pass the
    repository's ``objects/pack`` directory.  When verbosity is non-zero
    a running tally is logged as each index is opened.  A directory
    with no index files yields 0.
    """
    # For now we'll just use open_idx(), but we could probably be much
    # more efficient since all we need is a single integer (the last
    # fanout entry) from each index.
    object_count = 0
    indexes = glob.glob(os.path.join(dir, '*.idx'))
    for i, idx_name in enumerate(indexes):
        if verbosity:
            # '\r' keeps the tally on one terminal line.
            log('found %d objects (%d/%d %s)\r'
                % (object_count, i + 1, len(indexes), basename(idx_name)))
        idx = git.open_idx(idx_name)
        object_count += len(idx)
    return object_count
def report_live_item(n, total, ref_name, ref_id, item, verbosity):
    """Log one progress line for an object visited during the mark phase.

    n and total feed the "scanned NN.NN%" prefix, ref_id is shown
    hex-encoded, and item supplies ``type``, ``path``, ``chunk_path``
    and ``mode``.  ref_name is accepted but not used in the output.
    How much is printed depends on verbosity; nothing is returned.
    """
    status = 'scanned %02.2f%%' % (n * 100.0 / total)
    hex_id = ref_id.encode('hex')
    dirslash = '/' if item.type == 'tree' else ''
    chunk_path = item.chunk_path

    if chunk_path:
        # NOTE(review): chunk-level lines are treated as the noisiest
        # output and only shown when verbosity is over 3, like mangled
        # paths below -- confirm the threshold against upstream.
        if verbosity < 4:
            return
        ps = '/'.join(item.path)
        chunk_ps = '/'.join(chunk_path)
        log('%s %s:%s/%s%s\n' % (status, hex_id, ps, chunk_ps, dirslash))
        return

    # Top commit, for example has none.
    demangled = git.demangle_name(item.path[-1], item.mode)[0] if item.path \
                else None

    # Don't print mangled paths unless the verbosity is over 3.
    if demangled:
        ps = '/'.join(item.path[:-1] + [demangled])
        if verbosity == 1:
            # Level 1 only gets a transient, overwritable progress line.
            qprogress('%s %s:%s%s\r' % (status, hex_id, ps, dirslash))
        elif (verbosity > 1 and item.type == 'tree') \
             or (verbosity > 2 and item.type == 'blob'):
            log('%s %s:%s%s\n' % (status, hex_id, ps, dirslash))
    elif verbosity > 3:
        ps = '/'.join(item.path)
        log('%s %s:%s%s\n' % (status, hex_id, ps, dirslash))
def find_live_objects(existing_count, cat_pipe, verbosity=0):
    """Mark phase: return a Bloom filter holding every reachable object id.

    existing_count sizes the filter (it is passed as ``expected``) and
    is the denominator for progress reports.  cat_pipe.get is handed to
    walk_object() to read objects.  Each ref from git.list_refs() is
    walked and every item's oid is added to the filter.  The caller
    receives the still-open filter.
    """
    prune_visited_trees = True # In case we want a command line option later
    pack_dir = git.repo('objects/pack')
    ffd, bloom_filename = tempfile.mkstemp('.bloom', 'tmp-gc-', pack_dir)
    # mkstemp() returns an open descriptor; only the name is needed
    # here, so close it right away instead of leaking it.
    os.close(ffd)
    # FIXME: allow selection of k?
    # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
    live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
    # live_objs will hold on to the fd until close or exit
    os.unlink(bloom_filename)
    stop_at, trees_visited = None, None
    if prune_visited_trees:
        trees_visited = set()
        stop_at = lambda x: x.decode('hex') in trees_visited
    approx_live_count = 0
    for ref_name, ref_id in git.list_refs():
        for item in walk_object(cat_pipe.get, ref_id.encode('hex'),
                                stop_at=stop_at):
            if verbosity:
                report_live_item(approx_live_count, existing_count,
                                 ref_name, ref_id, item, verbosity)
            if trees_visited is not None and item.type == 'tree':
                trees_visited.add(item.oid)
            if verbosity:
                # Only pay for the extra exists() lookup when the
                # count is actually going to be reported.
                if not live_objs.exists(item.oid):
                    live_objs.add(item.oid)
                    approx_live_count += 1
            else:
                live_objs.add(item.oid)
    # Drop the visited-tree set; it is not needed past the walk.
    trees_visited = None
    if verbosity:
        log('expecting to retain about %.2f%% unnecessary objects\n'
            % live_objs.pfalse_positive())
    return live_objs
def sweep(live_objects, existing_count, cat_pipe, threshold, compression,
          verbosity):
    """Sweep phase: rewrite or delete packs according to live_objects.

    live_objects is the liveness filter (only ``exists()`` is used).
    existing_count is the pre-collection object count, used for the
    progress and summary percentages.  cat_pipe reads the objects that
    are copied.  threshold is the percentage of (probable) garbage a
    pack may hold before it is rewritten.  compression is handed to the
    PackWriter as its compression level.  verbosity gates all logging.
    """
    # Traverse all the packs, saving the (probably) live data.

    # ns is a mutable holder so the nested callback can reset the list.
    ns = Nonlocal()
    ns.stale_files = []
    def remove_stale_files(new_pack_prefix):
        if verbosity and new_pack_prefix:
            log('created ' + basename(new_pack_prefix) + '\n')
        for p in ns.stale_files:
            if new_pack_prefix and p.startswith(new_pack_prefix):
                continue  # Don't remove the new pack file
            if verbosity:
                log('removing ' + basename(p) + '\n')
            os.unlink(p)
        if ns.stale_files:  # So git cat-pipe will close them
            # NOTE(review): assumes the cat pipe exposes restart() --
            # confirm against bup.git.
            cat_pipe.restart()
        ns.stale_files = []

    # NOTE(review): run_midx=False is assumed so that no midx appears
    # before the assertions below; close() is what finally runs midx.
    writer = git.PackWriter(objcache_maker=None,
                            compression_level=compression,
                            run_midx=False,
                            on_pack_finish=remove_stale_files)

    # FIXME: sanity check .idx names vs .pack names?
    collect_count = 0
    for idx_name in glob.glob(os.path.join(git.repo('objects/pack'), '*.idx')):
        if verbosity:
            qprogress('preserving live data (%d%% complete)\r'
                      % ((float(collect_count) / existing_count) * 100))
        idx = git.open_idx(idx_name)

        # First pass: count how many of this pack's objects are live.
        idx_live_count = 0
        for i in range(0, len(idx)):
            sha = idx.shatable[i * 20 : (i + 1) * 20]
            if live_objects.exists(sha):
                idx_live_count += 1

        collect_count += idx_live_count
        if idx_live_count == 0:
            # Nothing live: the whole pack (and its index) can go.
            if verbosity:
                log('deleting %s\n'
                    % git.repo_rel(basename(idx_name)))
            ns.stale_files.append(idx_name)
            ns.stale_files.append(idx_name[:-3] + 'pack')
            continue

        live_frac = idx_live_count / float(len(idx))
        if live_frac > ((100 - threshold) / 100.0):
            # Garbage is below the threshold; leave the pack alone.
            if verbosity:
                log('keeping %s (%d%% live)\n' % (git.repo_rel(basename(idx_name)),
                                                  live_frac * 100))
            continue

        if verbosity:
            log('rewriting %s (%.2f%% live)\n' % (basename(idx_name),
                                                  live_frac * 100))
        # Second pass: copy every (probably) live object to the writer.
        for i in range(0, len(idx)):
            sha = idx.shatable[i * 20 : (i + 1) * 20]
            if live_objects.exists(sha):
                item_it = cat_pipe.get(sha.encode('hex'))
                _, typ, _ = next(item_it)
                writer.just_write(sha, typ, ''.join(item_it))

        # Only removed once the writer has finished the replacement pack.
        ns.stale_files.append(idx_name)
        ns.stale_files.append(idx_name[:-3] + 'pack')

    if verbosity:
        progress('preserving live data (%d%% complete)\n'
                 % ((float(collect_count) / existing_count) * 100))

    # Nothing should have recreated midx/bloom yet.
    pack_dir = git.repo('objects/pack')
    assert(not os.path.exists(os.path.join(pack_dir, 'bup.bloom')))
    assert(not glob.glob(os.path.join(pack_dir, '*.midx')))

    # try/catch should call writer.abort()?
    # This will finally run midx.
    writer.close()  # Can only change refs (if needed) after this.
    remove_stale_files(None)  # In case we didn't write to the writer.

    if verbosity:
        log('discarded %d%% of objects\n'
            % ((existing_count - count_objects(pack_dir, verbosity))
               / float(existing_count) * 100))
217 def bup_gc(threshold=10, compression=1, verbosity=0):
219 existing_count = count_objects(git.repo('objects/pack'), verbosity)
221 log('found %d objects\n' % existing_count)
222 if not existing_count:
224 log('nothing to collect\n')
227 live_objects = find_live_objects(existing_count, cat_pipe,
229 except MissingObject as ex:
230 log('bup: missing object %r \n' % ex.oid.encode('hex'))
233 # FIXME: just rename midxes and bloom, and restore them at the end if
234 # we didn't change any packs?
235 packdir = git.repo('objects/pack')
236 if verbosity: log('clearing midx files\n')
237 midx.clear_midxes(packdir)
238 if verbosity: log('clearing bloom filter\n')
239 bloom.clear_bloom(packdir)
240 if verbosity: log('clearing reflog\n')
241 expirelog_cmd = ['git', 'reflog', 'expire', '--all', '--expire=all']
242 expirelog = subprocess.Popen(expirelog_cmd, preexec_fn = git._gitenv())
243 git._git_wait(' '.join(expirelog_cmd), expirelog)
244 if verbosity: log('removing unreachable data\n')
245 sweep(live_objects, existing_count, cat_pipe,
246 threshold, compression,