2 import glob, os, stat, subprocess, sys, tempfile
3 from bup import bloom, git, midx, options, vfs
4 from bup.git import walk_object
5 from bup.helpers import handle_ctrl_c, log, progress, qprogress, saved_errors
6 from os.path import basename
8 # This command uses a Bloom filter to track the live objects during
9 # the mark phase. This means that the collection is probabilistic; it
10 # may retain some (known) percentage of garbage, but it can also work
11 # within a reasonable, fixed RAM budget for any particular percentage
12 # and repository size.
14 # The collection proceeds as follows:
16 # - Scan all live objects by walking all of the refs, and insert
17 # every hash encountered into a new Bloom "liveness" filter.
18 # Compute the size of the liveness filter based on the total
19 # number of objects in the repository. This is the "mark phase".
21 # - Clear the data that's dependent on the repository's object set,
22 # i.e. the reflog, the normal Bloom filter, and the midxes.
24 # - Traverse all of the pack files, consulting the liveness filter
25 # to decide which objects to keep.
27 # For each pack file, rewrite it iff it probably contains more
28 # than (currently) 10% garbage (computed by an initial traversal
29 # of the packfile in consultation with the liveness filter). To
30 # rewrite, traverse the packfile (again) and write each hash that
31 # tests positive against the liveness filter to a packwriter.
33 # During the traversal of all of the packfiles, delete redundant,
34 # old packfiles only after the packwriter has finished the pack
35 # that contains all of their live objects.
37 # The current code unconditionally tracks the set of tree hashes seen
38 # during the mark phase, and skips any that have already been visited.
39 # This should decrease the IO load at the cost of increased RAM use.
41 # FIXME: add a bloom filter tuning parameter?
47 v,verbose increase log output (can be used more than once)
48 threshold only rewrite a packfile if it's over this percent garbage [10]
49 #,compress= set compression level to # (0-9, 9 is highest) [1]
50 unsafe use the command even though it may be DANGEROUS
58 def count_objects(dir):
59 # For now we'll just use open_idx(), but we could probably be much
60 # more efficient since all we need is a single integer (the last
61 # fanout entry) from each index.
64 indexes = glob.glob(os.path.join(dir, '*.idx'))
65 for i, idx_name in enumerate(indexes):
67 log('found %d objects (%d/%d %s)\r'
68 % (object_count, i + 1, len(indexes),
69 os.path.basename(idx_name)))
70 idx = git.open_idx(idx_name)
71 object_count += len(idx)
75 def report_live_item(n, total, ref_name, ref_id, item):
77 status = 'scanned %02.2f%%' % (n * 100.0 / total)
78 hex_id = ref_id.encode('hex')
79 dirslash = '/' if item.type == 'tree' else ''
80 chunk_path = item.chunk_path
85 ps = '/'.join(item.path)
86 chunk_ps = '/'.join(chunk_path)
87 log('%s %s:%s/%s%s\n' % (status, hex_id, ps, chunk_ps, dirslash))
90 # Top commit, for example has none.
91 demangled = git.demangle_name(item.path[-1], item.mode)[0] if item.path \
94 # Don't print mangled paths unless the verbosity is over 3.
96 ps = '/'.join(item.path[:-1] + [demangled])
98 qprogress('%s %s:%s%s\r' % (status, hex_id, ps, dirslash))
99 elif (opt.verbose > 1 and item.type == 'tree') \
100 or (opt.verbose > 2 and item.type == 'blob'):
101 log('%s %s:%s%s\n' % (status, hex_id, ps, dirslash))
102 elif opt.verbose > 3:
103 ps = '/'.join(item.path)
104 log('%s %s:%s%s\n' % (status, hex_id, ps, dirslash))
107 def find_live_objects(existing_count, cat_pipe, opt):
108 prune_visited_trees = True # In case we want a command line option later
109 pack_dir = git.repo('objects/pack')
110 ffd, bloom_filename = tempfile.mkstemp('.bloom', 'tmp-gc-', pack_dir)
112 # FIXME: allow selection of k?
113 # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
114 live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
115 stop_at, trees_visited = None, None
116 if prune_visited_trees:
117 trees_visited = set()
118 stop_at = lambda (x): x.decode('hex') in trees_visited
119 approx_live_count = 0
120 for ref_name, ref_id in git.list_refs():
121 for item in walk_object(cat_pipe, ref_id.encode('hex'),
126 report_live_item(approx_live_count, existing_count,
127 ref_name, ref_id, item)
128 bin_id = item.id.decode('hex')
129 if trees_visited is not None and item.type == 'tree':
130 trees_visited.add(bin_id)
132 if not live_objs.exists(bin_id):
133 live_objs.add(bin_id)
134 approx_live_count += 1
136 live_objs.add(bin_id)
139 log('expecting to retain about %.2f%% unnecessary objects\n'
140 % live_objs.pfalse_positive())
144 def sweep(live_objects, existing_count, cat_pipe, opt):
145 # Traverse all the packs, saving the (probably) live data.
149 def remove_stale_files(new_pack_prefix):
150 if opt.verbose and new_pack_prefix:
151 log('created ' + basename(new_pack_prefix) + '\n')
152 for p in ns.stale_files:
154 log('removing ' + basename(p) + '\n')
158 writer = git.PackWriter(objcache_maker=None,
159 compression_level=opt.compress,
161 on_pack_finish=remove_stale_files)
163 # FIXME: sanity check .idx names vs .pack names?
165 for idx_name in glob.glob(os.path.join(git.repo('objects/pack'), '*.idx')):
167 qprogress('preserving live data (%d%% complete)\r'
168 % ((float(collect_count) / existing_count) * 100))
169 idx = git.open_idx(idx_name)
172 for i in xrange(0, len(idx)):
173 sha = idx.shatable[i * 20 : (i + 1) * 20]
174 if live_objects.exists(sha):
177 collect_count += idx_live_count
178 if idx_live_count == 0:
181 % git.repo_rel(basename(idx_name)))
182 ns.stale_files.append(idx_name)
183 ns.stale_files.append(idx_name[:-3] + 'pack')
186 live_frac = idx_live_count / float(len(idx))
187 if live_frac > ((100 - opt.threshold) / 100.0):
189 log('keeping %s (%d%% live)\n' % (git.repo_rel(basename(idx_name)),
194 log('rewriting %s (%.2f%% live)\n' % (basename(idx_name),
196 for i in xrange(0, len(idx)):
197 sha = idx.shatable[i * 20 : (i + 1) * 20]
198 if live_objects.exists(sha):
199 item_it = cat_pipe.get(sha.encode('hex'))
200 type = item_it.next()
201 writer.write(sha, type, ''.join(item_it))
203 ns.stale_files.append(idx_name)
204 ns.stale_files.append(idx_name[:-3] + 'pack')
207 progress('preserving live data (%d%% complete)\n'
208 % ((float(collect_count) / existing_count) * 100))
210 # Nothing should have recreated midx/bloom yet.
211 pack_dir = git.repo('objects/pack')
212 assert(not os.path.exists(os.path.join(pack_dir, 'bup.bloom')))
213 assert(not glob.glob(os.path.join(pack_dir, '*.midx')))
215 # try/catch should call writer.abort()?
216 # This will finally run midx.
217 writer.close() # Can only change refs (if needed) after this.
218 remove_stale_files(None) # In case we didn't write to the writer.
221 log('discarded %d%% of objects\n'
222 % ((existing_count - count_objects(pack_dir))
223 / float(existing_count) * 100))
226 # FIXME: server mode?
227 # FIXME: make sure client handles server-side changes reasonably
231 o = options.Options(optspec)
232 (opt, flags, extra) = o.parse(sys.argv[1:])
235 o.fatal('refusing to run dangerous, experimental command without --unsafe')
238 o.fatal('no positional parameters expected')
242 opt.threshold = int(opt.threshold)
244 o.fatal('threshold must be an integer percentage value')
245 if opt.threshold < 0 or opt.threshold > 100:
246 o.fatal('threshold must be an integer percentage value')
248 git.check_repo_or_die()
251 existing_count = count_objects(git.repo('objects/pack'))
253 log('found %d objects\n' % existing_count)
254 if not existing_count:
256 log('nothing to collect\n')
258 live_objects = find_live_objects(existing_count, cat_pipe, opt)
260 # FIXME: just rename midxes and bloom, and restore them at the end if
261 # we didn't change any packs?
262 if opt.verbose: log('clearing midx files\n')
264 if opt.verbose: log('clearing bloom filter\n')
265 bloom.clear_bloom(git.repo('objects/pack'))
266 if opt.verbose: log('clearing reflog\n')
267 expirelog_cmd = ['git', 'reflog', 'expire', '--all', '--expire=all']
268 expirelog = subprocess.Popen(expirelog_cmd, preexec_fn = git._gitenv())
269 git._git_wait(' '.join(expirelog_cmd), expirelog)
270 if opt.verbose: log('removing unreachable data\n')
271 sweep(live_objects, existing_count, cat_pipe, opt)
274 os.unlink(live_objects.name)
277 log('WARNING: %d errors encountered during gc\n' % len(saved_errors))