X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=cmd%2Fbloom-cmd.py;h=d74cce2704567597fc1fce934032dfd025041e55;hb=refs%2Fheads%2Funused-variable-do_bloom;hp=81ee392fb2d77a778e8cd8e4dc08dbf32f695728;hpb=9ce95ab1ee085c0ff4dd58381d1ce6aaa0ec5797;p=bup.git

diff --git a/cmd/bloom-cmd.py b/cmd/bloom-cmd.py
index 81ee392..d74cce2 100755
--- a/cmd/bloom-cmd.py
+++ b/cmd/bloom-cmd.py
@@ -1,42 +1,60 @@
 #!/usr/bin/env python
 import sys, glob, tempfile
-from bup import options, git
+from bup import options, git, bloom
 from bup.helpers import *
 
 optspec = """
 bup bloom [options...]
 --
-o,output= output bloom filename (default: auto-generated)
-d,dir= input directory to look for idx files (default: auto-generated)
-k,hashes= number of hash functions to use (4 or 5) (default: auto-generated)
-c,check= an idx file to check against an existing bloom filter
+ruin ruin the specified bloom file (clearing the bitfield)
+f,force ignore existing bloom file and regenerate it from scratch
+o,output= output bloom filename (default: auto)
+d,dir= input directory to look for idx files (default: auto)
+k,hashes= number of hash functions to use (4 or 5) (default: auto)
+c,check= check the given .idx file against the bloom filter
 """
 
+
+def ruin_bloom(bloomfilename):
+    rbloomfilename = git.repo_rel(bloomfilename)
+    if not os.path.exists(bloomfilename):
+        log("%s\n" % bloomfilename)
+        add_error("bloom: %s not found to ruin\n" % rbloomfilename)
+        return
+    b = bloom.ShaBloom(bloomfilename, readwrite=True, expected=1)
+    b.map[16:16+2**b.bits] = '\0' * 2**b.bits
+
+
 def check_bloom(path, bloomfilename, idx):
+    rbloomfilename = git.repo_rel(bloomfilename)
+    ridx = git.repo_rel(idx)
     if not os.path.exists(bloomfilename):
-        log("bloom: %s not found to check\n" % bloomfilename)
+        log("bloom: %s: does not exist.\n" % rbloomfilename)
         return
-    b = git.ShaBloom(bloomfilename)
+    b = bloom.ShaBloom(bloomfilename)
     if not b.valid():
-        log("bloom: %s could not be opened to check\n" % bloomfilename)
+        add_error("bloom: %r is invalid.\n" % rbloomfilename)
         return
     base = os.path.basename(idx)
     if base not in b.idxnames:
-        log("bloom: filter does not contain %s, nothing to check\n" % idx)
+        log("bloom: %s does not contain the idx.\n" % rbloomfilename)
         return
     if base == idx:
         idx = os.path.join(path, idx)
-    log("bloom: checking %s" % idx)
+    log("bloom: bloom file: %s\n" % rbloomfilename)
+    log("bloom: checking %s\n" % ridx)
     for objsha in git.open_idx(idx):
         if not b.exists(objsha):
-            add_error("bloom: ERROR: %s missing from bloom"
+            add_error("bloom: ERROR: object %s missing"
                       % str(objsha).encode('hex'))
 
 
+_first = None
 def do_bloom(path, outfilename):
+    global _first
     b = None
-    if os.path.exists(outfilename):
-        b = git.ShaBloom(outfilename)
+    if os.path.exists(outfilename) and not opt.force:
+        b = bloom.ShaBloom(outfilename)
         if not b.valid():
             debug1("bloom: Existing invalid bloom found, regenerating.\n")
             b = None
@@ -45,7 +63,8 @@ def do_bloom(path, outfilename):
     rest = []
     add_count = 0
     rest_count = 0
-    for name in glob.glob('%s/*.idx' % path):
+    for i,name in enumerate(glob.glob('%s/*.idx' % path)):
+        progress('bloom: counting: %d\r' % i)
         ix = git.open_idx(name)
         ixbase = os.path.basename(name)
         if b and (ixbase in b.idxnames):
@@ -57,21 +76,22 @@ def do_bloom(path, outfilename):
     total = add_count + rest_count
 
     if not add:
-        log("bloom: Nothing to do\n")
+        debug1("bloom: nothing to do.\n")
         return
 
     if b:
         if len(b) != rest_count:
-            log("bloom: size %d != idx total %d, regenerating\n"
-                % (len(b), rest_count))
+            debug1("bloom: size %d != idx total %d, regenerating\n"
+                   % (len(b), rest_count))
             b = None
-        elif (b.bits < git.MAX_BLOOM_BITS and
-              b.pfalse_positive(add_count) > git.MAX_PFALSE_POSITIVE):
-            log("bloom: %d more entries => %.2f false positive, regenerating\n"
-                % (add_count, b.pfalse_positive(add_count)))
+        elif (b.bits < bloom.MAX_BLOOM_BITS and
+              b.pfalse_positive(add_count) > bloom.MAX_PFALSE_POSITIVE):
+            debug1("bloom: regenerating: adding %d entries gives "
+                   "%.2f%% false positives.\n"
+                   % (add_count, b.pfalse_positive(add_count)))
             b = None
         else:
-            b = git.ShaBloom(outfilename, readwrite=True, expected=add_count)
+            b = bloom.ShaBloom(outfilename, readwrite=True, expected=add_count)
     if not b: # Need all idxs to build from scratch
         add += rest
         add_count += rest_count
@@ -79,23 +99,29 @@ def do_bloom(path, outfilename):
     del rest_count
 
     msg = b is None and 'creating from' or 'adding'
-    log('bloom: %s %d file%s (%d object%s).\n' % (msg, len(add),
-                                                   len(add)!=1 and 's' or '',
-                                                   add_count,
-                                                   add_count!=1 and 's' or ''))
+    if not _first: _first = path
+    dirprefix = (_first != path) and git.repo_rel(path)+': ' or ''
+    progress('bloom: %s%s %d file%s (%d object%s).\n'
+        % (dirprefix, msg,
+           len(add), len(add)!=1 and 's' or '',
+           add_count, add_count!=1 and 's' or ''))
 
     tfname = None
     if b is None:
         tfname = os.path.join(path, 'bup.tmp.bloom')
-        tf = open(tfname, 'w+')
-        b = git.ShaBloom.create(tfname, f=tf, expected=add_count, k=opt.k)
-    count = 0
+        b = bloom.create(tfname, expected=add_count, k=opt.k)
+
+    icount = 0
     for name in add:
         ix = git.open_idx(name)
-        progress('Writing bloom: %d/%d\r' % (count, len(add)))
+        qprogress('bloom: writing %.2f%% (%d/%d objects)\r'
+                  % (icount*100.0/add_count, icount, add_count))
         b.add_idx(ix)
-        count += 1
-    log('Writing bloom: %d/%d, done.\n' % (count, len(add)))
+        icount += len(ix)
+
+    # Currently, there's an open file object for tfname inside b.
+    # Make sure it's closed before rename.
+    b.close()
 
     if tfname:
         os.rename(tfname, outfilename)
@@ -110,21 +136,23 @@ if extra:
     o.fatal('no positional parameters expected')
 
 git.check_repo_or_die()
 
-bloompath = opt.dir or git.repo('objects/pack')
-
-if not opt.output:
-    assert(bloompath)
-outfilename = opt.output or os.path.join(bloompath, 'bup.bloom')
-if opt.check:
-    check_bloom(bloompath, outfilename, opt.check)
-else:
-    if opt.k and opt.k not in (4,5):
-        o.fatal('only k values of 4 and 5 are supported')
+if not opt.check and opt.k and opt.k not in (4,5):
+    o.fatal('only k values of 4 and 5 are supported')
 
-    do_bloom(bloompath, outfilename)
+paths = opt.dir and [opt.dir] or git.all_packdirs()
+for path in paths:
+    debug1('bloom: scanning %s\n' % path)
+    outfilename = opt.output or os.path.join(path, 'bup.bloom')
+    if opt.check:
+        check_bloom(path, outfilename, opt.check)
+    elif opt.ruin:
+        ruin_bloom(outfilename)
+    else:
+        do_bloom(path, outfilename)
 
 if saved_errors:
     log('WARNING: %d errors encountered during bloom.\n' % len(saved_errors))
     sys.exit(1)
-
+elif opt.check:
+    log('All tests passed.\n')