X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=cmd%2Fmidx-cmd.py;h=cadf7c3760b543342afbe9c219afa50bd8a7db1e;hb=bf67f94dd4f4096de4eee07a7dc377d6c889a016;hp=eb66759a1d823f656a155dff4194d5aba9f339fd;hpb=65cc02dc1f53900ca326831f27064c835a516bb4;p=bup.git diff --git a/cmd/midx-cmd.py b/cmd/midx-cmd.py index eb66759..cadf7c3 100755 --- a/cmd/midx-cmd.py +++ b/cmd/midx-cmd.py @@ -1,7 +1,21 @@ -#!/usr/bin/env python -import sys, math, struct, glob, resource -from bup import options, git -from bup.helpers import * +#!/bin/sh +"""": # -*-python-*- +bup_python="$(dirname "$0")/bup-python" || exit $? +exec "$bup_python" "$0" ${1+"$@"} +""" +# end of bup preamble + +from __future__ import absolute_import, print_function +from binascii import hexlify +import glob, math, os, resource, struct, sys, tempfile + +from bup import options, git, midx, _helpers, xstat +from bup.compat import argv_bytes, hexstr, range +from bup.helpers import (Sha1, add_error, atomically_replaced_file, debug1, fdatasync, + handle_ctrl_c, log, mmap_readwrite, qprogress, + saved_errors, unlink) +from bup.io import byte_stream, path_msg + PAGE_SIZE=4096 SHA_PER_PAGE=PAGE_SIZE/20. @@ -10,17 +24,22 @@ optspec = """ bup midx [options...] -- o,output= output midx filename (default: auto-generated) -a,auto automatically create .midx from any unindexed .idx files -f,force automatically create .midx from *all* .idx files +a,auto automatically use all existing .midx/.idx files as input +f,force merge produce exactly one .midx containing all objects +p,print print names of generated midx files +check validate contents of the given midx files (with -a, all midx files) max-files= maximum number of idx files to open at once [-1] -dir= directory containing idx/midx files +d,dir= directory containing idx/midx files """ +merge_into = _helpers.merge_into + + def _group(l, count): - for i in xrange(0, len(l), count): + for i in range(0, len(l), count): yield l[i:i+count] - - + + def max_files(): mf = min(resource.getrlimit(resource.RLIMIT_NOFILE)) if mf > 32: @@ -30,101 +49,144 @@ def max_files(): return mf -def merge(idxlist, bits, table): - count = 0 - for e in git.idxmerge(idxlist): - count += 1 - prefix = git.extract_bits(e, bits) - table[prefix] = count - yield e +def check_midx(name): + nicename = git.repo_rel(name) + log('Checking %s.\n' % path_msg(nicename)) + try: + ix = git.open_idx(name) + except git.GitError as e: + add_error('%s: %s' % (pathmsg(name), e)) + return + for count,subname in enumerate(ix.idxnames): + sub = git.open_idx(os.path.join(os.path.dirname(name), subname)) + for ecount,e in enumerate(sub): + if not (ecount % 1234): + qprogress(' %d/%d: %s %d/%d\r' + % (count, len(ix.idxnames), + git.shorten_hash(subname).decode('ascii'), + ecount, len(sub))) + if not sub.exists(e): + add_error("%s: %s: %s missing from idx" + % (path_msg(nicename), + git.shorten_hash(subname).decode('ascii'), + hexstr(e))) + if not ix.exists(e): + add_error("%s: %s: %s missing from midx" + % (path_msg(nicename), + git.shorten_hash(subname).decode('ascii'), + hexstr(e))) + prev = None + for ecount,e in enumerate(ix): + if not (ecount % 1234): + qprogress(' Ordering: %d/%d\r' % (ecount, len(ix))) + if e and prev and not e >= prev: + add_error('%s: ordering error: %s < %s' + % (nicename, hexstr(e), hexstr(prev))) + prev = e -def _do_midx(outdir, outfilename, infilenames): +_first = None +def _do_midx(outdir, outfilename, infilenames, prefixstr): + global _first if not outfilename: assert(outdir) - sum = 
Sha1('\0'.join(infilenames)).hexdigest() - outfilename = '%s/midx-%s.midx' % (outdir, sum) + sum = hexlify(Sha1(b'\0'.join(infilenames)).digest()) + outfilename = b'%s/midx-%s.midx' % (outdir, sum) inp = [] total = 0 - allfilenames = {} - for name in infilenames: - ix = git.open_idx(name) - for n in ix.idxnames: - allfilenames[n] = 1 - inp.append(ix) - total += len(ix) - - log('Merging %d indexes (%d objects).\n' % (len(infilenames), total)) - if (not opt.force and (total < 1024 and len(infilenames) < 3)) \ - or len(infilenames) < 2 \ - or (opt.force and not total): - log('midx: nothing to do.\n') - return + allfilenames = [] + midxs = [] + try: + for name in infilenames: + ix = git.open_idx(name) + midxs.append(ix) + inp.append(( + ix.map, + len(ix), + ix.sha_ofs, + isinstance(ix, midx.PackMidx) and ix.which_ofs or 0, + len(allfilenames), + )) + for n in ix.idxnames: + allfilenames.append(os.path.basename(n)) + total += len(ix) + inp.sort(reverse=True, key=lambda x: x[0][x[2] : x[2] + 20]) - pages = int(total/SHA_PER_PAGE) or 1 - bits = int(math.ceil(math.log(pages, 2))) - entries = 2**bits - log('Table size: %d (%d bits)\n' % (entries*4, bits)) - - table = [0]*entries + if not _first: _first = outdir + dirprefix = (_first != outdir) and git.repo_rel(outdir) + b': ' or b'' + debug1('midx: %s%screating from %d files (%d objects).\n' + % (dirprefix, prefixstr, len(infilenames), total)) + if (opt.auto and (total < 1024 and len(infilenames) < 3)) \ + or ((opt.auto or opt.force) and len(infilenames) < 2) \ + or (opt.force and not total): + debug1('midx: nothing to do.\n') + return - try: - os.unlink(outfilename) - except OSError: - pass - f = open(outfilename + '.tmp', 'w+') - f.write('MIDX\0\0\0\2') - f.write(struct.pack('!I', bits)) - assert(f.tell() == 12) - f.write('\0'*4*entries) - - for e in merge(inp, bits, table): - f.write(e) - - f.write('\0'.join(os.path.basename(p) for p in allfilenames.keys())) + pages = int(total/SHA_PER_PAGE) or 1 + bits = int(math.ceil(math.log(pages, 2))) + entries = 2**bits + debug1('midx: table size: %d (%d bits)\n' % (entries*4, bits)) - f.seek(12) - f.write(struct.pack('!%dI' % entries, *table)) - f.close() - os.rename(outfilename + '.tmp', outfilename) + unlink(outfilename) + with atomically_replaced_file(outfilename, 'wb') as f: + f.write(b'MIDX') + f.write(struct.pack('!II', midx.MIDX_VERSION, bits)) + assert(f.tell() == 12) - # this is just for testing + f.truncate(12 + 4*entries + 20*total + 4*total) + f.flush() + fdatasync(f.fileno()) + + fmap = mmap_readwrite(f, close=False) + count = merge_into(fmap, bits, total, inp) + del fmap # Assume this calls msync() now. 
+ f.seek(0, os.SEEK_END) + f.write(b'\0'.join(allfilenames)) + finally: + for ix in midxs: + if isinstance(ix, midx.PackMidx): + ix.close() + midxs = None + inp = None + + + # This is just for testing (if you enable this, don't clear inp above) if 0: - p = git.PackMidx(outfilename) + p = midx.PackMidx(outfilename) assert(len(p.idxnames) == len(infilenames)) - print p.idxnames + log(repr(p.idxnames) + '\n') assert(len(p) == total) - pi = iter(p) - for i in merge(inp, total, bits, table): - assert(i == pi.next()) + for pe, e in p, git.idxmerge(inp, final_progress=False): + pin = next(pi) + assert(i == pin) assert(p.exists(i)) - return total,outfilename + return total, outfilename -def do_midx(outdir, outfilename, infilenames): - rv = _do_midx(outdir, outfilename, infilenames) - if rv: - print rv[1] +def do_midx(outdir, outfilename, infilenames, prefixstr, prout): + rv = _do_midx(outdir, outfilename, infilenames, prefixstr) + if rv and opt['print']: + prout.write(rv[1] + b'\n') -def do_midx_dir(path): +def do_midx_dir(path, outfilename, prout): already = {} sizes = {} if opt.force and not opt.auto: midxs = [] # don't use existing midx files else: - midxs = glob.glob('%s/*.midx' % path) + midxs = glob.glob(b'%s/*.midx' % path) contents = {} for mname in midxs: m = git.open_idx(mname) - contents[mname] = [('%s/%s' % (path,i)) for i in m.idxnames] + contents[mname] = [(b'%s/%s' % (path,i)) for i in m.idxnames] sizes[mname] = len(m) - # sort the biggest midxes first, so that we can eliminate smaller - # redundant ones that come later in the list - midxs.sort(lambda x,y: -cmp(sizes[x], sizes[y])) + # sort the biggest+newest midxes first, so that we can eliminate + # smaller (or older) redundant ones that come later in the list + midxs.sort(key=lambda ix: (-sizes[ix], -xstat.stat(ix).st_mtime)) for mname in midxs: any = 0 @@ -133,12 +195,12 @@ def do_midx_dir(path): already[iname] = 1 any = 1 if not any: - log('%r is redundant\n' % mname) + debug1('%r is redundant\n' % mname) unlink(mname) already[mname] = 1 midxs = [k for k in midxs if not already.get(k)] - idxs = [k for k in glob.glob('%s/*.idx' % path) if not already.get(k)] + idxs = [k for k in glob.glob(b'%s/*.idx' % path) if not already.get(k)] for iname in idxs: i = git.open_idx(iname) @@ -150,36 +212,47 @@ def do_midx_dir(path): DESIRED_HWM = opt.force and 1 or 5 DESIRED_LWM = opt.force and 1 or 2 existed = dict((name,1) for sz,name in all) - log('midx: %d indexes; want no more than %d.\n' % (len(all), DESIRED_HWM)) + debug1('midx: %d indexes; want no more than %d.\n' + % (len(all), DESIRED_HWM)) if len(all) <= DESIRED_HWM: - log('midx: nothing to do.\n') + debug1('midx: nothing to do.\n') while len(all) > DESIRED_HWM: all.sort() part1 = [name for sz,name in all[:len(all)-DESIRED_LWM+1]] part2 = all[len(all)-DESIRED_LWM+1:] - all = list(do_midx_group(path, part1)) + part2 + all = list(do_midx_group(path, outfilename, part1)) + part2 if len(all) > DESIRED_HWM: - log('\nStill too many indexes (%d > %d). Merging again.\n' - % (len(all), DESIRED_HWM)) + debug1('\nStill too many indexes (%d > %d). 
Merging again.\n' + % (len(all), DESIRED_HWM)) - for sz,name in all: - if not existed.get(name): - print name + if opt['print']: + for sz,name in all: + if not existed.get(name): + prout.write(name + b'\n') -def do_midx_group(outdir, infiles): - for sublist in _group(infiles, opt.max_files): - rv = _do_midx(path, None, sublist) +def do_midx_group(outdir, outfilename, infiles): + groups = list(_group(infiles, opt.max_files)) + gprefix = '' + for n,sublist in enumerate(groups): + if len(groups) != 1: + gprefix = 'Group %d: ' % (n+1) + rv = _do_midx(outdir, outfilename, sublist, gprefix) if rv: yield rv +handle_ctrl_c() -o = options.Options('bup midx', optspec) +o = options.Options(optspec) (opt, flags, extra) = o.parse(sys.argv[1:]) +opt.dir = argv_bytes(opt.dir) if opt.dir else None +opt.output = argv_bytes(opt.output) if opt.output else None if extra and (opt.auto or opt.force): o.fatal("you can't use -f/-a and also provide filenames") +if opt.check and (not extra and not opt.auto): + o.fatal("if using --check, you must provide filenames or -a") git.check_repo_or_die() @@ -187,17 +260,36 @@ if opt.max_files < 0: opt.max_files = max_files() assert(opt.max_files >= 5) -if extra: - do_midx(git.repo('objects/pack'), opt.output, extra) -elif opt.auto or opt.force: - if opt.dir: - paths = [opt.dir] +extra = [argv_bytes(x) for x in extra] + +if opt.check: + # check existing midx files + if extra: + midxes = extra else: - paths = [git.repo('objects/pack')] - paths += glob.glob(git.repo('index-cache/*/.')) - for path in paths: - log('midx: scanning %s\n' % path) - do_midx_dir(path) - log('\n') + midxes = [] + paths = opt.dir and [opt.dir] or git.all_packdirs() + for path in paths: + debug1('midx: scanning %s\n' % path) + midxes += glob.glob(os.path.join(path, b'*.midx')) + for name in midxes: + check_midx(name) + if not saved_errors: + log('All tests passed.\n') else: - o.fatal("you must use -f or -a or provide input filenames") + if extra: + sys.stdout.flush() + do_midx(git.repo(b'objects/pack'), opt.output, extra, b'', + byte_stream(sys.stdout)) + elif opt.auto or opt.force: + sys.stdout.flush() + paths = opt.dir and [opt.dir] or git.all_packdirs() + for path in paths: + debug1('midx: scanning %s\n' % path_msg(path)) + do_midx_dir(path, opt.output, byte_stream(sys.stdout)) + else: + o.fatal("you must use -f or -a or provide input filenames") + +if saved_errors: + log('WARNING: %d errors encountered.\n' % len(saved_errors)) + sys.exit(1)
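
A few illustrative invocations for the reworked option set, based only on the
optspec and the dispatch code at the bottom of this patch (repository discovery
is whatever git.check_repo_or_die() accepts); adding -d DIR restricts -a, -f,
and --check to a single directory:

    bup midx -a            # consolidate the .idx/.midx files in each pack dir when needed
    bup midx -f            # force-merge each pack dir down to a single .midx
    bup midx --check -a    # validate every existing .midx against its source .idx files
    bup midx -p -o out.midx IDX_FILES...   # build one midx from the named files and print its path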
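
For orientation, the (mostly pre-existing) water-mark loop in do_midx_dir plays
out like this with the defaults visible above, DESIRED_HWM=5 and DESIRED_LWM=2
(both become 1 under --force): a pack directory with, say, 12 candidate
.idx/.midx files sorts them by size, hands the 11 smallest to do_midx_group
(split into batches of at most --max-files), and keeps the resulting .midx plus
the largest original index; 2 <= 5, so the loop stops after one pass. Under
--force the loop instead repeats until exactly one .midx is left.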
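
The new write path in _do_midx implies the on-disk layout: a 12-byte header
(b'MIDX', then '!II' version and bits), a table of 4*2**bits bytes, 20 bytes
per object, 4 more bytes per object, and finally the NUL-separated source .idx
names, matching the sizes reserved by the f.truncate() call before
_helpers.merge_into fills the maps. The stand-alone sketch below only reads
that framing back; describe_midx is a hypothetical helper, not part of bup,
and it assumes the table stays cumulative the way the removed pure-Python
merge() built it (last entry == object count) and that the extra 4 bytes per
object index into the trailing name list.

import struct

def describe_midx(path):
    # Header written by _do_midx: b'MIDX' + struct '!II' (version, bits) == 12 bytes.
    with open(path, 'rb') as f:
        assert f.read(4) == b'MIDX'
        version, bits = struct.unpack('!II', f.read(8))
        entries = 2 ** bits
        table = struct.unpack('!%dI' % entries, f.read(4 * entries))
        nsha = table[-1]   # assumption: cumulative table, as in the old merge()
        # Skip the SHAs (20*nsha) and the per-object source-index bytes (4*nsha),
        # i.e. the space f.truncate(12 + 4*entries + 20*total + 4*total) reserved,
        # then read the appended NUL-separated .idx names.
        f.seek(12 + 4 * entries + 20 * nsha + 4 * nsha)
        return version, bits, nsha, f.read().split(b'\0')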