X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=cmd%2Fsave-cmd.py;h=6cd142f394c987549ac4864e10759e28d49c148c;hb=0eff3fc44bb3dda4c590b9700ad4ab1148084402;hp=1abaa3e7f1a716400350663e49c23c881d59f90d;hpb=9872d84c87fa4034fad0e63a90fdc5c782dae88b;p=bup.git diff --git a/cmd/save-cmd.py b/cmd/save-cmd.py index 1abaa3e..6cd142f 100755 --- a/cmd/save-cmd.py +++ b/cmd/save-cmd.py @@ -1,21 +1,32 @@ #!/usr/bin/env python -import sys, re, errno, stat, time, math -from bup import hashsplit, git, options, index, client +import sys, stat, time, math +from cStringIO import StringIO +from errno import EACCES + +from bup import hashsplit, git, options, index, client, metadata, hlinkdb from bup.helpers import * +from bup.hashsplit import GIT_MODE_TREE, GIT_MODE_FILE, GIT_MODE_SYMLINK optspec = """ bup save [-tc] [-n name] -- -r,remote= remote repository path +r,remote= hostname:/path/to/repo of remote repository t,tree output a tree id c,commit output a commit id n,name= name of backup set to update (if any) +d,date= date for the commit (seconds since the epoch) v,verbose increase log output (can be used more than once) q,quiet don't show progress meter smaller= only back up files smaller than n bytes +bwlimit= maximum bytes/sec to transmit to server +f,indexfile= the name of the index file (normally BUP_DIR/bupindex) +strip strips the path to every filename given +strip-path= path-prefix to be stripped when saving +graft= a graft point *old_path*=*new_path* (can be used more than once) +#,compress= set compression level to # (0-9, 9 is highest) [1] """ -o = options.Options('bup save', optspec) +o = options.Options(optspec) (opt, flags, extra) = o.parse(sys.argv[1:]) git.check_repo_or_die() @@ -24,17 +35,58 @@ if not (opt.tree or opt.commit or opt.name): if not extra: o.fatal("no filenames given") -opt.progress = (istty and not opt.quiet) +opt.progress = (istty2 and not opt.quiet) +opt.smaller = parse_num(opt.smaller or 0) +if opt.bwlimit: + client.bwlimit = parse_num(opt.bwlimit) + +if opt.date: + date = parse_date_or_fatal(opt.date, o.fatal) +else: + date = time.time() + +if opt.strip and opt.strip_path: + o.fatal("--strip is incompatible with --strip-path") + +graft_points = [] +if opt.graft: + if opt.strip: + o.fatal("--strip is incompatible with --graft") + if opt.strip_path: + o.fatal("--strip-path is incompatible with --graft") + + for (option, parameter) in flags: + if option == "--graft": + splitted_parameter = parameter.split('=') + if len(splitted_parameter) != 2: + o.fatal("a graft point must be of the form old_path=new_path") + old_path, new_path = splitted_parameter + if not (old_path and new_path): + o.fatal("a graft point cannot be empty") + graft_points.append((realpath(old_path), realpath(new_path))) + +is_reverse = os.environ.get('BUP_SERVER_REVERSE') +if is_reverse and opt.remote: + o.fatal("don't use -r in reverse mode; it's automatic") + +if opt.name and opt.name.startswith('.'): + o.fatal("'%s' is not a valid branch name" % opt.name) refname = opt.name and 'refs/heads/%s' % opt.name or None -if opt.remote: - cli = client.Client(opt.remote) +if opt.remote or is_reverse: + try: + cli = client.Client(opt.remote) + except client.ClientError, e: + log('error: %s' % e) + sys.exit(1) oldref = refname and cli.read_ref(refname) or None - w = cli.new_packwriter() + w = cli.new_packwriter(compression_level=opt.compress) else: cli = None oldref = refname and git.read_ref(refname) or None - w = git.PackWriter() + w = git.PackWriter(compression_level=opt.compress) + 
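A note on the option handling in the hunk above: each --graft argument is split on '=' into an (old_path, new_path) pair, anything other than exactly two non-empty paths is rejected, and both sides are canonicalized before being recorded in graft_points. Below is a minimal standalone sketch of that parsing; parse_graft_arg is an illustrative name (not part of bup), and os.path.realpath stands in for the realpath helper that bup.helpers exports.

import os

def parse_graft_arg(parameter, fatal):
    # Mirror the checks added above: exactly one '=', and neither side
    # of it may be empty.  fatal is expected to raise or exit, as
    # o.fatal does.
    pieces = parameter.split('=')
    if len(pieces) != 2:
        fatal("a graft point must be of the form old_path=new_path")
    old_path, new_path = pieces
    if not (old_path and new_path):
        fatal("a graft point cannot be empty")
    # bup resolves both sides to real, absolute paths before use.
    return (os.path.realpath(old_path), os.path.realpath(new_path))

# Example: parse_graft_arg('/home/someuser=/users', o.fatal) records
# that files saved from /home/someuser should appear under /users in
# the archive (assuming neither path is a symlink).
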
+handle_ctrl_c() def eatslash(dir): @@ -44,37 +96,73 @@ def eatslash(dir): return dir -parts = [''] -shalists = [[]] +# Metadata is stored in a file named .bupm in each directory. The +# first metadata entry will be the metadata for the current directory. +# The remaining entries will be for each of the other directory +# elements, in the order they're listed in the index. +# +# Since the git tree elements are sorted according to +# git.shalist_item_sort_key, the metalist items are accumulated as +# (sort_key, metadata) tuples, and then sorted when the .bupm file is +# created. The sort_key must be computed using the element's real +# name and mode rather than the git mode and (possibly mangled) name. + +# Maintain a stack of information representing the current location in +# the archive being constructed. The current path is recorded in +# parts, which will be something like ['', 'home', 'someuser'], and +# the accumulated content and metadata for of the dirs in parts is +# stored in parallel stacks in shalists and metalists. -def _push(part): - assert(part) +parts = [] # Current archive position (stack of dir names). +shalists = [] # Hashes for each dir in paths. +metalists = [] # Metadata for each dir in paths. + + +def _push(part, metadata): + # Enter a new archive directory -- make it the current directory. parts.append(part) shalists.append([]) + metalists.append([('', metadata)]) # This dir's metadata (no name). + -def _pop(force_tree): +def _pop(force_tree, dir_metadata=None): + # Leave the current archive directory and add its tree to its parent. assert(len(parts) >= 1) part = parts.pop() shalist = shalists.pop() + metalist = metalists.pop() + if metalist and not force_tree: + if dir_metadata: # Override the original metadata pushed for this dir. 
+ metalist = [('', dir_metadata)] + metalist[1:] + sorted_metalist = sorted(metalist, key = lambda x : x[0]) + metadata = ''.join([m[1].encode() for m in sorted_metalist]) + metadata_f = StringIO(metadata) + mode, id = hashsplit.split_to_blob_or_tree(w.new_blob, w.new_tree, + [metadata_f], + keep_boundaries=False) + shalist.append((mode, '.bupm', id)) tree = force_tree or w.new_tree(shalist) if shalists: - shalists[-1].append(('40000', part, tree)) - else: # this was the toplevel, so put it back for sanity - shalists.append(shalist) + shalists[-1].append((GIT_MODE_TREE, + git.mangle_name(part, + GIT_MODE_TREE, GIT_MODE_TREE), + tree)) return tree + lastremain = None def progress_report(n): - global count, lastremain - count += n - pct = total and (count*100.0/total) or 0 + global count, subcount, lastremain + subcount += n + cc = count + subcount + pct = total and (cc*100.0/total) or 0 now = time.time() elapsed = now - tstart - kps = elapsed and int(count/1024./elapsed) + kps = elapsed and int(cc/1024./elapsed) kps_frac = 10 ** int(math.log(kps+1, 10) - 1) kps = int(kps/kps_frac)*kps_frac - if count: - remain = elapsed*1.0/count * (total-count) + if cc: + remain = elapsed*1.0/cc * (total-cc) else: remain = 0.0 if (lastremain and (remain > lastremain) @@ -96,38 +184,75 @@ def progress_report(n): remainstr = '%dm%d' % (mins, secs) else: remainstr = '%ds' % secs - progress('Saving: %.2f%% (%d/%dk, %d/%d files) %s %s\r' - % (pct, count/1024, total/1024, fcount, ftotal, - remainstr, kpsstr)) + qprogress('Saving: %.2f%% (%d/%dk, %d/%d files) %s %s\r' + % (pct, cc/1024, total/1024, fcount, ftotal, + remainstr, kpsstr)) -r = index.Reader(git.repo('bupindex')) +indexfile = opt.indexfile or git.repo('bupindex') +r = index.Reader(indexfile) +try: + msr = index.MetaStoreReader(indexfile + '.meta') +except IOError, ex: + if ex.errno != EACCES: + raise + log('error: cannot access %r; have you run bup index?' % indexfile) + sys.exit(1) +hlink_db = hlinkdb.HLinkDB(indexfile + '.hlink') def already_saved(ent): return ent.is_valid() and w.exists(ent.sha) and ent.sha -def wantrecurse(ent): +def wantrecurse_pre(ent): return not already_saved(ent) +def wantrecurse_during(ent): + return not already_saved(ent) or ent.sha_missing() + +def find_hardlink_target(hlink_db, ent): + if hlink_db and not stat.S_ISDIR(ent.mode) and ent.nlink > 1: + link_paths = hlink_db.node_paths(ent.dev, ent.ino) + if link_paths: + return link_paths[0] + total = ftotal = 0 if opt.progress: - for (transname,ent) in r.filter(extra, wantrecurse=wantrecurse): + for (transname,ent) in r.filter(extra, wantrecurse=wantrecurse_pre): if not (ftotal % 10024): - progress('Reading index: %d\r' % ftotal) - exists = (ent.flags & index.IX_EXISTS) + qprogress('Reading index: %d\r' % ftotal) + exists = ent.exists() hashvalid = already_saved(ent) - if exists and not hashvalid: - total += ent.size + ent.set_sha_missing(not hashvalid) + if not opt.smaller or ent.size < opt.smaller: + if exists and not hashvalid: + total += ent.size ftotal += 1 progress('Reading index: %d, done.\n' % ftotal) hashsplit.progress_callback = progress_report +# Root collisions occur when strip or graft options map more than one +# path to the same directory (paths which originally had separate +# parents). When that situation is detected, use empty metadata for +# the parent. Otherwise, use the metadata for the common parent. +# Collision example: "bup save ... --strip /foo /foo/bar /bar". + +# FIXME: Add collision tests, or handle collisions some other way. 
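Stepping back to _pop() earlier in this hunk: each directory's metadata records accumulate as (sort_key, metadata) pairs, the directory's own record is keyed with the empty string so it sorts first, and the sorted records are concatenated and hashsplit into the '.bupm' entry (a blob, or a tree for very large metadata) appended to that directory's shalist. A condensed sketch of that assembly follows; build_bupm_bytes and encode_record are illustrative names standing in for _pop()'s inline code and the metadata objects' encode() method used there.

def build_bupm_bytes(metalist, encode_record):
    # metalist looks like [('', dir_meta), (sort_key1, meta1), ...],
    # exactly as _push()/_pop() build it; '' sorts before any other
    # key, so the directory's own record always comes out first.
    ordered = sorted(metalist, key=lambda item: item[0])
    return b''.join(encode_record(meta) for _key, meta in ordered)

The resulting byte string is what split_to_blob_or_tree() stores and what the '.bupm' name in the directory's tree points at.
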
+ +# FIXME: Detect/handle strip/graft name collisions (other than root), +# i.e. if '/foo/bar' and '/bar' both map to '/'. + +first_root = None +root_collision = None tstart = time.time() -count = fcount = 0 -for (transname,ent) in r.filter(extra, wantrecurse=wantrecurse): +count = subcount = fcount = 0 +lastskip_name = None +lastdir = '' +for (transname,ent) in r.filter(extra, wantrecurse=wantrecurse_during): (dir, file) = os.path.split(ent.name) exists = (ent.flags & index.IX_EXISTS) hashvalid = already_saved(ent) + wasmissing = ent.sha_missing() + oldsize = ent.size if opt.verbose: if not exists: status = 'D' @@ -138,8 +263,12 @@ for (transname,ent) in r.filter(extra, wantrecurse=wantrecurse): status = 'M' else: status = ' ' - if opt.verbose >= 2 or stat.S_ISDIR(ent.mode): + if opt.verbose >= 2: log('%s %-70s\n' % (status, ent.name)) + elif not stat.S_ISDIR(ent.mode) and lastdir != dir: + if not lastdir.startswith(dir): + log('%s %-70s\n' % (status, os.path.join(dir, ''))) + lastdir = dir if opt.progress: progress_report(0) @@ -147,81 +276,164 @@ for (transname,ent) in r.filter(extra, wantrecurse=wantrecurse): if not exists: continue + if opt.smaller and ent.size >= opt.smaller: + if exists and not hashvalid: + if opt.verbose: + log('skipping large file "%s"\n' % ent.name) + lastskip_name = ent.name + continue assert(dir.startswith('/')) - dirp = dir.split('/') - while parts > dirp: + if opt.strip: + dirp = stripped_path_components(dir, extra) + elif opt.strip_path: + dirp = stripped_path_components(dir, [opt.strip_path]) + elif graft_points: + dirp = grafted_path_components(graft_points, dir) + else: + dirp = path_components(dir) + + # At this point, dirp contains a representation of the archive + # path that looks like [(archive_dir_name, real_fs_path), ...]. + # So given "bup save ... --strip /foo/bar /foo/bar/baz", dirp + # might look like this at some point: + # [('', '/foo/bar'), ('baz', '/foo/bar/baz'), ...]. + + # This dual representation supports stripping/grafting, where the + # archive path may not have a direct correspondence with the + # filesystem. The root directory is represented by an initial + # component named '', and any component that doesn't have a + # corresponding filesystem directory (due to grafting, for + # example) will have a real_fs_path of None, i.e. [('', None), + # ...]. + + if first_root == None: + first_root = dirp[0] + elif first_root != dirp[0]: + root_collision = True + + # If switching to a new sub-tree, finish the current sub-tree. + while parts > [x[0] for x in dirp]: _pop(force_tree = None) - if dir != '/': - for part in dirp[len(parts):]: - _push(part) + + # If switching to a new sub-tree, start a new sub-tree. + for path_component in dirp[len(parts):]: + dir_name, fs_path = path_component + # Not indexed, so just grab the FS metadata or use empty metadata. + try: + meta = metadata.from_path(fs_path) if fs_path else metadata.Metadata() + except (OSError, IOError), e: + add_error(e) + lastskip_name = dir_name + meta = metadata.Metadata() + _push(dir_name, meta) if not file: - # sub/parentdirectories already handled in the pop/push() part above. + if len(parts) == 1: + continue # We're at the top level -- keep the current root dir + # Since there's no filename, this is a subdir -- finish it. 
oldtree = already_saved(ent) # may be None newtree = _pop(force_tree = oldtree) if not oldtree: - ent.validate(040000, newtree) + if lastskip_name and lastskip_name.startswith(ent.name): + ent.invalidate() + else: + ent.validate(GIT_MODE_TREE, newtree) ent.repack() - count += ent.size + if exists and wasmissing: + count += oldsize continue # it's not a directory id = None if hashvalid: - mode = '%o' % ent.gitmode id = ent.sha - shalists[-1].append((mode, file, id)) - elif opt.smaller and ent.size >= opt.smaller: - add_error('skipping large file "%s"' % ent.name) + git_name = git.mangle_name(file, ent.mode, ent.gitmode) + git_info = (ent.gitmode, git_name, id) + shalists[-1].append(git_info) + sort_key = git.shalist_item_sort_key((ent.mode, file, id)) + meta = msr.metadata_at(ent.meta_ofs) + meta.hardlink_target = find_hardlink_target(hlink_db, ent) + # Restore the times that were cleared to 0 in the metastore. + (meta.atime, meta.mtime, meta.ctime) = (ent.atime, ent.mtime, ent.ctime) + metalists[-1].append((sort_key, meta)) else: if stat.S_ISREG(ent.mode): try: - f = open(ent.name) - except IOError, e: - add_error(e) - except OSError, e: + f = hashsplit.open_noatime(ent.name) + except (IOError, OSError), e: add_error(e) + lastskip_name = ent.name else: - (mode, id) = hashsplit.split_to_blob_or_tree(w, [f]) + try: + (mode, id) = hashsplit.split_to_blob_or_tree( + w.new_blob, w.new_tree, [f], + keep_boundaries=False) + except (IOError, OSError), e: + add_error('%s: %s' % (ent.name, e)) + lastskip_name = ent.name else: if stat.S_ISDIR(ent.mode): assert(0) # handled above elif stat.S_ISLNK(ent.mode): try: rl = os.readlink(ent.name) - except OSError, e: - add_error(e) - except IOError, e: + except (OSError, IOError), e: add_error(e) + lastskip_name = ent.name else: - (mode, id) = ('120000', w.new_blob(rl)) + (mode, id) = (GIT_MODE_SYMLINK, w.new_blob(rl)) else: - add_error(Exception('skipping special file "%s"' % ent.name)) - count += ent.size + # Everything else should be fully described by its + # metadata, so just record an empty blob, so the paths + # in the tree and .bupm will match up. + (mode, id) = (GIT_MODE_FILE, w.new_blob("")) + if id: - ent.validate(int(mode, 8), id) + ent.validate(mode, id) ent.repack() - shalists[-1].append((mode, file, id)) + git_name = git.mangle_name(file, ent.mode, ent.gitmode) + git_info = (mode, git_name, id) + shalists[-1].append(git_info) + sort_key = git.shalist_item_sort_key((ent.mode, file, id)) + hlink = find_hardlink_target(hlink_db, ent) + try: + meta = metadata.from_path(ent.name, hardlink_target=hlink) + except (OSError, IOError), e: + add_error(e) + lastskip_name = ent.name + else: + metalists[-1].append((sort_key, meta)) + + if exists and wasmissing: + count += oldsize + subcount = 0 + if opt.progress: pct = total and count*100.0/total or 100 progress('Saving: %.2f%% (%d/%dk, %d/%d files), done. \n' % (pct, count/1024, total/1024, fcount, ftotal)) -while len(parts) > 1: +while len(parts) > 1: # _pop() all the parts above the root _pop(force_tree = None) assert(len(shalists) == 1) -tree = w.new_tree(shalists[-1]) +assert(len(metalists) == 1) + +# Finish the root directory. +tree = _pop(force_tree = None, + # When there's a collision, use empty metadata for the root. 
+ dir_metadata = metadata.Metadata() if root_collision else None) + if opt.tree: print tree.encode('hex') if opt.commit or opt.name: - msg = 'bup save\n\nGenerated by command:\n%r' % sys.argv - ref = opt.name and ('refs/heads/%s' % opt.name) or None - commit = w.new_commit(oldref, tree, msg) + msg = 'bup save\n\nGenerated by command:\n%r\n' % sys.argv + commit = w.new_commit(oldref, tree, date, msg) if opt.commit: print commit.encode('hex') +msr.close() w.close() # must close before we can update the ref if opt.name: @@ -235,3 +447,4 @@ if cli: if saved_errors: log('WARNING: %d errors encountered while saving.\n' % len(saved_errors)) + sys.exit(1)
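
One more illustration, since the long comment in the save loop describes it only in prose: dirp is a list of (archive_dir_name, real_fs_path) pairs, with the archive root spelled '' and components that exist only because of grafting carrying a real_fs_path of None. The helper below is purely illustrative (describe_dirp is not a bup function); the sample value is the one quoted in that comment.

def describe_dirp(dirp):
    # dirp comes from path_components, stripped_path_components or
    # grafted_path_components in bup.helpers; the root component is
    # named '' and grafted-in components with no on-disk directory
    # have a real_fs_path of None.
    archive_path = '/'.join(name for name, _fs_path in dirp) or '/'
    synthetic = [name for name, fs_path in dirp if fs_path is None]
    return archive_path, synthetic

# With "bup save ... --strip /foo/bar /foo/bar/baz" the loop above can
# see dirp == [('', '/foo/bar'), ('baz', '/foo/bar/baz')], for which
# describe_dirp() returns ('/baz', []): the archive path is /baz and
# every component still has a real filesystem directory behind it.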