From c713babd495deb051aad90c73caf38353d69b015 Mon Sep 17 00:00:00 2001 From: Avery Pennarun Date: Sat, 9 Jan 2010 22:43:48 -0500 Subject: [PATCH] cmd-save: completely reimplement using the indexfile. 'bup save' no longer walks the filesystem: instead it walks the indexfile (which is much faster) and doesn't bother opening any files that haven't had an attribute change, since it can just reuse their sha1 from before. That makes it *much* faster in the common case. --- cmd-index.py | 29 ++++---- cmd-save.py | 191 ++++++++++++++++++++++++--------------------------- git.py | 9 ++- index.py | 46 ++++++++++--- t/tindex.py | 13 ++++ 5 files changed, 159 insertions(+), 129 deletions(-) create mode 100644 t/tindex.py diff --git a/cmd-index.py b/cmd-index.py index 85b0f6c..ef596b7 100755 --- a/cmd-index.py +++ b/cmd-index.py @@ -32,7 +32,7 @@ def handle_path(ri, wi, dir, name, pst, xdev, can_delete_siblings): hashgen = None if opt.fake_valid: def hashgen(name): - return index.FAKE_SHA + return (0, index.FAKE_SHA) dirty = 0 path = dir + name @@ -62,7 +62,7 @@ def handle_path(ri, wi, dir, name, pst, xdev, can_delete_siblings): continue if xdev != None and st.st_dev != xdev: log('Skipping %r: different filesystem.\n' - % os.path.realpath(p)) + % index.realpath(p)) continue if stat.S_ISDIR(st.st_mode): p = slashappend(p) @@ -86,7 +86,7 @@ def handle_path(ri, wi, dir, name, pst, xdev, can_delete_siblings): if dirty or not (ri.cur.flags & index.IX_HASHVALID): #log(' --- updating %r\n' % path) if hashgen: - ri.cur.sha = hashgen(name) + (ri.cur.gitmode, ri.cur.sha) = hashgen(name) ri.cur.flags |= index.IX_HASHVALID ri.cur.repack() ri.next() @@ -125,7 +125,7 @@ def update_index(path): wi = index.Writer(indexfile) rig = MergeGetter(ri) - rpath = os.path.realpath(path) + rpath = index.realpath(path) st = os.lstat(rpath) if opt.xdev: xdev = st.st_dev @@ -181,6 +181,7 @@ bup index <-p|s|m|u> [options...] 
p,print print the index entries for the given names (also works with -u) m,modified print only added/deleted/modified files (implies -p) s,status print each filename with a status char (A/M/D) (implies -p) +H,hash print the hash for each object next to its name (implies -p) u,update (recursively) update the index entries for the given filenames x,xdev,one-file-system don't cross filesystem boundaries fake-valid mark all index entries as up-to-date even if they aren't @@ -192,10 +193,10 @@ o = options.Options('bup index', optspec) if not (opt.modified or opt['print'] or opt.status or opt.update): log('bup index: you must supply one or more of -p, -s, -m, or -u\n') - exit(97) + o.usage() if opt.fake_valid and not opt.update: log('bup index: --fake-valid is meaningless without -u\n') - exit(96) + o.usage() git.check_repo_or_die() indexfile = opt.indexfile or git.repo('bupindex') @@ -205,7 +206,7 @@ paths = index.reduce_paths(extra) if opt.update: if not paths: log('bup index: update (-u) requested but no paths given\n') - exit(96) + o.usage() for (rp, path) in paths: update_index(rp) @@ -213,18 +214,20 @@ if opt['print'] or opt.status or opt.modified: for (name, ent) in index.Reader(indexfile).filter(extra or ['']): if opt.modified and ent.flags & index.IX_HASHVALID: continue + line = '' if opt.status: if not ent.flags & index.IX_EXISTS: - print 'D ' + name + line += 'D ' elif not ent.flags & index.IX_HASHVALID: if ent.sha == index.EMPTY_SHA: - print 'A ' + name + line += 'A ' else: - print 'M ' + name + line += 'M ' else: - print ' ' + name - else: - print name + line += ' ' + if opt.hash: + line += ent.sha.encode('hex') + ' ' + print line + (name or './') #print repr(ent) if saved_errors: diff --git a/cmd-save.py b/cmd-save.py index 7da0429..67cb30c 100755 --- a/cmd-save.py +++ b/cmd-save.py @@ -1,6 +1,6 @@ #!/usr/bin/env python2.5 import sys, re, errno, stat, client -import hashsplit, git, options +import hashsplit, git, options, index from helpers import * @@ 
-10,90 +10,6 @@ def add_error(e): log('\n%s\n' % e) -def _direxpand(name): - st = os.lstat(name) - try: - if stat.S_ISDIR(st.st_mode): - for sub in os.listdir(name): - subfull = os.path.join(name, sub) - for fn_st in _direxpand(subfull): - yield fn_st - else: - yield (name,st) - except OSError, e: - if e.errno in [errno.ENOENT, errno.EPERM, errno.EACCES]: - add_error(e) - else: - raise - - -def direxpand(names): - for n in names: - for fn_st in _direxpand(n): - yield fn_st - - -def _normpath(dir): - p = os.path.normpath(dir) - return (p != '.') and p or '' - - -class Tree: - def __init__(self, parent, name): - assert(name != '.') - assert(not (parent and not name)) - self.parent = parent - self.name = name - self.sha = None - self.children = {} - if self.parent: - self.parent.children[self.name] = self - - def fullpath(self): - if self.parent: - return os.path.join(self.parent.fullpath(), self.name) - else: - return self.name - - def gettop(self): - p = self - while p.parent: - p = p.parent - return p - - def getdir(self, dir): - # FIXME: deal with '..' 
somehow (look at how tar does it) - dir = _normpath(dir) - if dir.startswith('/'): - dir = dir[1:] - top = self.gettop() - if not dir: - return top - for part in dir.split('/'): - sub = top.children.get(part) - if not sub: - sub = top.children[part] = Tree(top, part) - top = sub - return top - - def addfile(self, mode, fullname, id): - (dir, name) = os.path.split(fullname) - self.getdir(dir).children[name] = (mode, name, id) - - def shalist(self, w): - for c in self.children.values(): - if isinstance(c, tuple): # sha1 entry for a file - yield c - else: # tree - t = ('40000', c.name, c.gen_tree(w)) - yield t - - def gen_tree(self, w): - if not self.sha: - self.sha = w.new_tree(self.shalist(w)) - return self.sha - - optspec = """ bup save [-tc] [-n name] -- @@ -110,6 +26,9 @@ git.check_repo_or_die() if not (opt.tree or opt.commit or opt.name): log("bup save: use one or more of -t, -c, -n\n") o.usage() +if not extra: + log("bup save: no filenames given.\n") + o.usage() if opt.verbose >= 2: git.verbose = opt.verbose - 1 @@ -124,26 +43,94 @@ else: cli = None oldref = refname and git.read_ref(refname) or None w = git.PackWriter() - -root = Tree(None, '') -for (fn,st) in direxpand(extra): + + +def eatslash(dir): + if dir.endswith('/'): + return dir[:-1] + else: + return dir + + +parts = [''] +shalists = [[]] + +def _push(part): + parts.append(part) + shalists.append([]) + +def _pop(): + assert(len(parts) > 1) + part = parts.pop() + shalist = shalists.pop() + tree = w.new_tree(shalist) + shalists[-1].append(('40000', part, tree)) + + +for (transname,ent) in index.Reader(git.repo('bupindex')).filter(extra): + (dir, file) = os.path.split(ent.name) + exists = (ent.flags & index.IX_EXISTS) + hashvalid = (ent.flags & index.IX_HASHVALID) and w.exists(ent.sha) if opt.verbose: - log('\n%s ' % fn) - try: - if stat.S_ISREG(st.st_mode): # regular file - f = open(fn) - (mode, id) = hashsplit.split_to_blob_or_tree(w, [f]) - elif stat.S_ISLNK(st.st_mode): # symlink - (mode, id) = 
('120000', w.new_blob(os.readlink(fn))) + if not exists: + status = 'D' + elif not hashvalid: + if ent.sha == index.EMPTY_SHA: + status = 'A' + else: + status = 'M' else: - add_error(Exception('skipping special file "%s"' % fn)) - except IOError, e: - add_error(e) - except OSError, e: - add_error(e) + status = ' ' + log('\n%s %s ' % (status, ent.name)) + + if not exists: + continue + + assert(dir.startswith('/')) + dirp = dir.split('/') + while parts > dirp: + _pop() + for part in dirp[len(parts):]: + _push(part) + + if not file: + # directory already handled. + # FIXME: not using the indexed tree sha1's for anything, which is + # a waste. That's a potential optimization... + continue + + id = None + if hashvalid: + mode = '%o' % ent.mode + id = ent.sha + shalists[-1].append((mode, file, id)) else: - root.addfile(mode, fn, id) -tree = root.gen_tree(w) + try: + if stat.S_ISREG(ent.mode): + f = open(ent.name) + (mode, id) = hashsplit.split_to_blob_or_tree(w, [f]) + elif stat.S_ISDIR(ent.mode): + assert(0) # handled above + elif stat.S_ISLNK(ent.mode): + (mode, id) = ('120000', w.new_blob(os.readlink(ent.name))) + else: + add_error(Exception('skipping special file "%s"' % ent.name)) + except IOError, e: + add_error(e) + except OSError, e: + add_error(e) + if id: + ent.validate(id) + ent.repack() + shalists[-1].append((mode, file, id)) +#log('parts out: %r\n' % parts) +#log('stk out: %r\n' % shalists) +while len(parts) > 1: + _pop() +#log('parts out: %r\n' % parts) +#log('stk out: %r\n' % shalists) +assert(len(shalists) == 1) +tree = w.new_tree(shalists[-1]) if opt.verbose: log('\n') if opt.tree: diff --git a/git.py b/git.py index 6775bb2..5aa7933 100644 --- a/git.py +++ b/git.py @@ -183,11 +183,14 @@ class PackWriter: def write(self, type, content): return self._write(calc_hash(type, content), type, content) - def maybe_write(self, type, content): - bin = calc_hash(type, content) + def exists(self, id): if not self.objcache: self._make_objcache() - if not 
self.objcache.exists(bin): + return self.objcache.exists(id) + + def maybe_write(self, type, content): + bin = calc_hash(type, content) + if not self.exists(bin): self._write(bin, type, content) self.objcache.add(bin) return bin diff --git a/index.py b/index.py index f541dec..6d302bb 100644 --- a/index.py +++ b/index.py @@ -4,7 +4,7 @@ from helpers import * EMPTY_SHA = '\0'*20 FAKE_SHA = '\x01'*20 INDEX_HDR = 'BUPI\0\0\0\1' -INDEX_SIG = '!IIIIIQ20sH' +INDEX_SIG = '!IIIIIQII20sH' ENTLEN = struct.calcsize(INDEX_SIG) IX_EXISTS = 0x8000 @@ -21,7 +21,7 @@ class Entry: self.name = str(name) self.tstart = tstart (self.dev, self.ctime, self.mtime, self.uid, self.gid, - self.size, self.sha, + self.size, self.mode, self.gitmode, self.sha, self.flags) = struct.unpack(INDEX_SIG, buffer(m, ofs, ENTLEN)) def __repr__(self): @@ -31,9 +31,10 @@ class Entry: self.size, self.flags)) def packed(self): - return struct.pack(INDEX_SIG, self.dev, self.ctime, self.mtime, - self.uid, self.gid, self.size, self.sha, - self.flags) + return struct.pack(INDEX_SIG, + self.dev, self.ctime, self.mtime, + self.uid, self.gid, self.size, self.mode, + self.gitmode, self.sha, self.flags) def repack(self): self._m[self._ofs:self._ofs+ENTLEN] = self.packed() @@ -49,6 +50,7 @@ class Entry: self.uid = st.st_uid self.gid = st.st_gid self.size = st.st_size + self.mode = st.st_mode self.flags |= IX_EXISTS if int(st.st_ctime) >= self.tstart or old != new: self.flags &= ~IX_HASHVALID @@ -56,6 +58,11 @@ class Entry: else: return 0 # not dirty + def validate(self, sha): + assert(sha) + self.sha = sha + self.flags |= IX_HASHVALID + def __cmp__(a, b): return cmp(a.name, b.name) @@ -120,7 +127,7 @@ class Reader: continue # not interested else: name = pin + ent.name[len(rpin):] - yield (name or './', ent) + yield (name, ent) # Read all the iters in order; when more than one iter has the same entry, @@ -161,7 +168,7 @@ class Writer: self.count = 0 self.lastfile = None self.filename = None - self.filename = filename = 
os.path.realpath(filename) + self.filename = filename = realpath(filename) (dir,name) = os.path.split(filename) (ffd,self.tmpname) = tempfile.mkstemp('.tmp', filename, dir) self.f = os.fdopen(ffd, 'wb', 65536) @@ -196,15 +203,15 @@ class Writer: flags = IX_EXISTS sha = None if hashgen: - sha = hashgen(name) + (gitmode, sha) = hashgen(name) if sha: flags |= IX_HASHVALID else: - sha = EMPTY_SHA + (gitmode, sha) = (0, EMPTY_SHA) data = name + '\0' + \ struct.pack(INDEX_SIG, st.st_dev, int(st.st_ctime), int(st.st_mtime), st.st_uid, st.st_gid, - st.st_size, sha, flags) + st.st_size, st.st_mode, gitmode, sha, flags) self._write(data) def add_ixentry(self, e): @@ -220,10 +227,27 @@ class Writer: return Reader(self.tmpname) +# like os.path.realpath, but doesn't follow a symlink for the last element. +# (i.e. if 'p' itself is a symlink, this one won't follow it) +def realpath(p): + try: + st = os.lstat(p) + except OSError: + st = None + if st and stat.S_ISLNK(st.st_mode): + (dir, name) = os.path.split(p) + dir = os.path.realpath(dir) + out = os.path.join(dir, name) + else: + out = os.path.realpath(p) + #log('realpathing:%r,%r\n' % (p, out)) + return out + + def reduce_paths(paths): xpaths = [] for p in paths: - rp = os.path.realpath(p) + rp = realpath(p) st = os.lstat(rp) if stat.S_ISDIR(st.st_mode): rp = slashappend(rp) diff --git a/t/tindex.py b/t/tindex.py new file mode 100644 index 0000000..9922b8f --- /dev/null +++ b/t/tindex.py @@ -0,0 +1,13 @@ +import os +import index +from wvtest import * + +@wvtest +def testbasic(): + cd = os.path.realpath('') + WVPASS(cd) + sd = os.path.realpath('t/sampledata') + WVPASSEQ(index.realpath('t/sampledata'), cd + '/t/sampledata') + WVPASSEQ(os.path.realpath('t/sampledata/x'), sd + '/x') + WVPASSEQ(os.path.realpath('t/sampledata/etc'), '/etc') + WVPASSEQ(index.realpath('t/sampledata/etc'), sd + '/etc') -- 2.39.2