arthur.barton.de Git - bup.git/commitdiff
Massive speedups to bupindex code.
author: Avery Pennarun <apenwarr@gmail.com>
Sun, 31 Jan 2010 22:59:33 +0000 (17:59 -0500)
committer: Avery Pennarun <apenwarr@gmail.com>
Tue, 2 Feb 2010 06:01:57 +0000 (01:01 -0500)
The old file format was modeled after the git one, but it was kind of dumb;
you couldn't search through the file except linearly, which is pretty slow
when you have hundreds of thousands, or millions, of files.  It also stored
the entire pathname of each file, which got very wasteful as filenames got
longer.

The new format is much quicker; each directory has a pointer to its list of
children, so you can jump around rather than reading linearly through the
file.  Thus you can now 'bup index -p' any subdirectory pretty much
instantly.  The code is still not completely optimized, but the remaining
algorithmic silliness doesn't seem to matter.

And it even still passes unit tests!  Which is too bad, actually, because I
still get oddly crashy behaviour when I repeatedly update a large index. So
there are still some screwy bugs hanging around.  I guess that means we need
better unit tests...

cmd-index.py
helpers.py
index.py
t/sampledata/b2/foozy [new file with mode: 0644]
t/sampledata/b2/foozy2 [new file with mode: 0644]
t/test.sh
t/tindex.py

index 1931634a123f07e62b42aa6ef1989d7a1a19c793..fc6805fb0e1fb52d9e1cf34c4b2c671e6c2a801e 100755 (executable)
--- a/cmd-index.py
+++ b/cmd-index.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-import os, sys, stat
+import os, sys, stat, time
 import options, git, index
 from helpers import *
 
@@ -34,75 +34,68 @@ def add_error(e):
 # the use of fchdir() and lstat() are for two reasons:
 #  - help out the kernel by not making it repeatedly look up the absolute path
 #  - avoid race conditions caused by doing listdir() on a changing symlink
-def handle_path(ri, wi, dir, name, pst, xdev, can_delete_siblings):
-    hashgen = None
-    if opt.fake_valid:
-        def hashgen(name):
-            return (0, index.FAKE_SHA)
-    
-    dirty = 0
-    path = dir + name
-    #log('handle_path(%r,%r)\n' % (dir, name))
-    if stat.S_ISDIR(pst.st_mode):
-        if opt.verbose == 1: # log dirs only
-            sys.stdout.write('%s\n' % path)
-            sys.stdout.flush()
+def dirlist(path):
+    l = []
+    try:
+        OsFile(path).fchdir()
+    except OSError, e:
+        add_error(e)
+        return l
+    for n in os.listdir('.'):
         try:
-            OsFile(name).fchdir()
+            st = os.lstat(n)
         except OSError, e:
-            add_error(Exception('in %s: %s' % (dir, str(e))))
-            return 0
+            add_error(Exception('in %s: %s' % (index.realpath(path), str(e))))
+            continue
+        if stat.S_ISDIR(st.st_mode):
+            n += '/'
+        l.append((os.path.join(path, n), st))
+    l.sort(reverse=True)
+    return l
+
+
+def _recursive_dirlist(path, xdev):
+    olddir = OsFile('.')
+    for (path,pst) in dirlist(path):
+        if xdev != None and pst.st_dev != xdev:
+            log('Skipping %r: different filesystem.\n' % path)
+            continue
+        if stat.S_ISDIR(pst.st_mode):
+            for i in _recursive_dirlist(path, xdev=xdev):
+                yield i
+        yield (path,pst)
+    olddir.fchdir()
+
+
+def _matchlen(a,b):
+    bi = iter(b)
+    count = 0
+    for ai in a:
         try:
-            try:
-                ld = os.listdir('.')
-                #log('* %r: %r\n' % (name, ld))
-            except OSError, e:
-                add_error(Exception('in %s: %s' % (path, str(e))))
-                return 0
-            lds = []
-            for p in ld:
-                try:
-                    st = os.lstat(p)
-                except OSError, e:
-                    add_error(Exception('in %s: %s' % (path, str(e))))
-                    continue
-                if xdev != None and st.st_dev != xdev:
-                    log('Skipping %r: different filesystem.\n' 
-                        % index.realpath(p))
-                    continue
-                if stat.S_ISDIR(st.st_mode):
-                    p = slashappend(p)
-                lds.append((p, st))
-            for p,st in reversed(sorted(lds)):
-                dirty += handle_path(ri, wi, path, p, st, xdev,
-                                     can_delete_siblings = True)
-        finally:
-            os.chdir('..')
-    #log('endloop: ri.cur:%r path:%r\n' % (ri.cur.name, path))
-    while ri.cur and ri.cur.name > path:
-        #log('ricur:%r path:%r\n' % (ri.cur, path))
-        if can_delete_siblings and dir and ri.cur.name.startswith(dir):
-            #log('    --- deleting\n')
-            ri.cur.flags &= ~(index.IX_EXISTS | index.IX_HASHVALID)
-            ri.cur.repack()
-            dirty += 1
-        ri.next()
-    if ri.cur and ri.cur.name == path:
-        dirty += ri.cur.from_stat(pst)
-        if dirty or not (ri.cur.flags & index.IX_HASHVALID):
-            #log('   --- updating %r\n' % path)
-            if hashgen:
-                (ri.cur.gitmode, ri.cur.sha) = hashgen(name)
-                ri.cur.flags |= index.IX_HASHVALID
-            ri.cur.repack()
-        ri.next()
-    else:
-        wi.add(path, pst, hashgen = hashgen)
-        dirty += 1
-    if opt.verbose > 1:  # all files, not just dirs
-        sys.stdout.write('%s\n' % path)
-        sys.stdout.flush()
-    return dirty
+            if bi.next() == ai:
+                count += 1
+        except StopIteration:
+            break
+    return count
+
+
+def recursive_dirlist(paths):
+    last = ()
+    for path in paths:
+        pathsplit = index.pathsplit(path)
+        while _matchlen(pathsplit, last) < len(last):
+            yield (''.join(last), None)
+            last.pop()
+        pst = os.lstat(path)
+        if opt.xdev:
+            xdev = pst.st_dev
+        else:
+            xdev = None
+        if stat.S_ISDIR(pst.st_mode):
+            for i in _recursive_dirlist(path, xdev=xdev):
+                yield i
+        yield (path,pst)
+        last = pathsplit[:-1]
 
 
 def merge_indexes(out, r1, r2):
@@ -112,7 +105,7 @@ def merge_indexes(out, r1, r2):
             out.add_ixentry(e)
 
 
-class MergeGetter:
+class IterHelper:
     def __init__(self, l):
         self.i = iter(l)
         self.cur = None
@@ -126,60 +119,52 @@ class MergeGetter:
         return self.cur
 
 
-def update_index(path):
+def update_index(top):
     ri = index.Reader(indexfile)
     wi = index.Writer(indexfile)
-    rig = MergeGetter(ri)
-    
-    rpath = index.realpath(path)
-    st = os.lstat(rpath)
-    if opt.xdev:
-        xdev = st.st_dev
-    else:
-        xdev = None
-    f = OsFile('.')
-    if rpath[-1] == '/':
-        rpath = rpath[:-1]
-    (dir, name) = os.path.split(rpath)
-    dir = slashappend(dir)
-    if stat.S_ISDIR(st.st_mode) and (not rpath or rpath[-1] != '/'):
-        name += '/'
-        can_delete_siblings = True
-    else:
-        can_delete_siblings = False
-    OsFile(dir or '/').fchdir()
-    dirty = handle_path(rig, wi, dir, name, st, xdev, can_delete_siblings)
-
-    # make sure all the parents of the updated path exist and are invalidated
-    # if appropriate.
-    while 1:
-        (rpath, junk) = os.path.split(rpath)
-        if not rpath:
-            break
-        elif rpath == '/':
-            p = rpath
-        else:
-            p = rpath + '/'
-        while rig.cur and rig.cur.name > p:
-            #log('FINISHING: %r path=%r d=%r\n' % (rig.cur.name, p, dirty))
+    rig = IterHelper(ri.iter(name=top))
+    tstart = int(time.time())
+
+    hashgen = None
+    if opt.fake_valid:
+        def hashgen(name):
+            return (0, index.FAKE_SHA)
+
+    #log('doing: %r\n' % paths)
+
+    for (path,pst) in recursive_dirlist([top]):
+        #log('got: %r\n' % path)
+        if opt.verbose>=2 or (opt.verbose==1 and stat.S_ISDIR(pst.st_mode)):
+            sys.stdout.write('%s\n' % path)
+            sys.stdout.flush()
+        while rig.cur and rig.cur.name > path:  # deleted paths
+            rig.cur.set_deleted()
+            rig.cur.repack()
             rig.next()
-        if rig.cur and rig.cur.name == p:
-            if dirty:
-                rig.cur.flags &= ~index.IX_HASHVALID
+        if rig.cur and rig.cur.name == path:    # paths that already existed
+            if pst:
+                rig.cur.from_stat(pst, tstart)
+            if not (rig.cur.flags & index.IX_HASHVALID):
+                if hashgen:
+                    (rig.cur.gitmode, rig.cur.sha) = hashgen(path)
+                    rig.cur.flags |= index.IX_HASHVALID
                 rig.cur.repack()
-        else:
-            wi.add(p, os.lstat(p))
-        if p == '/':
-            break
+            rig.next()
+        else:  # new paths
+            #log('adding: %r\n' % path)
+            wi.add(path, pst, hashgen = hashgen)
     
-    f.fchdir()
-    ri.save()
-    if wi.count:
-        mi = index.Writer(indexfile)
-        merge_indexes(mi, ri, wi.new_reader())
-        ri.close()
-        mi.close()
-    wi.abort()
+    if ri.exists():
+        ri.save()
+        wi.flush()
+        if wi.count:
+            mi = index.Writer(indexfile)
+            merge_indexes(mi, ri, wi.new_reader())
+            ri.close()
+            mi.close()
+        wi.abort()
+    else:
+        wi.close()
 
 
 optspec = """
@@ -214,7 +199,7 @@ if opt.update:
     if not paths:
         log('bup index: update (-u) requested but no paths given\n')
         o.usage()
-    for (rp, path) in paths:
+    for (rp,path) in paths:
         update_index(rp)
 
 if opt['print'] or opt.status or opt.modified:
index c7dfd5f1892f8326722159156b8fac413d98bb77..3209bfb0fca96530d1edd8afd03a1cb594080577 100644 (file)
--- a/helpers.py
+++ b/helpers.py
@@ -22,6 +22,14 @@ def next(it):
         return None
     
     
+def unlink(f):
+    try:
+        os.unlink(f)
+    except OSError, e:
+        if e.errno == errno.ENOENT:
+            pass  # it doesn't exist, that's what you asked for
+
+
 def readpipe(argv):
     p = subprocess.Popen(argv, stdout=subprocess.PIPE)
     r = p.stdout.read()
index 0c5f3a0ca567010d38997b7324de9d03713b866c..c41ac77776cd0f6a8733b1eb9ba1a7d5206b2ad8 100644 (file)
--- a/index.py
+++ b/index.py
@@ -3,8 +3,8 @@ from helpers import *
 
 EMPTY_SHA = '\0'*20
 FAKE_SHA = '\x01'*20
-INDEX_HDR = 'BUPI\0\0\0\1'
-INDEX_SIG = '!IIIIIQII20sH'
+INDEX_HDR = 'BUPI\0\0\0\2'
+INDEX_SIG = '!IIIIIQII20sHII'
 ENTLEN = struct.calcsize(INDEX_SIG)
 
 IX_EXISTS = 0x8000
@@ -14,15 +14,16 @@ class Error(Exception):
     pass
 
 
+def _encode(dev, ctime, mtime, uid, gi8d, size, mode, gitmode, sha, flags):
+    return struct.pack(INDEX_SIG,
+                       dev, ctime, mtime, uid, gid, size, mode,
+                       gitmode, sha, flags)
+
 class Entry:
-    def __init__(self, name, m, ofs, tstart):
-        self._m = m
-        self._ofs = ofs
+    def __init__(self, name):
         self.name = str(name)
-        self.tstart = tstart
-        (self.dev, self.ctime, self.mtime, self.uid, self.gid,
-         self.size, self.mode, self.gitmode, self.sha,
-         self.flags) = struct.unpack(INDEX_SIG, str(buffer(m, ofs, ENTLEN)))
+        self.children_ofs = 0
+        self.children_n = 0
 
     def __repr__(self):
         return ("(%s,0x%04x,%d,%d,%d,%d,%d,0x%04x)" 
@@ -34,12 +35,10 @@ class Entry:
         return struct.pack(INDEX_SIG,
                            self.dev, self.ctime, self.mtime, 
                            self.uid, self.gid, self.size, self.mode,
-                           self.gitmode, self.sha, self.flags)
-
-    def repack(self):
-        self._m[self._ofs:self._ofs+ENTLEN] = self.packed()
+                           self.gitmode, self.sha, self.flags,
+                           self.children_ofs, self.children_n)
 
-    def from_stat(self, st):
+    def from_stat(self, st, tstart):
         old = (self.dev, self.ctime, self.mtime,
                self.uid, self.gid, self.size, self.flags & IX_EXISTS)
         new = (st.st_dev, int(st.st_ctime), int(st.st_mtime),
@@ -52,19 +51,76 @@ class Entry:
         self.size = st.st_size
         self.mode = st.st_mode
         self.flags |= IX_EXISTS
-        if int(st.st_ctime) >= self.tstart or old != new:
+        if int(st.st_ctime) >= tstart or old != new:
             self.flags &= ~IX_HASHVALID
-            return 1  # dirty
-        else:
-            return 0  # not dirty
+            self.set_dirty()
 
     def validate(self, sha):
         assert(sha)
         self.sha = sha
         self.flags |= IX_HASHVALID
 
+    def set_deleted(self):
+        self.flags &= ~(IX_EXISTS | IX_HASHVALID)
+        self.set_dirty()
+
+    def set_dirty(self):
+        pass # FIXME
+
     def __cmp__(a, b):
         return cmp(a.name, b.name)
+
+
+class NewEntry(Entry):
+    def __init__(self, name, dev, ctime, mtime, uid, gid,
+                 size, mode, gitmode, sha, flags, children_ofs, children_n):
+        Entry.__init__(self, name)
+        (self.dev, self.ctime, self.mtime, self.uid, self.gid,
+         self.size, self.mode, self.gitmode, self.sha,
+         self.flags, self.children_ofs, self.children_n
+         ) = (dev, int(ctime), int(mtime), uid, gid,
+              size, mode, gitmode, sha, flags, children_ofs, children_n)
+
+
+class ExistingEntry(Entry):
+    def __init__(self, name, m, ofs):
+        Entry.__init__(self, name)
+        self._m = m
+        self._ofs = ofs
+        (self.dev, self.ctime, self.mtime, self.uid, self.gid,
+         self.size, self.mode, self.gitmode, self.sha,
+         self.flags, self.children_ofs, self.children_n
+         ) = struct.unpack(INDEX_SIG, str(buffer(m, ofs, ENTLEN)))
+
+    def repack(self):
+        self._m[self._ofs:self._ofs+ENTLEN] = self.packed()
+
+    def iter(self, name=None):
+        dname = name
+        if dname and not dname.endswith('/'):
+            dname += '/'
+        ofs = self.children_ofs
+        #log('myname=%r\n' % self.name)
+        assert(ofs <= len(self._m))
+        for i in range(self.children_n):
+            eon = self._m.find('\0', ofs)
+            #log('eon=0x%x ofs=0x%x i=%d cn=%d\n' % (eon, ofs, i, self.children_n))
+            assert(eon >= 0)
+            assert(eon >= ofs)
+            assert(eon > ofs)
+            child = ExistingEntry(self.name + str(buffer(self._m, ofs, eon-ofs)),
+                                  self._m, eon+1)
+            if (not dname
+                 or child.name.startswith(dname)
+                 or child.name.endswith('/') and dname.startswith(child.name)):
+                for e in child.iter(name=name):
+                    yield e
+            if not name or child.name == name or child.name.startswith(dname):
+                yield child
+            ofs = eon + 1 + ENTLEN
+
+    def __iter__(self):
+        return self.iter()
             
 
 class Reader:
@@ -93,15 +149,22 @@ class Reader:
     def __del__(self):
         self.close()
 
+    def iter(self, name=None):
+        if len(self.m):
+            dname = name
+            if dname and not dname.endswith('/'):
+                dname += '/'
+            root = ExistingEntry('/', self.m, len(self.m)-ENTLEN)
+            for sub in root.iter(name=name):
+                yield sub
+            if not dname or dname == root.name:
+                yield root
+
     def __iter__(self):
-        tstart = int(time.time())
-        ofs = len(INDEX_HDR)
-        while ofs < len(self.m):
-            eon = self.m.find('\0', ofs)
-            assert(eon >= 0)
-            yield Entry(buffer(self.m, ofs, eon-ofs),
-                          self.m, eon+1, tstart = tstart)
-            ofs = eon + 1 + ENTLEN
+        return self.iter()
+
+    def exists(self):
+        return self.m
 
     def save(self):
         if self.writable and self.m:
@@ -114,23 +177,11 @@ class Reader:
             self.writable = False
 
     def filter(self, prefixes):
-        #log("filtering %r\n" % prefixes)
-        paths = reduce_paths(prefixes)
-        #log("filtering %r\n" % paths)
-        pi = iter(paths)
-        (rpin, pin) = pi.next()
-        for ent in self:
-            #log('checking %r vs %r\n' % (ent.name, rpin))
-            while ent.name < rpin:
-                try:
-                    (rpin, pin) = pi.next()
-                except StopIteration:
-                    return  # no more files can possibly match
-            if not ent.name.startswith(rpin):
-                continue   # not interested
-            else:
-                name = pin + ent.name[len(rpin):]
-                yield (name, ent)
+        for (rp, path) in reduce_paths(prefixes):
+            for e in self.iter(rp):
+                assert(e.name.startswith(rp))
+                name = path + e.name[len(rp):]
+                yield (name, e)
 
 
 # Read all the iters in order; when more than one iter has the same entry,
@@ -165,8 +216,17 @@ def _last_writer_wins_iter(iters):
         l = filter(None, l)
 
 
+def pathsplit(p):
+    l = p.split('/')
+    l = list([i+'/' for i in l[:-1]]) + l[-1:]
+    if l[-1] == '':
+        l.pop()  # extra blank caused by terminating '/'
+    return l
+
+
 class Writer:
     def __init__(self, filename):
+        self.stack = []
         self.f = None
         self.count = 0
         self.lastfile = None
@@ -187,46 +247,84 @@ class Writer:
             f.close()
             os.unlink(self.tmpname)
 
+    def flush(self):
+        while self.stack:
+            self.add(''.join(self.stack[-1][0]), None)
+        self._pop_to(None, [])
+        self.f.flush()
+
     def close(self):
+        self.flush()
         f = self.f
         self.f = None
         if f:
             f.close()
             os.rename(self.tmpname, self.filename)
 
-    def _write(self, data):
-        self.f.write(data)
+    # FIXME: this function modifies 'entry' and can only pop a single level.
+    # That means its semantics are basically crazy.
+    def _pop_to(self, entry, edir):
+        assert(len(self.stack) - len(edir) <= 1)
+        while self.stack and self.stack[-1][0] > edir:
+            #log('popping %r with %d entries (%d)\n' 
+            #    % (''.join(self.stack[-1][0]), len(self.stack[-1][1]),
+            #       len(self.stack)))
+            p = self.stack.pop()
+            entry.children_ofs = self.f.tell()
+            entry.children_n = len(p[1])
+            for e in p[1]:
+                self._write(e)
+
+    def _write(self, entry):
+        #log('        writing %r\n' % entry.name)
+        es = pathsplit(entry.name)
+        self.f.write(es[-1] + '\0' + entry.packed())
         self.count += 1
 
-    def add(self, name, st, hashgen=None):
-        #log('ADDING %r\n' % name)
+    def _add(self, entry):
+        es = pathsplit(entry.name)
+        edir = es[:-1]
+        self._pop_to(entry, edir)
+        while len(self.stack) < len(edir):
+            self.stack.append([es[:len(self.stack)+1], [], ()])
+        if entry.name != '/':
+            self.stack[-1][1].append(entry)
+        else:
+            self._write(entry)
+
+    def add(self, name, st, hashgen = None):
         if self.lastfile:
             assert(cmp(self.lastfile, name) > 0) # reverse order only
-        self.lastfile = name
+        endswith = name.endswith('/')
         flags = IX_EXISTS
         sha = None
         if hashgen:
             (gitmode, sha) = hashgen(name)
-            if sha:
-                flags |= IX_HASHVALID
+            flags |= IX_HASHVALID
         else:
             (gitmode, sha) = (0, EMPTY_SHA)
-        data = name + '\0' + \
-            struct.pack(INDEX_SIG, st.st_dev, int(st.st_ctime),
-                        int(st.st_mtime), st.st_uid, st.st_gid,
-                        st.st_size, st.st_mode, gitmode, sha, flags)
-        self._write(data)
+        if st:
+            isdir = stat.S_ISDIR(st.st_mode)
+            assert(isdir == endswith)
+            e = NewEntry(name, st.st_dev, int(st.st_ctime),
+                         int(st.st_mtime), st.st_uid, st.st_gid,
+                         st.st_size, st.st_mode, gitmode, sha, flags,
+                         0, 0)
+        else:
+            assert(endswith)
+            e = NewEntry(name, 0, 0, 0, 0, 0, 0, 0, gitmode, sha, flags, 0, 0)
+        self.lastfile = name
+        self._add(e)
 
     def add_ixentry(self, e):
         if self.lastfile and self.lastfile <= e.name:
             raise Error('%r must come before %r' 
                              % (e.name, self.lastfile))
         self.lastfile = e.name
-        data = e.name + '\0' + e.packed()
-        self._write(data)
+        self._add(e)
 
     def new_reader(self):
-        self.f.flush()
+        self.flush()
         return Reader(self.tmpname)
 
 
@@ -251,10 +349,14 @@ def reduce_paths(paths):
     xpaths = []
     for p in paths:
         rp = realpath(p)
-        st = os.lstat(rp)
-        if stat.S_ISDIR(st.st_mode):
-            rp = slashappend(rp)
-            p = slashappend(p)
+        try:
+            st = os.lstat(rp)
+            if stat.S_ISDIR(st.st_mode):
+                rp = slashappend(rp)
+                p = slashappend(p)
+        except OSError, e:
+            if e.errno != errno.ENOENT:
+                raise
         xpaths.append((rp, p))
     xpaths.sort()
 
diff --git a/t/sampledata/b2/foozy b/t/sampledata/b2/foozy
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/t/sampledata/b2/foozy2 b/t/sampledata/b2/foozy2
new file mode 100644 (file)
index 0000000..e69de29
index fc0612d550a0b6f02445c997733af29c084667ce..a5a54eb4093852ce8e2d35cb007737cec4d2aef6 100755 (executable)
--- a/t/test.sh
+++ b/t/test.sh
@@ -32,7 +32,7 @@ WVPASSEQ "$(bup index -s $D/)" "A $D/"
 WVPASSEQ "$(bup index -s $D/b)" ""
 bup tick
 WVPASSEQ "$(bup index -us $D/b)" "A $D/b"
-WVPASSEQ "$(bup index -us $D)" \
+WVPASSEQ "$(bup index -usx $D)" \
 "A $D/d/e/
 A $D/d/
 A $D/b
index 48341c02703235f29640b9d9a1ff8e67f3a2c460..bcf87f7f78dda3899d8a713e04e9b541e881063a 100644 (file)
--- a/t/tindex.py
+++ b/t/tindex.py
@@ -1,6 +1,7 @@
 import os
 import index
 from wvtest import *
+from helpers import *
 
 @wvtest
 def testbasic():
@@ -11,3 +12,16 @@ def testbasic():
     WVPASSEQ(os.path.realpath('t/sampledata/x'), sd + '/x')
     WVPASSEQ(os.path.realpath('t/sampledata/etc'), os.path.realpath('/etc'))
     WVPASSEQ(index.realpath('t/sampledata/etc'), sd + '/etc')
+
+
+@wvtest
+def testwriter():
+    unlink('index.tmp')
+    ds = os.stat('.')
+    fs = os.stat('t/tindex.py')
+    w = index.Writer('index.tmp')
+    w.add('/var/tmp/sporky', fs)
+    w.add('/etc/passwd', fs)
+    w.add('/etc/', ds)
+    w.add('/', ds)
+    w.close()