From: Avery Pennarun <apenwarr@gmail.com>
Date: Sun, 10 Jan 2010 06:13:10 +0000 (-0500)
Subject: This adds the long-awaited indexfile feature, so you no longer have to feed
X-Git-Tag: bup-0.04~1
X-Git-Url: https://arthur.barton.de/gitweb/?a=commitdiff_plain;h=84c3d04310d40b0ae0ccbdf13a57543ae88b342d;hp=3198fba54521cd7f313a02d237f9198dd58f6a90;p=bup.git

This adds the long-awaited indexfile feature, so you no longer have to feed
your backups through tar.

Okay, 'bup save' is still a bit weak... but it could be much worse.

Merge branch 'indexfile'

* indexfile:
  Minor fix for python 2.4.4 compatibility.
  cmd-save: completely reimplement using the indexfile.
  Moved some reusable index-handling code from cmd-index.py to index.py.
  A bunch of wvtests for the 'bup index' command.
  Start using wvtest.sh for shell-based tests in test-sh.
  cmd-index: default indexfile path is ~/.bup/bupindex, not $PWD/index
  cmd-index: skip merging the index if nothing was written to the new one.
  cmd-index: only update if -u is given; print only given file/dirnames.
  cmd-index: correct reporting of deleted vs. added vs. modified status.
  Generalize the multi-index-walking code.
  cmd-index: indexfiles should start with a well-known header.
  cmd-index: eliminate redundant paths from index update command.
  cmd-index: some handy options.
  index: add --xdev (--one-file-system) option.
  Fix some bugs with indexing '/'
  cmd-index: basic index reader/writer/merger.
---

diff --git a/Makefile b/Makefile
index 6ca9447..7c24fa1 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,8 @@ endif
 
 default: all
 
-all: bup-split bup-join bup-save bup-init bup-server bup randomgen chashsplit.so
+all: bup-split bup-join bup-save bup-init bup-server bup-index bup-tick \
+	bup randomgen chashsplit.so
 
 randomgen: randomgen.o
 	$(CC) $(CFLAGS) -o $@ $<
@@ -20,16 +21,18 @@ randomgen: randomgen.o
 chashsplit.so: chashsplitmodule.o
 	$(CC) $(CFLAGS) $(SHARED) -o $@ $< $(PYLIB)
 	
-runtests: all
+runtests: all runtests-python runtests-cmdline
+
+runtests-python: all  # same prerequisite as runtests-cmdline
 	./wvtest.py $(wildcard t/t*.py)
 	
 runtests-cmdline: all
-	./test-sh
+	t/test.sh
 	
 stupid:
 	PATH=/bin:/usr/bin $(MAKE) test
 	
-test: all runtests-cmdline
+test: all
 	./wvtestrun $(MAKE) runtests
 
 %: %.o
diff --git a/cmd-index.py b/cmd-index.py
new file mode 100755
index 0000000..ef596b7
--- /dev/null
+++ b/cmd-index.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python2.5
+import os, sys, stat
+import options, git, index
+from helpers import *
+
+class OsFile:  # owns an O_RDONLY descriptor for 'path'; closed in __del__
+    def __init__(self, path):
+        self.fd = None  # set first so __del__ is safe if os.open() raises
+        self.fd = os.open(path, os.O_RDONLY|os.O_LARGEFILE|os.O_NOFOLLOW)
+        #self.st = os.fstat(self.fd)
+        
+    def __del__(self):
+        if self.fd is not None:  # fd 0 is a valid descriptor; close it too
+            fd = self.fd
+            self.fd = None
+            os.close(fd)
+
+    def fchdir(self):
+        os.fchdir(self.fd)  # make the opened path the current directory
+
+
+saved_errors = []
+def add_error(e):
+    saved_errors.append(e)
+    log('\n%s\n' % e)
+
+
+# the use of fchdir() and lstat() are for two reasons:
+#  - help out the kernel by not making it repeatedly look up the absolute path
+#  - avoid race conditions caused by doing listdir() on a changing symlink
+def handle_path(ri, wi, dir, name, pst, xdev, can_delete_siblings):
+    hashgen = None
+    if opt.fake_valid:
+        def hashgen(name):
+            return (0, index.FAKE_SHA)
+    
+    dirty = 0
+    path = dir + name
+    #log('handle_path(%r,%r)\n' % (dir, name))
+    if stat.S_ISDIR(pst.st_mode):
+        if opt.verbose == 1: # log dirs only
+            sys.stdout.write('%s\n' % path)
+            sys.stdout.flush()
+        try:
+            OsFile(name).fchdir()
+        except OSError, e:
+            add_error(Exception('in %s: %s' % (dir, str(e))))
+            return 0
+        try:
+            try:
+                ld = os.listdir('.')
+                #log('* %r: %r\n' % (name, ld))
+            except OSError, e:
+                add_error(Exception('in %s: %s' % (path, str(e))))
+                return 0
+            lds = []
+            for p in ld:
+                try:
+                    st = os.lstat(p)
+                except OSError, e:
+                    add_error(Exception('in %s: %s' % (path, str(e))))
+                    continue
+                if xdev != None and st.st_dev != xdev:
+                    log('Skipping %r: different filesystem.\n' 
+                        % index.realpath(p))
+                    continue
+                if stat.S_ISDIR(st.st_mode):
+                    p = slashappend(p)
+                lds.append((p, st))
+            for p,st in reversed(sorted(lds)):
+                dirty += handle_path(ri, wi, path, p, st, xdev,
+                                     can_delete_siblings = True)
+        finally:
+            os.chdir('..')
+    #log('endloop: ri.cur:%r path:%r\n' % (ri.cur.name, path))
+    while ri.cur and ri.cur.name > path:
+        #log('ricur:%r path:%r\n' % (ri.cur, path))
+        if can_delete_siblings and dir and ri.cur.name.startswith(dir):
+            #log('    --- deleting\n')
+            ri.cur.flags &= ~(index.IX_EXISTS | index.IX_HASHVALID)
+            ri.cur.repack()
+            dirty += 1
+        ri.next()
+    if ri.cur and ri.cur.name == path:
+        dirty += ri.cur.from_stat(pst)
+        if dirty or not (ri.cur.flags & index.IX_HASHVALID):
+            #log('   --- updating %r\n' % path)
+            if hashgen:
+                (ri.cur.gitmode, ri.cur.sha) = hashgen(name)
+                ri.cur.flags |= index.IX_HASHVALID
+            ri.cur.repack()
+        ri.next()
+    else:
+        wi.add(path, pst, hashgen = hashgen)
+        dirty += 1
+    if opt.verbose > 1:  # all files, not just dirs
+        sys.stdout.write('%s\n' % path)
+        sys.stdout.flush()
+    return dirty
+
+
+def merge_indexes(out, r1, r2):
+    log('bup: merging indexes.\n')
+    for e in index._last_writer_wins_iter([r1, r2]):
+        #if e.flags & index.IX_EXISTS:
+            out.add_ixentry(e)
+
+
+class MergeGetter:
+    def __init__(self, l):
+        self.i = iter(l)
+        self.cur = None
+        self.next()
+
+    def next(self):
+        try:
+            self.cur = self.i.next()
+        except StopIteration:
+            self.cur = None
+        return self.cur
+
+
+def update_index(path):
+    ri = index.Reader(indexfile)
+    wi = index.Writer(indexfile)
+    rig = MergeGetter(ri)
+    
+    rpath = index.realpath(path)
+    st = os.lstat(rpath)
+    if opt.xdev:
+        xdev = st.st_dev
+    else:
+        xdev = None
+    f = OsFile('.')
+    if rpath[-1] == '/':
+        rpath = rpath[:-1]
+    (dir, name) = os.path.split(rpath)
+    dir = slashappend(dir)
+    if stat.S_ISDIR(st.st_mode) and (not rpath or rpath[-1] != '/'):
+        name += '/'
+        can_delete_siblings = True
+    else:
+        can_delete_siblings = False
+    OsFile(dir or '/').fchdir()
+    dirty = handle_path(rig, wi, dir, name, st, xdev, can_delete_siblings)
+
+    # make sure all the parents of the updated path exist and are invalidated
+    # if appropriate.
+    while 1:
+        (rpath, junk) = os.path.split(rpath)
+        if not rpath:
+            break
+        elif rpath == '/':
+            p = rpath
+        else:
+            p = rpath + '/'
+        while rig.cur and rig.cur.name > p:
+            #log('FINISHING: %r path=%r d=%r\n' % (rig.cur.name, p, dirty))
+            rig.next()
+        if rig.cur and rig.cur.name == p:
+            if dirty:
+                rig.cur.flags &= ~index.IX_HASHVALID
+                rig.cur.repack()
+        else:
+            wi.add(p, os.lstat(p))
+        if p == '/':
+            break
+    
+    f.fchdir()
+    ri.save()
+    if wi.count:
+        mi = index.Writer(indexfile)
+        merge_indexes(mi, ri, wi.new_reader())
+        mi.close()
+    wi.abort()
+
+
+optspec = """
+bup index <-p|s|m|u> [options...] <filenames...>
+--
+p,print    print the index entries for the given names (also works with -u)
+m,modified print only added/deleted/modified files (implies -p)
+s,status   print each filename with a status char (A/M/D) (implies -p)
+H,hash     print the hash for each object next to its name (implies -p)
+u,update   (recursively) update the index entries for the given filenames
+x,xdev,one-file-system  don't cross filesystem boundaries
+fake-valid    mark all index entries as up-to-date even if they aren't
+f,indexfile=  the name of the index file (default 'bupindex' in the bup repo)
+v,verbose  increase log output (can be used more than once)
+"""
+o = options.Options('bup index', optspec)
+(opt, flags, extra) = o.parse(sys.argv[1:])
+
+if not (opt.modified or opt['print'] or opt.status or opt.update):
+    log('bup index: you must supply one or more of -p, -s, -m, or -u\n')
+    o.usage()
+if opt.fake_valid and not opt.update:
+    log('bup index: --fake-valid is meaningless without -u\n')
+    o.usage()
+
+git.check_repo_or_die()
+indexfile = opt.indexfile or git.repo('bupindex')
+
+paths = index.reduce_paths(extra)
+
+if opt.update:
+    if not paths:
+        log('bup index: update (-u) requested but no paths given\n')
+        o.usage()
+    for (rp, path) in paths:
+        update_index(rp)
+
+if opt['print'] or opt.status or opt.modified:
+    for (name, ent) in index.Reader(indexfile).filter(extra or ['']):
+        if opt.modified and ent.flags & index.IX_HASHVALID:
+            continue
+        line = ''
+        if opt.status:
+            if not ent.flags & index.IX_EXISTS:
+                line += 'D '
+            elif not ent.flags & index.IX_HASHVALID:
+                if ent.sha == index.EMPTY_SHA:
+                    line += 'A '
+                else:
+                    line += 'M '
+            else:
+                line += '  '
+        if opt.hash:
+            line += ent.sha.encode('hex') + ' '
+        print line + (name or './')
+        #print repr(ent)
+
+if saved_errors:  # errors were collected by add_error() during the run
+    log('WARNING: %d errors encountered.\n' % len(saved_errors))
+    sys.exit(1)  # builtin exit() comes from 'site' and is not always present
diff --git a/cmd-join.py b/cmd-join.py
index b87319a..5378629 100755
--- a/cmd-join.py
+++ b/cmd-join.py
@@ -26,5 +26,6 @@ if opt.remote:
     cli.close()
 else:
     for id in extra:
+        #log('id=%r\n' % id)
         for blob in git.cat(id):
             sys.stdout.write(blob)
diff --git a/cmd-save.py b/cmd-save.py
index 7da0429..67cb30c 100755
--- a/cmd-save.py
+++ b/cmd-save.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python2.5
 import sys, re, errno, stat, client
-import hashsplit, git, options
+import hashsplit, git, options, index
 from helpers import *
 
 
@@ -10,90 +10,6 @@ def add_error(e):
     log('\n%s\n' % e)
 
 
-def _direxpand(name):
-    st = os.lstat(name)
-    try:
-        if stat.S_ISDIR(st.st_mode):
-            for sub in os.listdir(name):
-                subfull = os.path.join(name, sub)
-                for fn_st in _direxpand(subfull):
-                    yield fn_st
-        else:
-            yield (name,st)
-    except OSError, e:
-        if e.errno in [errno.ENOENT, errno.EPERM, errno.EACCES]:
-            add_error(e)
-        else:
-            raise
-
-
-def direxpand(names):
-    for n in names:
-        for fn_st in _direxpand(n):
-            yield fn_st
-            
-
-def _normpath(dir):
-    p = os.path.normpath(dir)
-    return (p != '.') and p or ''
-
-
-class Tree:
-    def __init__(self, parent, name):
-        assert(name != '.')
-        assert(not (parent and not name))
-        self.parent = parent
-        self.name = name
-        self.sha = None
-        self.children = {}
-        if self.parent:
-            self.parent.children[self.name] = self
-    
-    def fullpath(self):
-        if self.parent:
-            return os.path.join(self.parent.fullpath(), self.name)
-        else:
-            return self.name
-        
-    def gettop(self):
-        p = self
-        while p.parent:
-            p = p.parent
-        return p
-        
-    def getdir(self, dir):
-        # FIXME: deal with '..' somehow (look at how tar does it)
-        dir = _normpath(dir)
-        if dir.startswith('/'):
-            dir = dir[1:]
-        top = self.gettop()
-        if not dir:
-            return top
-        for part in dir.split('/'):
-            sub = top.children.get(part)
-            if not sub:
-                sub = top.children[part] = Tree(top, part)
-            top = sub
-        return top
-    
-    def addfile(self, mode, fullname, id):
-        (dir, name) = os.path.split(fullname)
-        self.getdir(dir).children[name] = (mode, name, id)
-        
-    def shalist(self, w):
-        for c in self.children.values():
-            if isinstance(c, tuple):  # sha1 entry for a file
-                yield c
-            else:  # tree
-                t = ('40000', c.name, c.gen_tree(w))
-                yield t
-        
-    def gen_tree(self, w):
-        if not self.sha:
-            self.sha = w.new_tree(self.shalist(w))
-        return self.sha
-
-
 optspec = """
 bup save [-tc] [-n name] <filenames...>
 --
@@ -110,6 +26,9 @@ git.check_repo_or_die()
 if not (opt.tree or opt.commit or opt.name):
     log("bup save: use one or more of -t, -c, -n\n")
     o.usage()
+if not extra:
+    log("bup save: no filenames given.\n")
+    o.usage()
 
 if opt.verbose >= 2:
     git.verbose = opt.verbose - 1
@@ -124,26 +43,94 @@ else:
     cli = None
     oldref = refname and git.read_ref(refname) or None
     w = git.PackWriter()
-    
-root = Tree(None, '')
-for (fn,st) in direxpand(extra):
+
+
+def eatslash(dir):
+    if dir.endswith('/'):
+        return dir[:-1]
+    else:
+        return dir
+
+
+parts = ['']
+shalists = [[]]
+
+def _push(part):
+    parts.append(part)
+    shalists.append([])
+
+def _pop():
+    assert(len(parts) > 1)
+    part = parts.pop()
+    shalist = shalists.pop()
+    tree = w.new_tree(shalist)
+    shalists[-1].append(('40000', part, tree))
+
+
+for (transname,ent) in index.Reader(git.repo('bupindex')).filter(extra):
+    (dir, file) = os.path.split(ent.name)
+    exists = (ent.flags & index.IX_EXISTS)
+    hashvalid = (ent.flags & index.IX_HASHVALID) and w.exists(ent.sha)
     if opt.verbose:
-        log('\n%s ' % fn)
-    try:
-        if stat.S_ISREG(st.st_mode):  # regular file
-            f = open(fn)
-            (mode, id) = hashsplit.split_to_blob_or_tree(w, [f])
-        elif stat.S_ISLNK(st.st_mode):  # symlink
-            (mode, id) = ('120000', w.new_blob(os.readlink(fn)))
+        if not exists:
+            status = 'D'
+        elif not hashvalid:
+            if ent.sha == index.EMPTY_SHA:
+                status = 'A'
+            else:
+                status = 'M'
         else:
-            add_error(Exception('skipping special file "%s"' % fn))
-    except IOError, e:
-        add_error(e)
-    except OSError, e:
-        add_error(e)
+            status = ' '
+        log('\n%s %s ' % (status, ent.name))
+
+    if not exists:
+        continue
+
+    assert(dir.startswith('/'))
+    dirp = dir.split('/')
+    while parts > dirp:
+        _pop()
+    for part in dirp[len(parts):]:
+        _push(part)
+
+    if not file:
+        # directory already handled.
+        # FIXME: not using the indexed tree sha1's for anything, which is
+        # a waste.  That's a potential optimization...
+        continue  
+
+    id = None
+    if hashvalid:
+        mode = '%o' % ent.mode
+        id = ent.sha
+        shalists[-1].append((mode, file, id))
     else:
-        root.addfile(mode, fn, id)
-tree = root.gen_tree(w)
+        try:
+            if stat.S_ISREG(ent.mode):
+                f = open(ent.name)
+                (mode, id) = hashsplit.split_to_blob_or_tree(w, [f])
+            elif stat.S_ISDIR(ent.mode):
+                assert(0)  # handled above
+            elif stat.S_ISLNK(ent.mode):
+                (mode, id) = ('120000', w.new_blob(os.readlink(ent.name)))
+            else:
+                add_error(Exception('skipping special file "%s"' % ent.name))
+        except IOError, e:
+            add_error(e)
+        except OSError, e:
+            add_error(e)
+        if id:
+            ent.validate(id)
+            ent.repack()
+            shalists[-1].append((mode, file, id))
+#log('parts out: %r\n' % parts)
+#log('stk out: %r\n' % shalists)
+while len(parts) > 1:
+    _pop()
+#log('parts out: %r\n' % parts)
+#log('stk out: %r\n' % shalists)
+assert(len(shalists) == 1)
+tree = w.new_tree(shalists[-1])
 if opt.verbose:
     log('\n')
 if opt.tree:
diff --git a/cmd-tick.py b/cmd-tick.py
new file mode 100755
index 0000000..da1d003
--- /dev/null
+++ b/cmd-tick.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python2.5
+# 'bup tick': sleep until the wall clock reaches the next whole second.
+import sys, time
+import options
+from helpers import *  # provides log(), used on the error path below
+
+optspec = """
+bup tick
+"""
+o = options.Options('bup tick', optspec)
+(opt, flags, extra) = o.parse(sys.argv[1:])
+
+if extra:
+    log("bup tick: no arguments expected\n")
+    o.usage()
+
+# 1 minus the fractional part of "now" is the time left in this second.
+t = time.time()
+tleft = 1 - (t - int(t))
+time.sleep(tleft)
diff --git a/git.py b/git.py
index ccafa5c..5aa7933 100644
--- a/git.py
+++ b/git.py
@@ -183,11 +183,14 @@ class PackWriter:
     def write(self, type, content):
         return self._write(calc_hash(type, content), type, content)
 
-    def maybe_write(self, type, content):
-        bin = calc_hash(type, content)
+    def exists(self, id):
         if not self.objcache:
             self._make_objcache()
-        if not self.objcache.exists(bin):
+        return self.objcache.exists(id)
+
+    def maybe_write(self, type, content):
+        bin = calc_hash(type, content)
+        if not self.exists(bin):
             self._write(bin, type, content)
             self.objcache.add(bin)
         return bin
@@ -398,9 +401,11 @@ class CatPipe:
         assert(id[0] != '-')
         self.p.stdin.write('%s\n' % id)
         hdr = self.p.stdout.readline()
+        if hdr.endswith(' missing\n'):
+            raise GitError('blob %r is missing' % id)
         spl = hdr.split(' ')
-        assert(len(spl) == 3)
-        assert(len(spl[0]) == 40)
+        if len(spl) != 3 or len(spl[0]) != 40:
+            raise GitError('expected blob, got %r' % spl)
         (hex, type, size) = spl
         yield type
         for blob in chunkyreader(self.p.stdout, int(spl[2])):
@@ -437,7 +442,8 @@ class CatPipe:
             for blob in self.join(treeline[5:]):
                 yield blob
         else:
-            raise GitError('unknown object type %r' % type)
+            raise GitError('invalid object type %r: expected blob/tree/commit'
+                           % type)
 
     def join(self, id):
         for d in self._join(self.get(id)):
diff --git a/helpers.py b/helpers.py
index b478d61..b0b054d 100644
--- a/helpers.py
+++ b/helpers.py
@@ -112,3 +112,11 @@ def chunkyreader(f, count = None):
             b = f.read(65536)
             if not b: break
             yield b
+
+
+def slashappend(s):
+    if s and not s.endswith('/'):
+        return s + '/'
+    else:
+        return s
+
diff --git a/index.py b/index.py
new file mode 100644
index 0000000..9a746eb
--- /dev/null
+++ b/index.py
@@ -0,0 +1,268 @@
+import os, stat, time, struct, tempfile, mmap, errno
+from helpers import *
+
+EMPTY_SHA = '\0'*20
+FAKE_SHA = '\x01'*20
+INDEX_HDR = 'BUPI\0\0\0\1'
+INDEX_SIG = '!IIIIIQII20sH'
+ENTLEN = struct.calcsize(INDEX_SIG)
+
+IX_EXISTS = 0x8000
+IX_HASHVALID = 0x4000
+
+class Error(Exception):
+    pass
+
+
+class Entry:
+    def __init__(self, name, m, ofs, tstart):
+        self._m = m
+        self._ofs = ofs
+        self.name = str(name)
+        self.tstart = tstart
+        (self.dev, self.ctime, self.mtime, self.uid, self.gid,
+         self.size, self.mode, self.gitmode, self.sha,
+         self.flags) = struct.unpack(INDEX_SIG, str(buffer(m, ofs, ENTLEN)))
+
+    def __repr__(self):
+        return ("(%s,0x%04x,%d,%d,%d,%d,%d,0x%04x)" 
+                % (self.name, self.dev,
+                   self.ctime, self.mtime, self.uid, self.gid,
+                   self.size, self.flags))
+
+    def packed(self):
+        return struct.pack(INDEX_SIG,
+                           self.dev, self.ctime, self.mtime, 
+                           self.uid, self.gid, self.size, self.mode,
+                           self.gitmode, self.sha, self.flags)
+
+    def repack(self):
+        self._m[self._ofs:self._ofs+ENTLEN] = self.packed()
+
+    def from_stat(self, st):
+        old = (self.dev, self.ctime, self.mtime,
+               self.uid, self.gid, self.size, self.flags & IX_EXISTS)
+        new = (st.st_dev, int(st.st_ctime), int(st.st_mtime),
+               st.st_uid, st.st_gid, st.st_size, IX_EXISTS)
+        self.dev = st.st_dev
+        self.ctime = int(st.st_ctime)
+        self.mtime = int(st.st_mtime)
+        self.uid = st.st_uid
+        self.gid = st.st_gid
+        self.size = st.st_size
+        self.mode = st.st_mode
+        self.flags |= IX_EXISTS
+        if int(st.st_ctime) >= self.tstart or old != new:
+            self.flags &= ~IX_HASHVALID
+            return 1  # dirty
+        else:
+            return 0  # not dirty
+
+    def validate(self, sha):
+        assert(sha)
+        self.sha = sha
+        self.flags |= IX_HASHVALID
+
+    def __cmp__(a, b):
+        return cmp(a.name, b.name)
+            
+
+class Reader:
+    def __init__(self, filename):
+        self.filename = filename
+        self.m = ''
+        self.writable = False
+        f = None
+        try:
+            f = open(filename, 'r+')
+        except IOError, e:
+            if e.errno == errno.ENOENT:
+                pass
+            else:
+                raise
+        if f:
+            b = f.read(len(INDEX_HDR))
+            if b != INDEX_HDR:
+                raise Error('%s: header: expected %r, got %r'
+                                 % (filename, INDEX_HDR, b))
+            st = os.fstat(f.fileno())
+            if st.st_size:
+                self.m = mmap.mmap(f.fileno(), 0,
+                                   mmap.MAP_SHARED,
+                                   mmap.PROT_READ|mmap.PROT_WRITE)
+                f.close()  # map will persist beyond file close
+                self.writable = True
+
+    def __del__(self):
+        self.save()
+
+    def __iter__(self):
+        tstart = int(time.time())
+        ofs = len(INDEX_HDR)
+        while ofs < len(self.m):
+            eon = self.m.find('\0', ofs)
+            assert(eon >= 0)
+            yield Entry(buffer(self.m, ofs, eon-ofs),
+                          self.m, eon+1, tstart = tstart)
+            ofs = eon + 1 + ENTLEN
+
+    def save(self):
+        if self.writable:
+            self.m.flush()
+
+    def filter(self, prefixes):
+        #log("filtering %r\n" % prefixes)
+        paths = reduce_paths(prefixes)
+        #log("filtering %r\n" % paths)
+        pi = iter(paths)
+        (rpin, pin) = pi.next()
+        for ent in self:
+            #log('checking %r vs %r\n' % (ent.name, rpin))
+            while ent.name < rpin:
+                try:
+                    (rpin, pin) = pi.next()
+                except StopIteration:
+                    return  # no more files can possibly match
+            if not ent.name.startswith(rpin):
+                continue   # not interested
+            else:
+                name = pin + ent.name[len(rpin):]
+                yield (name, ent)
+
+
+# Read all the iters in order; when more than one iter has the same entry,
+# the *later* iter in the list wins.  (ie. more recent iter entries replace
+# older ones)
+def _last_writer_wins_iter(iters):
+    l = []
+    for e in iters:
+        it = iter(e)
+        try:
+            l.append([it.next(), it])
+        except StopIteration:
+            pass
+    del iters  # to avoid accidents
+    while l:
+        l.sort()
+        mv = l[0][0]
+        mi = []
+        for (i,(v,it)) in enumerate(l):
+            #log('(%d) considering %d: %r\n' % (len(l), i, v))
+            if v > mv:
+                mv = v
+                mi = [i]
+            elif v == mv:
+                mi.append(i)
+        yield mv
+        for i in mi:
+            try:
+                l[i][0] = l[i][1].next()
+            except StopIteration:
+                l[i] = None
+        l = filter(None, l)
+
+
+class Writer:
+    def __init__(self, filename):
+        self.f = None
+        self.count = 0
+        self.lastfile = None
+        self.filename = None
+        self.filename = filename = realpath(filename)
+        (dir,name) = os.path.split(filename)
+        (ffd,self.tmpname) = tempfile.mkstemp('.tmp', filename, dir)
+        self.f = os.fdopen(ffd, 'wb', 65536)
+        self.f.write(INDEX_HDR)
+
+    def __del__(self):
+        self.abort()
+
+    def abort(self):
+        f = self.f
+        self.f = None
+        if f:
+            f.close()
+            os.unlink(self.tmpname)
+
+    def close(self):
+        f = self.f
+        self.f = None
+        if f:
+            f.close()
+            os.rename(self.tmpname, self.filename)
+
+    def _write(self, data):
+        self.f.write(data)
+        self.count += 1
+
+    def add(self, name, st, hashgen=None):
+        #log('ADDING %r\n' % name)
+        if self.lastfile:
+            assert(cmp(self.lastfile, name) > 0) # reverse order only
+        self.lastfile = name
+        flags = IX_EXISTS
+        sha = None
+        if hashgen:
+            (gitmode, sha) = hashgen(name)
+            if sha:
+                flags |= IX_HASHVALID
+        else:
+            (gitmode, sha) = (0, EMPTY_SHA)
+        data = name + '\0' + \
+            struct.pack(INDEX_SIG, st.st_dev, int(st.st_ctime),
+                        int(st.st_mtime), st.st_uid, st.st_gid,
+                        st.st_size, st.st_mode, gitmode, sha, flags)
+        self._write(data)
+
+    def add_ixentry(self, e):
+        if self.lastfile and self.lastfile <= e.name:
+            raise Error('%r must come before %r' 
+                             % (e.name, self.lastfile))
+        self.lastfile = e.name
+        data = e.name + '\0' + e.packed()
+        self._write(data)
+
+    def new_reader(self):
+        self.f.flush()
+        return Reader(self.tmpname)
+
+
+# like os.path.realpath, but doesn't follow a symlink for the last element.
+# (i.e. if 'p' itself is a symlink, this one won't follow it)
+def realpath(p):
+    try:
+        st = os.lstat(p)
+    except OSError:
+        st = None
+    if st and stat.S_ISLNK(st.st_mode):
+        (dir, name) = os.path.split(p)
+        dir = os.path.realpath(dir)
+        out = os.path.join(dir, name)
+    else:
+        out = os.path.realpath(p)
+    #log('realpathing:%r,%r\n' % (p, out))
+    return out
+
+
+def reduce_paths(paths):  # -> list of (realpath, given path), reverse-sorted
+    xpaths = []
+    for p in paths:
+        rp = realpath(p)
+        st = os.lstat(rp)  # raises OSError if the path does not exist
+        if stat.S_ISDIR(st.st_mode):
+            rp = slashappend(rp)  # directories always carry a trailing '/'
+            p = slashappend(p)
+        xpaths.append((rp, p))
+    xpaths.sort()  # ascending, so a directory sorts just before its contents
+
+    paths = []
+    prev = None
+    for (rp, p) in xpaths:
+        if prev and (prev == rp 
+                     or (prev.endswith('/') and rp.startswith(prev))):
+            continue # already superseded by previous path
+        paths.append((rp, p))
+        prev = rp
+    paths.sort(reverse=True)  # result is returned in descending order
+    return paths
+
diff --git a/t/test.sh b/t/test.sh
new file mode 100755
index 0000000..7e994c1
--- /dev/null
+++ b/t/test.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+. wvtest.sh
+#set -e
+
+TOP="$(pwd)"
+export BUP_DIR="$TOP/buptest.tmp"
+
+bup()
+{
+    "$TOP/bup" "$@"
+}
+
+WVSTART "init"
+
+#set -x
+rm -rf "$BUP_DIR"
+WVPASS bup init
+
+WVSTART "index"
+D=bupdata.tmp
+rm -rf $D
+mkdir $D
+WVPASSEQ "$(bup index -p)" ""
+WVPASSEQ "$(bup index -p $D)" ""
+WVFAIL [ -e $D.fake ]
+WVFAIL bup index -u $D.fake
+WVPASS bup index -u $D
+WVPASSEQ "$(bup index -p $D)" "$D/"
+touch $D/a $D/b
+mkdir $D/d $D/d/e
+WVPASSEQ "$(bup index -s $D/)" "A $D/"
+WVPASSEQ "$(bup index -s $D/b)" ""
+bup tick
+WVPASSEQ "$(bup index -us $D/b)" "A $D/b"
+WVPASSEQ "$(bup index -us $D)" \
+"A $D/d/e/
+A $D/d/
+A $D/b
+A $D/a
+A $D/"
+WVPASSEQ "$(bup index -us $D/a $D/b --fake-valid)" \
+"  $D/b
+  $D/a"
+WVPASSEQ "$(bup index -us $D/a)" "  $D/a"  # stays unmodified
+touch $D/a
+WVPASS bup index -u $D/a  # becomes modified
+WVPASSEQ "$(bup index -s $D/a $D $D/b)" \
+"A $D/d/e/
+A $D/d/
+  $D/b
+M $D/a
+A $D/"
+WVPASSEQ "$(cd $D && bup index -m .)" \
+"./d/e/
+./d/
+./a
+./"
+WVPASSEQ "$(cd $D && bup index -m)" \
+"d/e/
+d/
+a
+./"
+WVPASSEQ "$(cd $D && bup index -s .)" "$(cd $D && bup index -s .)"
+
+
+WVSTART "split"
+WVPASS bup split --bench -b <testfile1 >tags1.tmp
+WVPASS bup split -vvvv -b testfile2 >tags2.tmp
+WVPASS bup split -t testfile2 >tags2t.tmp
+WVPASS bup split -t testfile2 --fanout 3 >tags2tf.tmp
+WVPASS bup split -r "$BUP_DIR" -c testfile2 >tags2c.tmp
+WVPASS ls -lR \
+   | WVPASS bup split -r "$BUP_DIR" -c --fanout 3 --max-pack-objects 3 -n lslr
+WVFAIL diff -u tags1.tmp tags2.tmp
+
+# fanout must be different from non-fanout
+WVFAIL diff -q tags2t.tmp tags2tf.tmp
+wc -c testfile1 testfile2
+wc -l tags1.tmp tags2.tmp
+
+WVSTART "join"
+WVPASS bup join $(cat tags1.tmp) >out1.tmp
+WVPASS bup join <tags2.tmp >out2.tmp
+WVPASS bup join <tags2t.tmp >out2t.tmp
+WVPASS bup join -r "$BUP_DIR" <tags2c.tmp >out2c.tmp
+WVPASS diff -u testfile1 out1.tmp
+WVPASS diff -u testfile2 out2.tmp
+WVPASS diff -u testfile2 out2t.tmp
+WVPASS diff -u testfile2 out2c.tmp
+
+WVSTART "save/fsck"
+(
+    set -e
+    cd "$BUP_DIR" || exit 1
+    #git repack -Ad
+    #git prune
+    (cd "$TOP/t/sampledata" && WVPASS bup save -vvn master .) || WVFAIL
+    n=$(git fsck --full --strict 2>&1 | 
+	  egrep -v 'dangling (commit|tree)' |
+	  tee -a /dev/stderr | 
+	  wc -l)
+    WVPASS [ "$n" -eq 0 ]
+) || exit 1
diff --git a/t/tindex.py b/t/tindex.py
new file mode 100644
index 0000000..9922b8f
--- /dev/null
+++ b/t/tindex.py
@@ -0,0 +1,13 @@
+import os
+import index
+from wvtest import *
+
+@wvtest
+def testbasic():
+    cd = os.path.realpath('')
+    WVPASS(cd)
+    sd = os.path.realpath('t/sampledata')
+    WVPASSEQ(index.realpath('t/sampledata'), cd + '/t/sampledata')
+    WVPASSEQ(os.path.realpath('t/sampledata/x'), sd + '/x')
+    WVPASSEQ(os.path.realpath('t/sampledata/etc'), '/etc')
+    WVPASSEQ(index.realpath('t/sampledata/etc'), sd + '/etc')
diff --git a/test-sh b/test-sh
deleted file mode 100755
index f48be99..0000000
--- a/test-sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/bash
-set -e
-echo "Testing \"integration\" in $0:"
-
-TOP="$(pwd)"
-export BUP_DIR="$TOP/buptest.tmp"
-
-bup()
-{
-    "$TOP/bup" "$@"
-}
-
-set -x
-rm -rf "$BUP_DIR"
-bup init
-bup split --bench -b <testfile1 >tags1.tmp
-bup split -vvvv -b testfile2 >tags2.tmp
-bup split -t testfile2 >tags2t.tmp
-bup split -t testfile2 --fanout 3 >tags2tf.tmp
-bup split -r "$BUP_DIR" -c testfile2 >tags2c.tmp
-ls -lR | bup split -r "$BUP_DIR" -c --fanout 3 --max-pack-objects 3 -n lslr
-diff -u tags1.tmp tags2.tmp || true
-if diff -q tags2t.tmp tags2tf.tmp; then
-    echo "fanout tree same as non-fanout tree!?"
-    false
-fi
-wc -c testfile1 testfile2
-wc -l tags1.tmp tags2.tmp
-bup join $(cat tags1.tmp) >out1.tmp
-bup join <tags2.tmp >out2.tmp
-bup join <tags2t.tmp >out2t.tmp
-bup join -r "$BUP_DIR" <tags2c.tmp >out2c.tmp
-diff -u testfile1 out1.tmp
-diff -u testfile2 out2.tmp
-diff -u testfile2 out2t.tmp
-diff -u testfile2 out2c.tmp
-
-(
-    set -e
-    cd "$BUP_DIR" || exit 1
-    #git repack -Ad
-    #git prune
-    (cd "$TOP/t/sampledata" && bup save -vvn master .) || exit 1
-    n=$(git fsck --full --strict 2>&1 | 
-	  egrep -v 'dangling (commit|tree)' |
-	  tee -a /dev/stderr | 
-	  wc -l)
-    if [ "$n" -ne 0 ]; then
-        echo "git fsck error."
-        exit 5
-    fi
-) || exit 1
diff --git a/wvtest.sh b/wvtest.sh
new file mode 100644
index 0000000..90bdc90
--- /dev/null
+++ b/wvtest.sh
@@ -0,0 +1,89 @@
+# we don't quote $TEXT in case it contains newlines; newlines
+# aren't allowed in test output.  However, we set -f so that
+# at least shell glob characters aren't processed.
+_textclean()
+{
+	( set -f; echo $* )
+}
+
+_wvcheck()
+{
+	CODE="$1"
+	TEXT=$(_textclean "$2")
+	OK=ok
+	if [ "$CODE" -ne 0 ]; then
+		OK=FAILED
+	fi
+	echo "! ${BASH_SOURCE[2]}:${BASH_LINENO[1]}  $TEXT  $OK" >&2
+	if [ "$CODE" -ne 0 ]; then
+		exit $CODE
+	else
+		return 0
+	fi
+}
+
+
+WVPASS()
+{
+	TEXT="$*"
+	
+	if "$@"; then
+		_wvcheck 0 "$TEXT"
+		return 0
+	else
+		_wvcheck 1 "$TEXT"
+		# NOTREACHED
+		return 1
+	fi
+}
+
+
+WVFAIL()
+{
+	TEXT="$*"
+	
+	if "$@"; then
+		_wvcheck 1 "NOT($TEXT)"
+		# NOTREACHED
+		return 1
+	else
+		_wvcheck 0 "NOT($TEXT)"
+		return 0
+	fi
+}
+
+
+_wvgetrv()
+{
+	( "$@" >&2 )
+	echo -n $?
+}
+
+
+WVPASSEQ()
+{
+	WVPASS [ "$#" -eq 2 ]
+	echo "Comparing:" >&2
+	echo "$1" >&2
+	echo "--" >&2
+	echo "$2" >&2
+	_wvcheck $(_wvgetrv [ "$1" = "$2" ]) "'$1' = '$2'"
+}
+
+
+WVPASSNE()
+{
+	WVPASS [ "$#" -eq 2 ]
+	echo "Comparing:" >&2
+	echo "$1" >&2
+	echo "--" >&2
+	echo "$2" >&2
+	_wvcheck $(_wvgetrv [ "$1" != "$2" ]) "'$1' != '$2'"
+}
+
+
+WVSTART()
+{
+	echo >&2
+	echo "Testing \"$*\" in ${BASH_SOURCE[1]}:" >&2
+}