arthur.barton.de Git - bup.git/commitdiff
cmd-save: completely reimplement using the indexfile.
author: Avery Pennarun <apenwarr@gmail.com>
Sun, 10 Jan 2010 03:43:48 +0000 (22:43 -0500)
committer: Avery Pennarun <apenwarr@gmail.com>
Sun, 10 Jan 2010 06:11:09 +0000 (01:11 -0500)
'bup save' no longer walks the filesystem: instead it walks the indexfile
(which is much faster) and doesn't bother opening any files that haven't had
an attribute change, since it can just reuse their sha1 from before.  That
makes it *much* faster in the common case.

cmd-index.py
cmd-save.py
git.py
index.py
t/tindex.py [new file with mode: 0644]

index 85b0f6c399ec16ca3d7e8f78e1287302665986d2..ef596b7d7909b6b43dc9d4f56a6ff2c9809cec82 100755 (executable)
@@ -32,7 +32,7 @@ def handle_path(ri, wi, dir, name, pst, xdev, can_delete_siblings):
     hashgen = None
     if opt.fake_valid:
         def hashgen(name):
-            return index.FAKE_SHA
+            return (0, index.FAKE_SHA)
     
     dirty = 0
     path = dir + name
@@ -62,7 +62,7 @@ def handle_path(ri, wi, dir, name, pst, xdev, can_delete_siblings):
                     continue
                 if xdev != None and st.st_dev != xdev:
                     log('Skipping %r: different filesystem.\n' 
-                        % os.path.realpath(p))
+                        % index.realpath(p))
                     continue
                 if stat.S_ISDIR(st.st_mode):
                     p = slashappend(p)
@@ -86,7 +86,7 @@ def handle_path(ri, wi, dir, name, pst, xdev, can_delete_siblings):
         if dirty or not (ri.cur.flags & index.IX_HASHVALID):
             #log('   --- updating %r\n' % path)
             if hashgen:
-                ri.cur.sha = hashgen(name)
+                (ri.cur.gitmode, ri.cur.sha) = hashgen(name)
                 ri.cur.flags |= index.IX_HASHVALID
             ri.cur.repack()
         ri.next()
@@ -125,7 +125,7 @@ def update_index(path):
     wi = index.Writer(indexfile)
     rig = MergeGetter(ri)
     
-    rpath = os.path.realpath(path)
+    rpath = index.realpath(path)
     st = os.lstat(rpath)
     if opt.xdev:
         xdev = st.st_dev
@@ -181,6 +181,7 @@ bup index <-p|s|m|u> [options...] <filenames...>
 p,print    print the index entries for the given names (also works with -u)
 m,modified print only added/deleted/modified files (implies -p)
 s,status   print each filename with a status char (A/M/D) (implies -p)
+H,hash     print the hash for each object next to its name (implies -p)
 u,update   (recursively) update the index entries for the given filenames
 x,xdev,one-file-system  don't cross filesystem boundaries
 fake-valid    mark all index entries as up-to-date even if they aren't
@@ -192,10 +193,10 @@ o = options.Options('bup index', optspec)
 
 if not (opt.modified or opt['print'] or opt.status or opt.update):
     log('bup index: you must supply one or more of -p, -s, -m, or -u\n')
-    exit(97)
+    o.usage()
 if opt.fake_valid and not opt.update:
     log('bup index: --fake-valid is meaningless without -u\n')
-    exit(96)
+    o.usage()
 
 git.check_repo_or_die()
 indexfile = opt.indexfile or git.repo('bupindex')
@@ -205,7 +206,7 @@ paths = index.reduce_paths(extra)
 if opt.update:
     if not paths:
         log('bup index: update (-u) requested but no paths given\n')
-        exit(96)
+        o.usage()
     for (rp, path) in paths:
         update_index(rp)
 
@@ -213,18 +214,20 @@ if opt['print'] or opt.status or opt.modified:
     for (name, ent) in index.Reader(indexfile).filter(extra or ['']):
         if opt.modified and ent.flags & index.IX_HASHVALID:
             continue
+        line = ''
         if opt.status:
             if not ent.flags & index.IX_EXISTS:
-                print 'D ' + name
+                line += 'D '
             elif not ent.flags & index.IX_HASHVALID:
                 if ent.sha == index.EMPTY_SHA:
-                    print 'A ' + name
+                    line += 'A '
                 else:
-                    print 'M ' + name
+                    line += 'M '
             else:
-                print '  ' + name
-        else:
-            print name
+                line += '  '
+        if opt.hash:
+            line += ent.sha.encode('hex') + ' '
+        print line + (name or './')
         #print repr(ent)
 
 if saved_errors:
index 7da0429503cff90286f0b680249151db51590aab..67cb30c2b4db06c38ea936e3214eee5844ed6c5f 100755 (executable)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python2.5
 import sys, re, errno, stat, client
-import hashsplit, git, options
+import hashsplit, git, options, index
 from helpers import *
 
 
@@ -10,90 +10,6 @@ def add_error(e):
     log('\n%s\n' % e)
 
 
-def _direxpand(name):
-    st = os.lstat(name)
-    try:
-        if stat.S_ISDIR(st.st_mode):
-            for sub in os.listdir(name):
-                subfull = os.path.join(name, sub)
-                for fn_st in _direxpand(subfull):
-                    yield fn_st
-        else:
-            yield (name,st)
-    except OSError, e:
-        if e.errno in [errno.ENOENT, errno.EPERM, errno.EACCES]:
-            add_error(e)
-        else:
-            raise
-
-
-def direxpand(names):
-    for n in names:
-        for fn_st in _direxpand(n):
-            yield fn_st
-            
-
-def _normpath(dir):
-    p = os.path.normpath(dir)
-    return (p != '.') and p or ''
-
-
-class Tree:
-    def __init__(self, parent, name):
-        assert(name != '.')
-        assert(not (parent and not name))
-        self.parent = parent
-        self.name = name
-        self.sha = None
-        self.children = {}
-        if self.parent:
-            self.parent.children[self.name] = self
-    
-    def fullpath(self):
-        if self.parent:
-            return os.path.join(self.parent.fullpath(), self.name)
-        else:
-            return self.name
-        
-    def gettop(self):
-        p = self
-        while p.parent:
-            p = p.parent
-        return p
-        
-    def getdir(self, dir):
-        # FIXME: deal with '..' somehow (look at how tar does it)
-        dir = _normpath(dir)
-        if dir.startswith('/'):
-            dir = dir[1:]
-        top = self.gettop()
-        if not dir:
-            return top
-        for part in dir.split('/'):
-            sub = top.children.get(part)
-            if not sub:
-                sub = top.children[part] = Tree(top, part)
-            top = sub
-        return top
-    
-    def addfile(self, mode, fullname, id):
-        (dir, name) = os.path.split(fullname)
-        self.getdir(dir).children[name] = (mode, name, id)
-        
-    def shalist(self, w):
-        for c in self.children.values():
-            if isinstance(c, tuple):  # sha1 entry for a file
-                yield c
-            else:  # tree
-                t = ('40000', c.name, c.gen_tree(w))
-                yield t
-        
-    def gen_tree(self, w):
-        if not self.sha:
-            self.sha = w.new_tree(self.shalist(w))
-        return self.sha
-
-
 optspec = """
 bup save [-tc] [-n name] <filenames...>
 --
@@ -110,6 +26,9 @@ git.check_repo_or_die()
 if not (opt.tree or opt.commit or opt.name):
     log("bup save: use one or more of -t, -c, -n\n")
     o.usage()
+if not extra:
+    log("bup save: no filenames given.\n")
+    o.usage()
 
 if opt.verbose >= 2:
     git.verbose = opt.verbose - 1
@@ -124,26 +43,94 @@ else:
     cli = None
     oldref = refname and git.read_ref(refname) or None
     w = git.PackWriter()
-    
-root = Tree(None, '')
-for (fn,st) in direxpand(extra):
+
+
+def eatslash(dir):
+    if dir.endswith('/'):
+        return dir[:-1]
+    else:
+        return dir
+
+
+parts = ['']
+shalists = [[]]
+
+def _push(part):
+    parts.append(part)
+    shalists.append([])
+
+def _pop():
+    assert(len(parts) > 1)
+    part = parts.pop()
+    shalist = shalists.pop()
+    tree = w.new_tree(shalist)
+    shalists[-1].append(('40000', part, tree))
+
+
+for (transname,ent) in index.Reader(git.repo('bupindex')).filter(extra):
+    (dir, file) = os.path.split(ent.name)
+    exists = (ent.flags & index.IX_EXISTS)
+    hashvalid = (ent.flags & index.IX_HASHVALID) and w.exists(ent.sha)
     if opt.verbose:
-        log('\n%s ' % fn)
-    try:
-        if stat.S_ISREG(st.st_mode):  # regular file
-            f = open(fn)
-            (mode, id) = hashsplit.split_to_blob_or_tree(w, [f])
-        elif stat.S_ISLNK(st.st_mode):  # symlink
-            (mode, id) = ('120000', w.new_blob(os.readlink(fn)))
+        if not exists:
+            status = 'D'
+        elif not hashvalid:
+            if ent.sha == index.EMPTY_SHA:
+                status = 'A'
+            else:
+                status = 'M'
         else:
-            add_error(Exception('skipping special file "%s"' % fn))
-    except IOError, e:
-        add_error(e)
-    except OSError, e:
-        add_error(e)
+            status = ' '
+        log('\n%s %s ' % (status, ent.name))
+
+    if not exists:
+        continue
+
+    assert(dir.startswith('/'))
+    dirp = dir.split('/')
+    while parts > dirp:
+        _pop()
+    for part in dirp[len(parts):]:
+        _push(part)
+
+    if not file:
+        # directory already handled.
+        # FIXME: not using the indexed tree sha1's for anything, which is
+        # a waste.  That's a potential optimization...
+        continue  
+
+    id = None
+    if hashvalid:
+        mode = '%o' % ent.mode
+        id = ent.sha
+        shalists[-1].append((mode, file, id))
     else:
-        root.addfile(mode, fn, id)
-tree = root.gen_tree(w)
+        try:
+            if stat.S_ISREG(ent.mode):
+                f = open(ent.name)
+                (mode, id) = hashsplit.split_to_blob_or_tree(w, [f])
+            elif stat.S_ISDIR(ent.mode):
+                assert(0)  # handled above
+            elif stat.S_ISLNK(ent.mode):
+                (mode, id) = ('120000', w.new_blob(os.readlink(ent.name)))
+            else:
+                add_error(Exception('skipping special file "%s"' % ent.name))
+        except IOError, e:
+            add_error(e)
+        except OSError, e:
+            add_error(e)
+        if id:
+            ent.validate(id)
+            ent.repack()
+            shalists[-1].append((mode, file, id))
+#log('parts out: %r\n' % parts)
+#log('stk out: %r\n' % shalists)
+while len(parts) > 1:
+    _pop()
+#log('parts out: %r\n' % parts)
+#log('stk out: %r\n' % shalists)
+assert(len(shalists) == 1)
+tree = w.new_tree(shalists[-1])
 if opt.verbose:
     log('\n')
 if opt.tree:
diff --git a/git.py b/git.py
index 6775bb2876a7d8c10d98b65027db432a4438fe77..5aa7933a520373f717ece351bd1472eaa6984797 100644 (file)
--- a/git.py
+++ b/git.py
@@ -183,11 +183,14 @@ class PackWriter:
     def write(self, type, content):
         return self._write(calc_hash(type, content), type, content)
 
-    def maybe_write(self, type, content):
-        bin = calc_hash(type, content)
+    def exists(self, id):
         if not self.objcache:
             self._make_objcache()
-        if not self.objcache.exists(bin):
+        return self.objcache.exists(id)
+
+    def maybe_write(self, type, content):
+        bin = calc_hash(type, content)
+        if not self.exists(bin):
             self._write(bin, type, content)
             self.objcache.add(bin)
         return bin
index f541dec9805d4f4b96af4bdacb37daf2522ca361..6d302bb7e5d9951db3b339d45dc33c47c793bb4f 100644 (file)
--- a/index.py
+++ b/index.py
@@ -4,7 +4,7 @@ from helpers import *
 EMPTY_SHA = '\0'*20
 FAKE_SHA = '\x01'*20
 INDEX_HDR = 'BUPI\0\0\0\1'
-INDEX_SIG = '!IIIIIQ20sH'
+INDEX_SIG = '!IIIIIQII20sH'
 ENTLEN = struct.calcsize(INDEX_SIG)
 
 IX_EXISTS = 0x8000
@@ -21,7 +21,7 @@ class Entry:
         self.name = str(name)
         self.tstart = tstart
         (self.dev, self.ctime, self.mtime, self.uid, self.gid,
-         self.size, self.sha,
+         self.size, self.mode, self.gitmode, self.sha,
          self.flags) = struct.unpack(INDEX_SIG, buffer(m, ofs, ENTLEN))
 
     def __repr__(self):
@@ -31,9 +31,10 @@ class Entry:
                    self.size, self.flags))
 
     def packed(self):
-        return struct.pack(INDEX_SIG, self.dev, self.ctime, self.mtime,
-                           self.uid, self.gid, self.size, self.sha,
-                           self.flags)
+        return struct.pack(INDEX_SIG,
+                           self.dev, self.ctime, self.mtime, 
+                           self.uid, self.gid, self.size, self.mode,
+                           self.gitmode, self.sha, self.flags)
 
     def repack(self):
         self._m[self._ofs:self._ofs+ENTLEN] = self.packed()
@@ -49,6 +50,7 @@ class Entry:
         self.uid = st.st_uid
         self.gid = st.st_gid
         self.size = st.st_size
+        self.mode = st.st_mode
         self.flags |= IX_EXISTS
         if int(st.st_ctime) >= self.tstart or old != new:
             self.flags &= ~IX_HASHVALID
@@ -56,6 +58,11 @@ class Entry:
         else:
             return 0  # not dirty
 
+    def validate(self, sha):
+        assert(sha)
+        self.sha = sha
+        self.flags |= IX_HASHVALID
+
     def __cmp__(a, b):
         return cmp(a.name, b.name)
             
@@ -120,7 +127,7 @@ class Reader:
                 continue   # not interested
             else:
                 name = pin + ent.name[len(rpin):]
-                yield (name or './', ent)
+                yield (name, ent)
 
 
 # Read all the iters in order; when more than one iter has the same entry,
@@ -161,7 +168,7 @@ class Writer:
         self.count = 0
         self.lastfile = None
         self.filename = None
-        self.filename = filename = os.path.realpath(filename)
+        self.filename = filename = realpath(filename)
         (dir,name) = os.path.split(filename)
         (ffd,self.tmpname) = tempfile.mkstemp('.tmp', filename, dir)
         self.f = os.fdopen(ffd, 'wb', 65536)
@@ -196,15 +203,15 @@ class Writer:
         flags = IX_EXISTS
         sha = None
         if hashgen:
-            sha = hashgen(name)
+            (gitmode, sha) = hashgen(name)
             if sha:
                 flags |= IX_HASHVALID
         else:
-            sha = EMPTY_SHA
+            (gitmode, sha) = (0, EMPTY_SHA)
         data = name + '\0' + \
             struct.pack(INDEX_SIG, st.st_dev, int(st.st_ctime),
                         int(st.st_mtime), st.st_uid, st.st_gid,
-                        st.st_size, sha, flags)
+                        st.st_size, st.st_mode, gitmode, sha, flags)
         self._write(data)
 
     def add_ixentry(self, e):
@@ -220,10 +227,27 @@ class Writer:
         return Reader(self.tmpname)
 
 
+# like os.path.realpath, but doesn't follow a symlink for the last element.
+# (i.e. if 'p' itself is a symlink, this one won't follow it)
+def realpath(p):
+    try:
+        st = os.lstat(p)
+    except OSError:
+        st = None
+    if st and stat.S_ISLNK(st.st_mode):
+        (dir, name) = os.path.split(p)
+        dir = os.path.realpath(dir)
+        out = os.path.join(dir, name)
+    else:
+        out = os.path.realpath(p)
+    #log('realpathing:%r,%r\n' % (p, out))
+    return out
+
+
 def reduce_paths(paths):
     xpaths = []
     for p in paths:
-        rp = os.path.realpath(p)
+        rp = realpath(p)
         st = os.lstat(rp)
         if stat.S_ISDIR(st.st_mode):
             rp = slashappend(rp)
diff --git a/t/tindex.py b/t/tindex.py
new file mode 100644 (file)
index 0000000..9922b8f
--- /dev/null
@@ -0,0 +1,13 @@
+import os
+import index
+from wvtest import *
+
+@wvtest
+def testbasic():
+    cd = os.path.realpath('')
+    WVPASS(cd)
+    sd = os.path.realpath('t/sampledata')
+    WVPASSEQ(index.realpath('t/sampledata'), cd + '/t/sampledata')
+    WVPASSEQ(os.path.realpath('t/sampledata/x'), sd + '/x')
+    WVPASSEQ(os.path.realpath('t/sampledata/etc'), '/etc')
+    WVPASSEQ(index.realpath('t/sampledata/etc'), sd + '/etc')