arthur.barton.de Git - bup.git/commitdiff
index: only collect metadata for stale paths
author: Rob Browning <rlb@defaultvalue.org>
Sun, 25 Dec 2016 17:44:56 +0000 (11:44 -0600)
committer: Rob Browning <rlb@defaultvalue.org>
Mon, 26 Dec 2016 18:18:47 +0000 (12:18 -0600)
Stop collecting all of the metadata for every path, even if the path
hasn't changed since the last save.  Further, rework the code to
short-circuit some other unnecessary work.

To support this, split Entry.from_stat() into two parts: stale(), which
tests to see if an entry has "materially" changed, and
update_from_stat(), which updates the entry to match the stat information
provided.

This should substantially decrease the indexing cost for paths that
haven't changed since the last save.

While we're here, rename hashgen to fake_hash so it's clearer that
faking hashes is its only purpose.

Signed-off-by: Rob Browning <rlb@defaultvalue.org>
Tested-by: Rob Browning <rlb@defaultvalue.org>
cmd/index-cmd.py
lib/bup/index.py
lib/bup/t/tindex.py

index ab97e928407bc21e081e5af5b9b64ddbe1da339d..bb21b4ec23e5fd8e8b3ef67485575f36bd29e92a 100755 (executable)
@@ -82,9 +82,9 @@ def update_index(top, excluded_paths, exclude_rxs, xdev_exceptions):
 
     hlinks = hlinkdb.HLinkDB(indexfile + '.hlink')
 
-    hashgen = None
+    fake_hash = None
     if opt.fake_valid:
-        def hashgen(name):
+        def fake_hash(name):
             return (GIT_MODE_FILE, index.FAKE_SHA)
 
     total = 0
@@ -107,6 +107,7 @@ def update_index(top, excluded_paths, exclude_rxs, xdev_exceptions):
             paths_per_sec = total / elapsed if elapsed else 0
             qprogress('Indexing: %d (%d paths/s)\r' % (total, paths_per_sec))
         total += 1
+
         while rig.cur and rig.cur.name > path:  # deleted paths
             if rig.cur.exists():
                 rig.cur.set_deleted()
@@ -114,40 +115,47 @@ def update_index(top, excluded_paths, exclude_rxs, xdev_exceptions):
                 if rig.cur.nlink > 1 and not stat.S_ISDIR(rig.cur.mode):
                     hlinks.del_path(rig.cur.name)
             rig.next()
+
         if rig.cur and rig.cur.name == path:    # paths that already existed
-            try:
-                meta = metadata.from_path(path, statinfo=pst)
-            except (OSError, IOError) as e:
-                add_error(e)
-                rig.next()
-                continue
-            if not stat.S_ISDIR(rig.cur.mode) and rig.cur.nlink > 1:
-                hlinks.del_path(rig.cur.name)
-            if not stat.S_ISDIR(pst.st_mode) and pst.st_nlink > 1:
-                hlinks.add_path(path, pst.st_dev, pst.st_ino)
-            # Clear these so they don't bloat the store -- they're
-            # already in the index (since they vary a lot and they're
-            # fixed length).  If you've noticed "tmax", you might
-            # wonder why it's OK to do this, since that code may
-            # adjust (mangle) the index mtime and ctime -- producing
-            # fake values which must not end up in a .bupm.  However,
-            # it looks like that shouldn't be possible:  (1) When
-            # "save" validates the index entry, it always reads the
-            # metadata from the filesytem. (2) Metadata is only
-            # read/used from the index if hashvalid is true. (3) index
-            # always invalidates "faked" entries, because "old != new"
-            # in from_stat().
-            meta.ctime = meta.mtime = meta.atime = 0
-            meta_ofs = msw.store(meta)
-            rig.cur.from_stat(pst, meta_ofs, tstart,
-                              check_device=opt.check_device)
+            need_repack = False
+            if(rig.cur.stale(pst, tstart, check_device=opt.check_device)):
+                try:
+                    meta = metadata.from_path(path, statinfo=pst)
+                except (OSError, IOError) as e:
+                    add_error(e)
+                    rig.next()
+                    continue
+                if not stat.S_ISDIR(rig.cur.mode) and rig.cur.nlink > 1:
+                    hlinks.del_path(rig.cur.name)
+                if not stat.S_ISDIR(pst.st_mode) and pst.st_nlink > 1:
+                    hlinks.add_path(path, pst.st_dev, pst.st_ino)
+                # Clear these so they don't bloat the store -- they're
+                # already in the index (since they vary a lot and they're
+                # fixed length).  If you've noticed "tmax", you might
+                # wonder why it's OK to do this, since that code may
+                # adjust (mangle) the index mtime and ctime -- producing
+                # fake values which must not end up in a .bupm.  However,
+                # it looks like that shouldn't be possible:  (1) When
+                # "save" validates the index entry, it always reads the
+                # metadata from the filesystem. (2) Metadata is only
+                # read/used from the index if hashvalid is true. (3)
+                # "faked" entries will be stale(), and so we'll invalidate
+                # them below.
+                meta.ctime = meta.mtime = meta.atime = 0
+                meta_ofs = msw.store(meta)
+                rig.cur.update_from_stat(pst, meta_ofs)
+                rig.cur.invalidate()
+                need_repack = True
             if not (rig.cur.flags & index.IX_HASHVALID):
-                if hashgen:
-                    (rig.cur.gitmode, rig.cur.sha) = hashgen(path)
+                if fake_hash:
+                    rig.cur.gitmode, rig.cur.sha = fake_hash(path)
                     rig.cur.flags |= index.IX_HASHVALID
+                    need_repack = True
             if opt.fake_invalid:
                 rig.cur.invalidate()
-            rig.cur.repack()
+                need_repack = True
+            if need_repack:
+                rig.cur.repack()
             rig.next()
         else:  # new paths
             try:
@@ -158,7 +166,7 @@ def update_index(top, excluded_paths, exclude_rxs, xdev_exceptions):
             # See same assignment to 0, above, for rationale.
             meta.atime = meta.mtime = meta.ctime = 0
             meta_ofs = msw.store(meta)
-            wi.add(path, pst, meta_ofs, hashgen = hashgen)
+            wi.add(path, pst, meta_ofs, hashgen=fake_hash)
             if not stat.S_ISDIR(pst.st_mode) and pst.st_nlink > 1:
                 hlinks.add_path(path, pst.st_dev, pst.st_ino)
 
index a570784da81c5c2a7e6b30cb2922fe30975d40e7..5f6c366957cb93d0b98e90dd411c4b09bd890f08 100644 (file)
@@ -195,13 +195,34 @@ class Entry:
             log('pack error: %s (%r)\n' % (e, self))
             raise
 
-    def from_stat(self, st, meta_ofs, tstart, check_device=True):
-        old = (self.dev if check_device else 0,
-               self.ino, self.nlink, self.ctime, self.mtime,
-               self.size, self.flags & IX_EXISTS)
-        new = (st.st_dev if check_device else 0,
-               st.st_ino, st.st_nlink, st.st_ctime, st.st_mtime,
-               st.st_size, IX_EXISTS)
+    def stale(self, st, tstart, check_device=True):
+        if self.size != st.st_size:
+            return True
+        if self.mtime != st.st_mtime:
+            return True
+        if self.sha == EMPTY_SHA:
+            return True
+        if not self.gitmode:
+            return True
+        if self.ctime != st.st_ctime:
+            return True
+        if self.ino != st.st_ino:
+            return True
+        if self.nlink != st.st_nlink:
+            return True
+        if not (self.flags & IX_EXISTS):
+            return True
+        if check_device and (self.dev != st.st_dev):
+            return True
+        # Check that the ctime's "second" is at or after tstart's.
+        ctime_sec_in_ns = xstat.fstime_floor_secs(st.st_ctime) * 10**9
+        if ctime_sec_in_ns >= tstart:
+            return True
+        return False
+
+    def update_from_stat(self, st, meta_ofs):
+        # Should only be called when the entry is stale(), and
+        # invalidate() should almost certainly be called afterward.
         self.dev = st.st_dev
         self.ino = st.st_ino
         self.nlink = st.st_nlink
@@ -212,13 +233,8 @@ class Entry:
         self.mode = st.st_mode
         self.flags |= IX_EXISTS
         self.meta_ofs = meta_ofs
-        # Check that the ctime's "second" is at or after tstart's.
-        ctime_sec_in_ns = xstat.fstime_floor_secs(st.st_ctime) * 10**9
-        if ctime_sec_in_ns >= tstart or old != new \
-              or self.sha == EMPTY_SHA or not self.gitmode:
-            self.invalidate()
         self._fixup()
-        
+
     def _fixup(self):
         self.mtime = self._fixup_time(self.mtime)
         self.ctime = self._fixup_time(self.ctime)
index c597b96aa2364c6cc0073ee3b421790c37f47a38..6639e0b939fcd5f7584c15efaebf42055d6b4353 100644 (file)
@@ -77,19 +77,16 @@ def index_negative_timestamps():
             # Dec 31, 1969
             os.utime(foopath, (-86400, -86400))
             ns_per_sec = 10**9
-            tstart = time.time() * ns_per_sec
-            tmax = tstart - ns_per_sec
+            tmax = (time.time() - 1) * ns_per_sec
             e = index.BlankNewEntry(foopath, 0, tmax)
-            e.from_stat(xstat.stat(foopath), 0, tstart)
-            assert len(e.packed())
-            WVPASS()
+            e.update_from_stat(xstat.stat(foopath), 0)
+            WVPASS(e.packed())
 
             # Jun 10, 1893
             os.utime(foopath, (-0x80000000, -0x80000000))
             e = index.BlankNewEntry(foopath, 0, tmax)
-            e.from_stat(xstat.stat(foopath), 0, tstart)
-            assert len(e.packed())
-            WVPASS()
+            e.update_from_stat(xstat.stat(foopath), 0)
+            WVPASS(e.packed())
 
 
 @wvtest