From: Rob Browning Date: Sun, 25 Dec 2016 17:44:56 +0000 (-0600) Subject: index: only collect metadata for stale paths X-Git-Tag: 0.29-rc2~3 X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?p=bup.git;a=commitdiff_plain;h=7d8f74a4f0ed85a1827ec02a9f39b336093b91dc index: only collect metadata for stale paths Stop collecting all of the metadata for every path, even if the path hasn't changed since the last save. Further, rework the code to short-circuit some other unnecessary work. To support this, split Entry.from_stat() into two parts, stale(), which tests to see if an entry has "materially" changed, and update_from_stat() which updates the entry to match the stat information provided. This should substantially decrease the indexing cost for paths that haven't changed since the last save. While we're here, rename hashgen to fake_hash so it's clearer that's its only purpose. Signed-off-by: Rob Browning Tested-by: Rob Browning --- diff --git a/cmd/index-cmd.py b/cmd/index-cmd.py index ab97e92..bb21b4e 100755 --- a/cmd/index-cmd.py +++ b/cmd/index-cmd.py @@ -82,9 +82,9 @@ def update_index(top, excluded_paths, exclude_rxs, xdev_exceptions): hlinks = hlinkdb.HLinkDB(indexfile + '.hlink') - hashgen = None + fake_hash = None if opt.fake_valid: - def hashgen(name): + def fake_hash(name): return (GIT_MODE_FILE, index.FAKE_SHA) total = 0 @@ -107,6 +107,7 @@ def update_index(top, excluded_paths, exclude_rxs, xdev_exceptions): paths_per_sec = total / elapsed if elapsed else 0 qprogress('Indexing: %d (%d paths/s)\r' % (total, paths_per_sec)) total += 1 + while rig.cur and rig.cur.name > path: # deleted paths if rig.cur.exists(): rig.cur.set_deleted() @@ -114,40 +115,47 @@ def update_index(top, excluded_paths, exclude_rxs, xdev_exceptions): if rig.cur.nlink > 1 and not stat.S_ISDIR(rig.cur.mode): hlinks.del_path(rig.cur.name) rig.next() + if rig.cur and rig.cur.name == path: # paths that already existed - try: - meta = metadata.from_path(path, statinfo=pst) - except (OSError, IOError) as e: - add_error(e) - rig.next() - continue - if not stat.S_ISDIR(rig.cur.mode) and rig.cur.nlink > 1: - hlinks.del_path(rig.cur.name) - if not stat.S_ISDIR(pst.st_mode) and pst.st_nlink > 1: - hlinks.add_path(path, pst.st_dev, pst.st_ino) - # Clear these so they don't bloat the store -- they're - # already in the index (since they vary a lot and they're - # fixed length). If you've noticed "tmax", you might - # wonder why it's OK to do this, since that code may - # adjust (mangle) the index mtime and ctime -- producing - # fake values which must not end up in a .bupm. However, - # it looks like that shouldn't be possible: (1) When - # "save" validates the index entry, it always reads the - # metadata from the filesytem. (2) Metadata is only - # read/used from the index if hashvalid is true. (3) index - # always invalidates "faked" entries, because "old != new" - # in from_stat(). - meta.ctime = meta.mtime = meta.atime = 0 - meta_ofs = msw.store(meta) - rig.cur.from_stat(pst, meta_ofs, tstart, - check_device=opt.check_device) + need_repack = False + if(rig.cur.stale(pst, tstart, check_device=opt.check_device)): + try: + meta = metadata.from_path(path, statinfo=pst) + except (OSError, IOError) as e: + add_error(e) + rig.next() + continue + if not stat.S_ISDIR(rig.cur.mode) and rig.cur.nlink > 1: + hlinks.del_path(rig.cur.name) + if not stat.S_ISDIR(pst.st_mode) and pst.st_nlink > 1: + hlinks.add_path(path, pst.st_dev, pst.st_ino) + # Clear these so they don't bloat the store -- they're + # already in the index (since they vary a lot and they're + # fixed length). If you've noticed "tmax", you might + # wonder why it's OK to do this, since that code may + # adjust (mangle) the index mtime and ctime -- producing + # fake values which must not end up in a .bupm. However, + # it looks like that shouldn't be possible: (1) When + # "save" validates the index entry, it always reads the + # metadata from the filesytem. (2) Metadata is only + # read/used from the index if hashvalid is true. (3) + # "faked" entries will be stale(), and so we'll invalidate + # them below. + meta.ctime = meta.mtime = meta.atime = 0 + meta_ofs = msw.store(meta) + rig.cur.update_from_stat(pst, meta_ofs) + rig.cur.invalidate() + need_repack = True if not (rig.cur.flags & index.IX_HASHVALID): - if hashgen: - (rig.cur.gitmode, rig.cur.sha) = hashgen(path) + if fake_hash: + rig.cur.gitmode, rig.cur.sha = fake_hash(path) rig.cur.flags |= index.IX_HASHVALID + need_repack = True if opt.fake_invalid: rig.cur.invalidate() - rig.cur.repack() + need_repack = True + if need_repack: + rig.cur.repack() rig.next() else: # new paths try: @@ -158,7 +166,7 @@ def update_index(top, excluded_paths, exclude_rxs, xdev_exceptions): # See same assignment to 0, above, for rationale. meta.atime = meta.mtime = meta.ctime = 0 meta_ofs = msw.store(meta) - wi.add(path, pst, meta_ofs, hashgen = hashgen) + wi.add(path, pst, meta_ofs, hashgen=fake_hash) if not stat.S_ISDIR(pst.st_mode) and pst.st_nlink > 1: hlinks.add_path(path, pst.st_dev, pst.st_ino) diff --git a/lib/bup/index.py b/lib/bup/index.py index a570784..5f6c366 100644 --- a/lib/bup/index.py +++ b/lib/bup/index.py @@ -195,13 +195,34 @@ class Entry: log('pack error: %s (%r)\n' % (e, self)) raise - def from_stat(self, st, meta_ofs, tstart, check_device=True): - old = (self.dev if check_device else 0, - self.ino, self.nlink, self.ctime, self.mtime, - self.size, self.flags & IX_EXISTS) - new = (st.st_dev if check_device else 0, - st.st_ino, st.st_nlink, st.st_ctime, st.st_mtime, - st.st_size, IX_EXISTS) + def stale(self, st, tstart, check_device=True): + if self.size != st.st_size: + return True + if self.mtime != st.st_mtime: + return True + if self.sha == EMPTY_SHA: + return True + if not self.gitmode: + return True + if self.ctime != st.st_ctime: + return True + if self.ino != st.st_ino: + return True + if self.nlink != st.st_nlink: + return True + if not (self.flags & IX_EXISTS): + return True + if check_device and (self.dev != st.st_dev): + return True + # Check that the ctime's "second" is at or after tstart's. + ctime_sec_in_ns = xstat.fstime_floor_secs(st.st_ctime) * 10**9 + if ctime_sec_in_ns >= tstart: + return True + return False + + def update_from_stat(self, st, meta_ofs): + # Should only be called when the entry is stale(), and + # invalidate() should almost certainly be called afterward. self.dev = st.st_dev self.ino = st.st_ino self.nlink = st.st_nlink @@ -212,13 +233,8 @@ class Entry: self.mode = st.st_mode self.flags |= IX_EXISTS self.meta_ofs = meta_ofs - # Check that the ctime's "second" is at or after tstart's. - ctime_sec_in_ns = xstat.fstime_floor_secs(st.st_ctime) * 10**9 - if ctime_sec_in_ns >= tstart or old != new \ - or self.sha == EMPTY_SHA or not self.gitmode: - self.invalidate() self._fixup() - + def _fixup(self): self.mtime = self._fixup_time(self.mtime) self.ctime = self._fixup_time(self.ctime) diff --git a/lib/bup/t/tindex.py b/lib/bup/t/tindex.py index c597b96..6639e0b 100644 --- a/lib/bup/t/tindex.py +++ b/lib/bup/t/tindex.py @@ -77,19 +77,16 @@ def index_negative_timestamps(): # Dec 31, 1969 os.utime(foopath, (-86400, -86400)) ns_per_sec = 10**9 - tstart = time.time() * ns_per_sec - tmax = tstart - ns_per_sec + tmax = (time.time() - 1) * ns_per_sec e = index.BlankNewEntry(foopath, 0, tmax) - e.from_stat(xstat.stat(foopath), 0, tstart) - assert len(e.packed()) - WVPASS() + e.update_from_stat(xstat.stat(foopath), 0) + WVPASS(e.packed()) # Jun 10, 1893 os.utime(foopath, (-0x80000000, -0x80000000)) e = index.BlankNewEntry(foopath, 0, tmax) - e.from_stat(xstat.stat(foopath), 0, tstart) - assert len(e.packed()) - WVPASS() + e.update_from_stat(xstat.stat(foopath), 0) + WVPASS(e.packed()) @wvtest