X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=lib%2Fbup%2Findex.py;h=d6d44accb7eb0b320526283cd174513d3f1a84ed;hb=c40b3dd5fd74e72024fbaad3daf5a958aefa1c54;hp=bc057f90ec5940e28bcd1908b0403da7cdf0df6a;hpb=dc35412b085d292f4d0eff95be1563aa64cc872d;p=bup.git diff --git a/lib/bup/index.py b/lib/bup/index.py index bc057f9..d6d44ac 100644 --- a/lib/bup/index.py +++ b/lib/bup/index.py @@ -1,10 +1,40 @@ -import os, stat, time, struct, tempfile -from bup.helpers import * + +from __future__ import absolute_import +import errno, os, stat, struct, tempfile + +from bup import metadata, xstat +from bup._helpers import UINT_MAX, bytescmp +from bup.helpers import (add_error, log, merge_iter, mmap_readwrite, + progress, qprogress, resolve_parent, slashappend) EMPTY_SHA = '\0'*20 FAKE_SHA = '\x01'*20 -INDEX_HDR = 'BUPI\0\0\0\2' -INDEX_SIG = '!IIIIIQII20sHII' + +INDEX_HDR = 'BUPI\0\0\0\7' + +# Time values are handled as integer nanoseconds since the epoch in +# memory, but are written as xstat/metadata timespecs. This behavior +# matches the existing metadata/xstat/.bupm code. + +# Record times (mtime, ctime, atime) as xstat/metadata timespecs, and +# store all of the times in the index so they won't interfere with the +# forthcoming metadata cache. +INDEX_SIG = ('!' + 'Q' # dev + 'Q' # ino + 'Q' # nlink + 'qQ' # ctime_s, ctime_ns + 'qQ' # mtime_s, mtime_ns + 'qQ' # atime_s, atime_ns + 'Q' # size + 'I' # mode + 'I' # gitmode + '20s' # sha + 'H' # flags + 'Q' # children_ofs + 'I' # children_n + 'Q') # meta_ofs + ENTLEN = struct.calcsize(INDEX_SIG) FOOTER_SIG = '!Q' FOOTLEN = struct.calcsize(FOOTER_SIG) @@ -17,6 +47,74 @@ class Error(Exception): pass +class MetaStoreReader: + def __init__(self, filename): + self._file = None + self._file = open(filename, 'rb') + + def close(self): + if self._file: + self._file.close() + self._file = None + + def __del__(self): + self.close() + + def metadata_at(self, ofs): + self._file.seek(ofs) + return metadata.Metadata.read(self._file) + + +class MetaStoreWriter: + # For now, we just append to the file, and try to handle any + # truncation or corruption somewhat sensibly. + + def __init__(self, filename): + # Map metadata hashes to bupindex.meta offsets. + self._offsets = {} + self._filename = filename + self._file = None + # FIXME: see how slow this is; does it matter? + m_file = open(filename, 'ab+') + try: + m_file.seek(0) + try: + m_off = m_file.tell() + m = metadata.Metadata.read(m_file) + while m: + m_encoded = m.encode() + self._offsets[m_encoded] = m_off + m_off = m_file.tell() + m = metadata.Metadata.read(m_file) + except EOFError: + pass + except: + log('index metadata in %r appears to be corrupt' % filename) + raise + finally: + m_file.close() + self._file = open(filename, 'ab') + + def close(self): + if self._file: + self._file.close() + self._file = None + + def __del__(self): + # Be optimistic. 
+ self.close() + + def store(self, metadata): + meta_encoded = metadata.encode(include_path=False) + ofs = self._offsets.get(meta_encoded) + if ofs: + return ofs + ofs = self._file.tell() + self._file.write(meta_encoded) + self._offsets[meta_encoded] = ofs + return ofs + + class Level: def __init__(self, ename, parent): self.parent = parent @@ -37,11 +135,12 @@ class Level: return (ofs,n) -def _golevel(level, f, ename, newentry): +def _golevel(level, f, ename, newentry, metastore, tmax): # close nodes back up the tree assert(level) + default_meta_ofs = metastore.store(metadata.Metadata()) while ename[:len(level.ename)] != level.ename: - n = BlankNewEntry(level.ename[-1]) + n = BlankNewEntry(level.ename[-1], default_meta_ofs, tmax) n.flags |= IX_EXISTS (n.children_ofs,n.children_n) = level.write(f) level.parent.list.append(n) @@ -53,7 +152,8 @@ def _golevel(level, f, ename, newentry): # are we in precisely the right place? assert(ename == level.ename) - n = newentry or BlankNewEntry(ename and level.ename[-1] or None) + n = newentry or \ + BlankNewEntry(ename and level.ename[-1] or None, default_meta_ofs, tmax) (n.children_ofs,n.children_n) = level.write(f) if level.parent: level.parent.list.append(n) @@ -63,42 +163,89 @@ def _golevel(level, f, ename, newentry): class Entry: - def __init__(self, basename, name): + def __init__(self, basename, name, meta_ofs, tmax): self.basename = str(basename) self.name = str(name) + self.meta_ofs = meta_ofs + self.tmax = tmax self.children_ofs = 0 self.children_n = 0 def __repr__(self): - return ("(%s,0x%04x,%d,%d,%d,%d,%d,%s/%s,0x%04x,0x%08x/%d)" - % (self.name, self.dev, - self.ctime, self.mtime, self.uid, self.gid, - self.size, oct(self.mode), oct(self.gitmode), - self.flags, self.children_ofs, self.children_n)) + return ("(%s,0x%04x,%d,%d,%d,%d,%d,%d,%s/%s,0x%04x,%d,0x%08x/%d)" + % (self.name, self.dev, self.ino, self.nlink, + self.ctime, self.mtime, self.atime, + self.size, self.mode, self.gitmode, + self.flags, self.meta_ofs, + self.children_ofs, self.children_n)) def packed(self): - return struct.pack(INDEX_SIG, - self.dev, self.ctime, self.mtime, - self.uid, self.gid, self.size, self.mode, - self.gitmode, self.sha, self.flags, - self.children_ofs, self.children_n) - - def from_stat(self, st, tstart): - old = (self.dev, self.ctime, self.mtime, - self.uid, self.gid, self.size, self.flags & IX_EXISTS) - new = (st.st_dev, int(st.st_ctime), int(st.st_mtime), - st.st_uid, st.st_gid, st.st_size, IX_EXISTS) + try: + ctime = xstat.nsecs_to_timespec(self.ctime) + mtime = xstat.nsecs_to_timespec(self.mtime) + atime = xstat.nsecs_to_timespec(self.atime) + return struct.pack(INDEX_SIG, + self.dev, self.ino, self.nlink, + ctime[0], ctime[1], + mtime[0], mtime[1], + atime[0], atime[1], + self.size, self.mode, + self.gitmode, self.sha, self.flags, + self.children_ofs, self.children_n, + self.meta_ofs) + except (DeprecationWarning, struct.error) as e: + log('pack error: %s (%r)\n' % (e, self)) + raise + + def stale(self, st, tstart, check_device=True): + if self.size != st.st_size: + return True + if self.mtime != st.st_mtime: + return True + if self.sha == EMPTY_SHA: + return True + if not self.gitmode: + return True + if self.ctime != st.st_ctime: + return True + if self.ino != st.st_ino: + return True + if self.nlink != st.st_nlink: + return True + if not (self.flags & IX_EXISTS): + return True + if check_device and (self.dev != st.st_dev): + return True + # Check that the ctime's "second" is at or after tstart's. 
+ ctime_sec_in_ns = xstat.fstime_floor_secs(st.st_ctime) * 10**9 + if ctime_sec_in_ns >= tstart: + return True + return False + + def update_from_stat(self, st, meta_ofs): + # Should only be called when the entry is stale(), and + # invalidate() should almost certainly be called afterward. self.dev = st.st_dev - self.ctime = int(st.st_ctime) - self.mtime = int(st.st_mtime) - self.uid = st.st_uid - self.gid = st.st_gid + self.ino = st.st_ino + self.nlink = st.st_nlink + self.ctime = st.st_ctime + self.mtime = st.st_mtime + self.atime = st.st_atime self.size = st.st_size self.mode = st.st_mode self.flags |= IX_EXISTS - if int(st.st_ctime) >= tstart or old != new \ - or self.sha == EMPTY_SHA or not self.gitmode: - self.invalidate() + self.meta_ofs = meta_ofs + self._fixup() + + def _fixup(self): + self.mtime = self._fixup_time(self.mtime) + self.ctime = self._fixup_time(self.ctime) + + def _fixup_time(self, t): + if self.tmax != None and t > self.tmax: + return self.tmax + else: + return t def is_valid(self): f = IX_HASHVALID|IX_EXISTS @@ -110,6 +257,7 @@ class Entry: def validate(self, gitmode, sha): assert(sha) assert(gitmode) + assert(gitmode+0 == gitmode) self.gitmode = gitmode self.sha = sha self.flags |= IX_HASHVALID|IX_EXISTS @@ -133,43 +281,76 @@ class Entry: def is_fake(self): return not self.ctime - def __cmp__(a, b): - return (cmp(a.name, b.name) - or -cmp(a.is_valid(), b.is_valid()) - or -cmp(a.is_fake(), b.is_fake())) + def _cmp(self, other): + # Note reversed name ordering + bc = bytescmp(other.name, self.name) + if bc != 0: + return bc + vc = self.is_valid() - other.is_valid() + if vc != 0: + return vc + fc = self.is_fake() - other.is_fake() + if fc != 0: + return fc + return 0 + + def __eq__(self, other): + return self._cmp(other) == 0 + + def __ne__(): + return self._cmp(other) != 0 + + def __lt__(self, other): + return self._cmp(other) < 0 + + def __gt__(self, other): + return self._cmp(other) > 0 + + def __le__(): + return self._cmp(other) <= 0 + + def __ge__(): + return self._cmp(other) >= 0 def write(self, f): f.write(self.basename + '\0' + self.packed()) class NewEntry(Entry): - def __init__(self, basename, name, dev, ctime, mtime, uid, gid, - size, mode, gitmode, sha, flags, children_ofs, children_n): - Entry.__init__(self, basename, name) - (self.dev, self.ctime, self.mtime, self.uid, self.gid, + def __init__(self, basename, name, tmax, dev, ino, nlink, + ctime, mtime, atime, + size, mode, gitmode, sha, flags, meta_ofs, + children_ofs, children_n): + Entry.__init__(self, basename, name, meta_ofs, tmax) + (self.dev, self.ino, self.nlink, self.ctime, self.mtime, self.atime, self.size, self.mode, self.gitmode, self.sha, self.flags, self.children_ofs, self.children_n - ) = (dev, int(ctime), int(mtime), uid, gid, + ) = (dev, ino, nlink, ctime, mtime, atime, size, mode, gitmode, sha, flags, children_ofs, children_n) + self._fixup() class BlankNewEntry(NewEntry): - def __init__(self, basename): - NewEntry.__init__(self, basename, basename, - 0, 0, 0, 0, 0, 0, 0, - 0, EMPTY_SHA, 0, 0, 0) + def __init__(self, basename, meta_ofs, tmax): + NewEntry.__init__(self, basename, basename, tmax, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, EMPTY_SHA, 0, meta_ofs, 0, 0) class ExistingEntry(Entry): def __init__(self, parent, basename, name, m, ofs): - Entry.__init__(self, basename, name) + Entry.__init__(self, basename, name, None, None) self.parent = parent self._m = m self._ofs = ofs - (self.dev, self.ctime, self.mtime, self.uid, self.gid, + (self.dev, self.ino, self.nlink, + self.ctime, ctime_ns, 
self.mtime, mtime_ns, self.atime, atime_ns, self.size, self.mode, self.gitmode, self.sha, - self.flags, self.children_ofs, self.children_n + self.flags, self.children_ofs, self.children_n, self.meta_ofs ) = struct.unpack(INDEX_SIG, str(buffer(m, ofs, ENTLEN))) + self.atime = xstat.timespec_to_nsecs((self.atime, atime_ns)) + self.mtime = xstat.timespec_to_nsecs((self.mtime, mtime_ns)) + self.ctime = xstat.timespec_to_nsecs((self.ctime, ctime_ns)) # effectively, we don't bother messing with IX_SHAMISSING if # not IX_HASHVALID, since it's redundant, and repacking is more @@ -201,7 +382,7 @@ class ExistingEntry(Entry): dname += '/' ofs = self.children_ofs assert(ofs <= len(self._m)) - assert(self.children_n < 1000000) + assert(self.children_n <= UINT_MAX) # i.e. python struct 'I' for i in xrange(self.children_n): eon = self._m.find('\0', ofs) assert(eon >= 0) @@ -233,7 +414,7 @@ class Reader: f = None try: f = open(filename, 'r+') - except IOError, e: + except IOError as e: if e.errno == errno.ENOENT: pass else: @@ -241,7 +422,7 @@ class Reader: if f: b = f.read(len(INDEX_HDR)) if b != INDEX_HDR: - log('warning: %s: header: expected %r, got %r' + log('warning: %s: header: expected %r, got %r\n' % (filename, INDEX_HDR, b)) else: st = os.fstat(f.fileno()) @@ -283,6 +464,11 @@ class Reader: def __iter__(self): return self.iter() + def find(self, name): + return next((e for e in self.iter(name, wantrecurse=lambda x : True) + if e.name == name), + None) + def exists(self): return self.m @@ -299,11 +485,20 @@ class Reader: def filter(self, prefixes, wantrecurse=None): for (rp, path) in reduce_paths(prefixes): + any_entries = False for e in self.iter(rp, wantrecurse=wantrecurse): + any_entries = True assert(e.name.startswith(rp)) name = path + e.name[len(rp):] yield (name, e) - + if not any_entries: + # Always return at least the top for each prefix. + # Otherwise something like "save x/y" will produce + # nothing if x is up to date. + pe = self.find(rp) + assert(pe) + name = path + pe.name[len(rp):] + yield (name, pe) # FIXME: this function isn't very generic, because it splits the filename # in an odd way and depends on a terminating '/' to indicate directories. 
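
Aside on the widened INDEX_SIG earlier in this diff: each of ctime/mtime/atime is now written as a signed-seconds / unsigned-nanoseconds pair on disk, while Entry keeps a single integer nanosecond value in memory. The sketch below is not bup code; it only illustrates that round trip, assuming the usual floor/modulo split that xstat.nsecs_to_timespec and timespec_to_nsecs are expected to perform, and uses made-up sample values.

    import struct

    INDEX_SIG = '!QQQqQqQqQQII20sHQIQ'      # same field order as the new format above
    ENTLEN = struct.calcsize(INDEX_SIG)

    def to_timespec(ns):
        # assumed equivalent of xstat.nsecs_to_timespec for non-negative times
        return ns // 10**9, ns % 10**9

    def from_timespec(s, ns):
        # assumed equivalent of xstat.timespec_to_nsecs
        return s * 10**9 + ns

    mtime = 1418163273123456789             # example timestamp in nanoseconds
    ct = mt = at = to_timespec(mtime)
    rec = struct.pack(INDEX_SIG,
                      2049, 12345, 1,       # dev, ino, nlink
                      ct[0], ct[1], mt[0], mt[1], at[0], at[1],
                      4096, 0o100644, 0o100644,   # size, mode, gitmode
                      b'\0' * 20, 0,        # sha (EMPTY_SHA), flags
                      0, 0, 0)              # children_ofs, children_n, meta_ofs
    assert len(rec) == ENTLEN
    fields = struct.unpack(INDEX_SIG, rec)
    assert from_timespec(fields[5], fields[6]) == mtime   # mtime_s, mtime_ns
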
@@ -317,13 +512,15 @@ def pathsplit(p): class Writer: - def __init__(self, filename): + def __init__(self, filename, metastore, tmax): self.rootlevel = self.level = Level([], None) self.f = None self.count = 0 self.lastfile = None self.filename = None - self.filename = filename = realpath(filename) + self.filename = filename = resolve_parent(filename) + self.metastore = metastore + self.tmax = tmax (dir,name) = os.path.split(filename) (ffd,self.tmpname) = tempfile.mkstemp('.tmp', filename, dir) self.f = os.fdopen(ffd, 'wb', 65536) @@ -341,7 +538,8 @@ class Writer: def flush(self): if self.level: - self.level = _golevel(self.level, self.f, [], None) + self.level = _golevel(self.level, self.f, [], None, + self.metastore, self.tmax) self.count = self.rootlevel.count if self.count: self.count += 1 @@ -360,11 +558,12 @@ class Writer: def _add(self, ename, entry): if self.lastfile and self.lastfile <= ename: raise Error('%r must come before %r' - % (''.join(e.name), ''.join(self.lastfile))) - self.lastfile = e.name - self.level = _golevel(self.level, self.f, ename, entry) + % (''.join(ename), ''.join(self.lastfile))) + self.lastfile = ename + self.level = _golevel(self.level, self.f, ename, entry, + self.metastore, self.tmax) - def add(self, name, st, hashgen = None): + def add(self, name, st, meta_ofs, hashgen = None): endswith = name.endswith('/') ename = pathsplit(name) basename = ename[-1] @@ -379,13 +578,15 @@ class Writer: if st: isdir = stat.S_ISDIR(st.st_mode) assert(isdir == endswith) - e = NewEntry(basename, name, st.st_dev, int(st.st_ctime), - int(st.st_mtime), st.st_uid, st.st_gid, + e = NewEntry(basename, name, self.tmax, + st.st_dev, st.st_ino, st.st_nlink, + st.st_ctime, st.st_mtime, st.st_atime, st.st_size, st.st_mode, gitmode, sha, flags, - 0, 0) + meta_ofs, 0, 0) else: assert(endswith) - e = BlankNewEntry(basename) + meta_ofs = self.metastore.store(metadata.Metadata()) + e = BlankNewEntry(basename, meta_ofs, self.tmax) e.gitmode = gitmode e.sha = sha e.flags = flags @@ -400,15 +601,34 @@ class Writer: return Reader(self.tmpname) +def _slashappend_or_add_error(p, caller): + """Return p, after ensuring it has a single trailing slash if it names + a directory, unless there's an OSError, in which case, call + add_error() and return None.""" + try: + st = os.lstat(p) + except OSError as e: + add_error('%s: %s' % (caller, e)) + return None + else: + if stat.S_ISDIR(st.st_mode): + return slashappend(p) + return p + + +def unique_resolved_paths(paths): + "Return a collection of unique resolved paths." + rps = (_slashappend_or_add_error(resolve_parent(p), 'unique_resolved_paths') + for p in paths) + return frozenset((x for x in rps if x is not None)) + + def reduce_paths(paths): xpaths = [] for p in paths: - rp = realpath(p) - st = os.lstat(rp) - if stat.S_ISDIR(st.st_mode): - rp = slashappend(rp) - p = slashappend(p) - xpaths.append((rp, p)) + rp = _slashappend_or_add_error(resolve_parent(p), 'reduce_paths') + if rp: + xpaths.append((rp, slashappend(p) if rp.endswith('/') else p)) xpaths.sort() paths = [] @@ -423,35 +643,9 @@ def reduce_paths(paths): return paths -class MergeIter: - def __init__(self, iters): - self.iters = iters - - def __len__(self): - # FIXME: doesn't remove duplicated entries between iters. - # That only happens for parent directories, but will mean the - # actual iteration returns fewer entries than this function counts. 
- return sum(len(it) for it in self.iters) - - def __iter__(self): - total = len(self) - l = [iter(it) for it in self.iters] - l = [(next(it),it) for it in l] - l = filter(lambda x: x[0], l) - count = 0 - lastname = None - while l: - if not (count % 1024): - progress('bup: merging indexes (%d/%d)\r' % (count, total)) - l.sort() - (e,it) = l.pop() - if not e: - continue - if e.name != lastname: - yield e - lastname = e.name - n = next(it) - if n: - l.append((n,it)) - count += 1 - log('bup: merging indexes (%d/%d), done.\n' % (count, total)) +def merge(*iters): + def pfunc(count, total): + qprogress('bup: merging indexes (%d/%d)\r' % (count, total)) + def pfinal(count, total): + progress('bup: merging indexes (%d/%d), done.\n' % (count, total)) + return merge_iter(iters, 1024, pfunc, pfinal, key='name')
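
The last hunk replaces the hand-rolled MergeIter class with a thin merge() wrapper around helpers.merge_iter (imported at the top of the file). As a rough reference only, the sketch below mirrors what the removed MergeIter did and what merge() now delegates: repeatedly take the head entry that compares greatest under Entry's reversed-name ordering, drop duplicates by the given key (parent directories can appear in several indexes), and report progress every pfreq entries. It is an assumption-level stand-in, not a quote of helpers.merge_iter.

    def merge_sketch(iters, pfreq, pfunc, pfinal, key):
        # Hypothetical stand-in for helpers.merge_iter; iters are index Readers
        # or other sorted iterables of Entry objects.
        total = sum(len(it) for it in iters)   # upper bound; duplicates not excluded
        heads = [(next(i, None), i) for i in [iter(it) for it in iters]]
        heads = [h for h in heads if h[0] is not None]
        count = 0
        last = None
        while heads:
            if not (count % pfreq):
                pfunc(count, total)
            heads.sort(key=lambda head: head[0])   # greatest Entry ends up last
            e, it = heads.pop()
            k = getattr(e, key)
            if k != last:                          # emit each name only once
                yield e
                last = k
            n = next(it, None)
            if n is not None:
                heads.append((n, it))
            count += 1
        pfinal(count, total)

Given two Reader objects r1 and r2 over the same tree, merge(r1, r2) above (or merge_sketch([r1, r2], 1024, pfunc, pfinal, key='name')) yields each index entry once, with the progress callbacks driving the "bup: merging indexes" messages.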