From: Avery Pennarun Date: Fri, 12 Mar 2010 22:18:20 +0000 (-0500) Subject: hashsplit: use posix_fadvise(DONTNEED) when available. X-Git-Tag: bup-0.12a~3 X-Git-Url: https://arthur.barton.de/gitweb/?a=commitdiff_plain;h=dbc9e3ec44b765860e7769d2084c609dfa246230;p=bup.git hashsplit: use posix_fadvise(DONTNEED) when available. When reading through large disk images to back them up, we'll only end up reading the data once, but it still takes up space in the kernel's disk cache. If you're backing up a whole disk full of data, that's bad news for anything else running on your system, which will rapidly have its own cached data evicted to make room for data bup will never look at again. The posix_fadvise() call actually lets us tell the kernel we won't be using this data anymore, thus greatly reducing our hit on the disk cache. Theoretically it improves things, anyway. I haven't been able to come up with a really scientific way to test it, since of course *bup's* performance is expected to be the same either way (we're only throwing away data we're done using). It really does throw things out of cache, though, so the rest follows logically at least. --- diff --git a/lib/bup/_hashsplit.c b/lib/bup/_hashsplit.c index c8a3cde..937b3ad 100644 --- a/lib/bup/_hashsplit.c +++ b/lib/bup/_hashsplit.c @@ -163,6 +163,19 @@ static PyObject *open_noatime(PyObject *self, PyObject *args) } +static PyObject *fadvise_done(PyObject *self, PyObject *args) +{ + int fd = -1; + long long ofs = 0; + if (!PyArg_ParseTuple(args, "iL", &fd, &ofs)) + return NULL; +#ifdef POSIX_FADV_DONTNEED + posix_fadvise(fd, 0, ofs, POSIX_FADV_DONTNEED); +#endif + return Py_BuildValue(""); +} + + static PyMethodDef hashsplit_methods[] = { { "blobbits", blobbits, METH_VARARGS, "Return the number of bits in the rolling checksum." 
}, @@ -174,6 +187,8 @@ static PyMethodDef hashsplit_methods[] = { "Write random bytes to the given file descriptor" }, { "open_noatime", open_noatime, METH_VARARGS, "open() the given filename for read with O_NOATIME if possible" }, + { "fadvise_done", fadvise_done, METH_VARARGS, + "Inform the kernel that we're finished with earlier parts of a file" }, { NULL, NULL, 0, NULL }, // sentinel }; diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py index 764ff55..03232b4 100644 --- a/lib/bup/hashsplit.py +++ b/lib/bup/hashsplit.py @@ -47,9 +47,13 @@ def splitbuf(buf): def blobiter(files): for f in files: + ofs = 0 while 1: + fadvise_done(f, max(0, ofs - 1024*1024)) b = f.read(BLOB_HWM) + ofs += len(b) if not b: + fadvise_done(f, ofs) break yield b @@ -168,3 +172,9 @@ def open_noatime(name): except: pass raise + + +def fadvise_done(f, ofs): + assert(ofs >= 0) + if ofs > 0: + _hashsplit.fadvise_done(f.fileno(), ofs)