From 31a7d800f45fe7ea29d1d1d311b7468398a51242 Mon Sep 17 00:00:00 2001 From: Avery Pennarun Date: Wed, 30 Dec 2009 02:33:35 -0500 Subject: [PATCH] Add a C module to do the rolling checksum. This is about 80x faster than the old speed (27megs/sec instead of 330k/sec) but still quite a lot slower than the 60+megs/sec I get *without* the checksum stuff. There are a few inefficiencies remaining, but not such easy ones as before... --- .gitignore | 1 + Makefile | 9 +++++--- hashsplit.py | 49 +++++++++------------------------------ hashsplitmodule.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+), 41 deletions(-) create mode 100644 hashsplitmodule.c diff --git a/.gitignore b/.gitignore index f0b96cf..208c127 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ hashsplit hashjoin datagen *.o +*.so *~ tags[12] out[12] diff --git a/Makefile b/Makefile index 081bd4d..b66798c 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ -CFLAGS=-Wall -g -Werror +CFLAGS=-Wall -g -Werror -I/usr/include/python2.5 -g -fwrapv -fPIC default: all -all: hashsplit hashjoin datagen +all: hashsplit hashjoin datagen hashsplit.so hashsplit: hashsplit.o @@ -10,6 +10,9 @@ datagen: datagen.o hashjoin: hashjoin.sh +hashsplit.so: hashsplitmodule.o + $(CC) -shared -Wl,-Bsymbolic-functions -o $@ $< + test: hashsplit hashjoin ./hashsplit.py tags1 ./hashsplit.py tags2 @@ -32,5 +35,5 @@ test: hashsplit hashjoin gcc -c -o $@ $^ $(CPPFLAGS) $(CFLAGS) clean: - rm -f *.o *~ hashsplit hashjoin hsplit hjoin datagen \ + rm -f *.o *.so *~ hashsplit hashjoin hsplit hjoin datagen \ out[12] tags[12] .*~ diff --git a/hashsplit.py b/hashsplit.py index ba790c6..43c8905 100755 --- a/hashsplit.py +++ b/hashsplit.py @@ -1,36 +1,17 @@ #!/usr/bin/env python import sys, os, subprocess, errno, zlib, time +import hashsplit from sha import sha +# FIXME: duplicated in C module. This shouldn't really be here at all... BLOBBITS = 14 BLOBSIZE = 1 << (BLOBBITS-1) -WINDOWBITS = 7 -WINDOWSIZE = 1 << (WINDOWBITS-1) def log(s): sys.stderr.write('%s\n' % s) -# FIXME: replace this with a not-stupid rolling checksum algorithm, -# such as the one used in rsync (Adler32?) -def stupidsum_add(old, drop, add): - return (((old<<1) | ((old>>31)&0xffffffff)) & 0xffffffff) ^ drop ^ add - - -def test_sums(): - sum = 0 - for i in range(WINDOWSIZE): - sum = stupidsum_add(sum, 0, i%256) - sum1 = sum - for i in range(WINDOWSIZE*5): - sum = stupidsum_add(sum, i%256, i%256) - assert(sum == sum1) - for i in range(WINDOWSIZE): - sum = stupidsum_add(sum, i%256, 0) - assert(sum == 0) - - class Buf: def __init__(self): self.list = [] @@ -63,19 +44,13 @@ class Buf: def splitbuf(buf): #return buf.get(BLOBSIZE) - window = [0] * WINDOWSIZE - sum = 0 - i = 0 - count = 0 - for ent in buf.list: - for c in ent: - count += 1 - b = ord(c) - sum = stupidsum_add(sum, window[i], b) - window[i] = b - i = (i + 1) % WINDOWSIZE - if (sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)): - return buf.get(count) + b = buf.get(buf.used()) + try: + ofs = hashsplit.splitbuf(b) + if ofs: + return b[:ofs] + finally: + buf.put(b[ofs:]) return None @@ -133,8 +108,8 @@ def do_main(): if blob: ofs += len(blob) - #log('SPLIT @ %-8d size=%-8d (%d/%d)' - # % (ofs, len(blob), BLOBSIZE, WINDOWSIZE)) + #log('SPLIT @ %-8d size=%-8d (blobsize=%d)' + # % (ofs, len(blob), BLOBSIZE)) save_blob(blob) nv = (ofs + buf.used())/1000000 @@ -146,7 +121,5 @@ def do_main(): % (ofs/1024., secs, ofs/1024./secs)) -assert(WINDOWSIZE >= 32) assert(BLOBSIZE >= 32) -test_sums() do_main() diff --git a/hashsplitmodule.c b/hashsplitmodule.c new file mode 100644 index 0000000..cf6305d --- /dev/null +++ b/hashsplitmodule.c @@ -0,0 +1,59 @@ +#include +#include +#include + +#define BLOBBITS (14) +#define BLOBSIZE (1<<(BLOBBITS-1)) +#define WINDOWBITS (7) +#define WINDOWSIZE (1<<(WINDOWBITS-1)) + + +// FIXME: replace this with a not-stupid rolling checksum algorithm, +// such as the one used in rsync (Adler32?) +static uint32_t stupidsum_add(uint32_t old, uint8_t drop, uint8_t add) +{ + return ((old<<1) | (old>>31)) ^ drop ^ add; +} + + +static PyObject *splitbuf(PyObject *self, PyObject *args) +{ + char *buf = NULL; + int len = 0, count; + + if (!PyArg_ParseTuple(args, "et#", "utf-8", &buf, &len)) + return NULL; + + { + unsigned char window[WINDOWSIZE]; + uint32_t sum = 0; + int i = 0; + memset(window, 0, sizeof(window)); + + for (count = 0; count < len; count++) + { + sum = stupidsum_add(sum, window[i], buf[count]); + window[i] = buf[count]; + i = (i + 1) % WINDOWSIZE; + if ((sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1))) + goto done; + } + } + + count = -1; +done: + PyMem_Free(buf); + return Py_BuildValue("i", count+1); +} + + +static PyMethodDef hashsplit_methods[] = { + { "splitbuf", splitbuf, METH_VARARGS, + "Split a list of strings based on a rolling checksum." }, + { NULL, NULL, 0, NULL }, // sentinel +}; + +PyMODINIT_FUNC inithashsplit() +{ + Py_InitModule("hashsplit", hashsplit_methods); +} -- 2.39.2