From 31a7d800f45fe7ea29d1d1d311b7468398a51242 Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Wed, 30 Dec 2009 02:33:35 -0500
Subject: [PATCH] Add a C module to do the rolling checksum.

This is about 80x faster than the old speed (27megs/sec instead of 330k/sec)
but still quite a lot slower than the 60+megs/sec I get *without* the
checksum stuff.  There are a few inefficiencies remaining, but not such easy
ones as before...
---
 .gitignore        |  1 +
 Makefile          |  9 +++++---
 hashsplit.py      | 49 +++++++++------------------------------
 hashsplitmodule.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 77 insertions(+), 41 deletions(-)
 create mode 100644 hashsplitmodule.c
diff --git a/.gitignore b/.gitignore
index f0b96cf..208c127 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ hashsplit
 hashjoin
 datagen
 *.o
+*.so
 *~
 tags[12]
 out[12]
diff --git a/Makefile b/Makefile
index 081bd4d..b66798c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
-CFLAGS=-Wall -g -Werror
+CFLAGS=-Wall -g -Werror -I/usr/include/python2.5 -g -fwrapv -fPIC
 
 default: all
 
-all: hashsplit hashjoin datagen
+all: hashsplit hashjoin datagen hashsplit.so
 
 hashsplit: hashsplit.o
 
@@ -10,6 +10,9 @@ datagen: datagen.o
 
 hashjoin: hashjoin.sh
 
+hashsplit.so: hashsplitmodule.o
+	$(CC) -shared -Wl,-Bsymbolic-functions -o $@ $<
+
 test: hashsplit hashjoin
 	./hashsplit.py <testfile1 >tags1
 	./hashsplit.py <testfile2 >tags2
@@ -32,5 +35,5 @@ test: hashsplit hashjoin
 	gcc -c -o $@ $^ $(CPPFLAGS) $(CFLAGS)
 
 clean:
-	rm -f *.o *~ hashsplit hashjoin hsplit hjoin datagen \
+	rm -f *.o *.so *~ hashsplit hashjoin hsplit hjoin datagen \
 		out[12] tags[12] .*~
diff --git a/hashsplit.py b/hashsplit.py
index ba790c6..43c8905 100755
--- a/hashsplit.py
+++ b/hashsplit.py
@@ -1,36 +1,17 @@
 #!/usr/bin/env python
 import sys, os, subprocess, errno, zlib, time
+import hashsplit
 from sha import sha
 
+# FIXME: duplicated in C module.  This shouldn't really be here at all...
 BLOBBITS = 14
 BLOBSIZE = 1 << (BLOBBITS-1)
-WINDOWBITS = 7
-WINDOWSIZE = 1 << (WINDOWBITS-1)
 
 
 def log(s):
     sys.stderr.write('%s\n' % s)
 
 
-# FIXME: replace this with a not-stupid rolling checksum algorithm,
-# such as the one used in rsync (Adler32?)
-def stupidsum_add(old, drop, add):
-    return (((old<<1) | ((old>>31)&0xffffffff)) & 0xffffffff) ^ drop ^ add
-
-
-def test_sums():
-    sum = 0
-    for i in range(WINDOWSIZE):
-        sum = stupidsum_add(sum, 0, i%256)
-    sum1 = sum
-    for i in range(WINDOWSIZE*5):
-        sum = stupidsum_add(sum, i%256, i%256)
-    assert(sum == sum1)
-    for i in range(WINDOWSIZE):
-        sum = stupidsum_add(sum, i%256, 0)
-    assert(sum == 0)
-
-
 class Buf:
     def __init__(self):
         self.list = []
@@ -63,19 +44,13 @@ class Buf:
 
 def splitbuf(buf):
     #return buf.get(BLOBSIZE)
-    window = [0] * WINDOWSIZE
-    sum = 0
-    i = 0
-    count = 0
-    for ent in buf.list:
-        for c in ent:
-            count += 1
-            b = ord(c)
-            sum = stupidsum_add(sum, window[i], b)
-            window[i] = b
-            i = (i + 1) % WINDOWSIZE
-            if (sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)):
-                return buf.get(count)
+    b = buf.get(buf.used())
+    try:
+        ofs = hashsplit.splitbuf(b)
+        if ofs:
+            return b[:ofs]
+    finally:
+        buf.put(b[ofs:])
     return None
 
 
@@ -133,8 +108,8 @@ def do_main():
 
         if blob:
             ofs += len(blob)
-            #log('SPLIT @ %-8d size=%-8d (%d/%d)'
-            #    % (ofs, len(blob), BLOBSIZE, WINDOWSIZE))
+            #log('SPLIT @ %-8d size=%-8d (blobsize=%d)'
+            #    % (ofs, len(blob), BLOBSIZE))
             save_blob(blob)
           
         nv = (ofs + buf.used())/1000000
@@ -146,7 +121,5 @@ def do_main():
         % (ofs/1024., secs, ofs/1024./secs))
 
 
-assert(WINDOWSIZE >= 32)
 assert(BLOBSIZE >= 32)
-test_sums()
 do_main()
diff --git a/hashsplitmodule.c b/hashsplitmodule.c
new file mode 100644
index 0000000..cf6305d
--- /dev/null
+++ b/hashsplitmodule.c
@@ -0,0 +1,59 @@
+#include <Python.h>
+#include <assert.h>
+#include <stdint.h>
+
+#define BLOBBITS (14)
+#define BLOBSIZE (1<<(BLOBBITS-1))
+#define WINDOWBITS (7)
+#define WINDOWSIZE (1<<(WINDOWBITS-1))
+
+
+// FIXME: replace this with a not-stupid rolling checksum algorithm,
+// such as the one used in rsync (Adler32?)
+static uint32_t stupidsum_add(uint32_t old, uint8_t drop, uint8_t add)
+{
+    return ((old<<1) | (old>>31)) ^ drop ^ add;
+}
+
+
+static PyObject *splitbuf(PyObject *self, PyObject *args)
+{
+    char *buf = NULL;
+    int len = 0, count;
+
+    if (!PyArg_ParseTuple(args, "et#", "utf-8", &buf, &len))
+	return NULL;
+    
+    {
+	unsigned char window[WINDOWSIZE];
+	uint32_t sum = 0;
+	int i = 0;
+	memset(window, 0, sizeof(window));
+	
+	for (count = 0; count < len; count++)
+	{
+	    sum = stupidsum_add(sum, window[i], buf[count]);
+	    window[i] = buf[count];
+	    i = (i + 1) % WINDOWSIZE;
+	    if ((sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)))
+		goto done;
+	}
+    }
+    
+    count = -1;
+done:
+    PyMem_Free(buf);
+    return Py_BuildValue("i", count+1);
+}
+
+
+static PyMethodDef hashsplit_methods[] = {
+    { "splitbuf", splitbuf, METH_VARARGS,
+	"Split a list of strings based on a rolling checksum." },
+    { NULL, NULL, 0, NULL },  // sentinel
+};
+
+PyMODINIT_FUNC inithashsplit()
+{
+    Py_InitModule("hashsplit", hashsplit_methods);
+}
-- 
2.39.2