Add a C module to do the rolling checksum.

author Avery Pennarun <apenwarr@gmail.com>

Wed, 30 Dec 2009 07:33:35 +0000 (02:33 -0500)

committer Avery Pennarun <apenwarr@gmail.com>

Wed, 30 Dec 2009 07:48:09 +0000 (02:48 -0500)
author Avery Pennarun <apenwarr@gmail.com>
Wed, 30 Dec 2009 07:33:35 +0000 (02:33 -0500)
committer Avery Pennarun <apenwarr@gmail.com>
Wed, 30 Dec 2009 07:48:09 +0000 (02:48 -0500)
diff --git a/.gitignore b/.gitignore

index f0b96cfd60cb293727f831814388db0096a54b3b..208c1278429dc8fb898a8ffe803e4f87357bce62 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ hashsplit
  hashjoin
  datagen
  *.o
+*.so
  *~
  tags[12]
  out[12]
diff --git a/Makefile b/Makefile

index 081bd4d77adeff0b8dec9f00eff1dd8fb0564371..b66798cc8e668552201bd16eef720150a90de158 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
-CFLAGS=-Wall -g -Werror
+CFLAGS=-Wall -g -Werror -I/usr/include/python2.5 -g -fwrapv -fPIC
  
  default: all
  
-all: hashsplit hashjoin datagen
+all: hashsplit hashjoin datagen hashsplit.so
  
  hashsplit: hashsplit.o
  
@@ -10,6 +10,9 @@ datagen: datagen.o
  
  hashjoin: hashjoin.sh
  
+hashsplit.so: hashsplitmodule.o
+       $(CC) -shared -Wl,-Bsymbolic-functions -o $@ $<
+
  test: hashsplit hashjoin
         ./hashsplit.py <testfile1 >tags1
         ./hashsplit.py <testfile2 >tags2
@@ -32,5 +35,5 @@ test: hashsplit hashjoin
         gcc -c -o $@ $^ $(CPPFLAGS) $(CFLAGS)
  
  clean:
-       rm -f *.o *~ hashsplit hashjoin hsplit hjoin datagen \
+       rm -f *.o *.so *~ hashsplit hashjoin hsplit hjoin datagen \
                 out[12] tags[12] .*~
diff --git a/hashsplit.py b/hashsplit.py

index ba790c61d95eb4271068e7769354abac9bf04a12..43c8905746d7c73933a9010d7ad895fd2d67e313 100755 (executable)
--- a/hashsplit.py
+++ b/hashsplit.py
@@ -1,36 +1,17 @@
  #!/usr/bin/env python
  import sys, os, subprocess, errno, zlib, time
+import hashsplit
  from sha import sha
  
+# FIXME: duplicated in C module.  This shouldn't really be here at all...
  BLOBBITS = 14
  BLOBSIZE = 1 << (BLOBBITS-1)
-WINDOWBITS = 7
-WINDOWSIZE = 1 << (WINDOWBITS-1)
  
  
  def log(s):
      sys.stderr.write('%s\n' % s)
  
  
-# FIXME: replace this with a not-stupid rolling checksum algorithm,
-# such as the one used in rsync (Adler32?)
-def stupidsum_add(old, drop, add):
-    return (((old<<1) | ((old>>31)&0xffffffff)) & 0xffffffff) ^ drop ^ add
-
-
-def test_sums():
-    sum = 0
-    for i in range(WINDOWSIZE):
-        sum = stupidsum_add(sum, 0, i%256)
-    sum1 = sum
-    for i in range(WINDOWSIZE*5):
-        sum = stupidsum_add(sum, i%256, i%256)
-    assert(sum == sum1)
-    for i in range(WINDOWSIZE):
-        sum = stupidsum_add(sum, i%256, 0)
-    assert(sum == 0)
-
-
  class Buf:
      def __init__(self):
          self.list = []
@@ -63,19 +44,13 @@ class Buf:
  
  def splitbuf(buf):
      #return buf.get(BLOBSIZE)
-    window = [0] * WINDOWSIZE
-    sum = 0
-    i = 0
-    count = 0
-    for ent in buf.list:
-        for c in ent:
-            count += 1
-            b = ord(c)
-            sum = stupidsum_add(sum, window[i], b)
-            window[i] = b
-            i = (i + 1) % WINDOWSIZE
-            if (sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)):
-                return buf.get(count)
+    b = buf.get(buf.used())
+    try:
+        ofs = hashsplit.splitbuf(b)
+        if ofs:
+            return b[:ofs]
+    finally:
+        buf.put(b[ofs:])
      return None
  
  
@@ -133,8 +108,8 @@ def do_main():
  
          if blob:
              ofs += len(blob)
-            #log('SPLIT @ %-8d size=%-8d (%d/%d)'
-            #    % (ofs, len(blob), BLOBSIZE, WINDOWSIZE))
+            #log('SPLIT @ %-8d size=%-8d (blobsize=%d)'
+            #    % (ofs, len(blob), BLOBSIZE))
              save_blob(blob)
            
          nv = (ofs + buf.used())/1000000
@@ -146,7 +121,5 @@ def do_main():
          % (ofs/1024., secs, ofs/1024./secs))
  
  
-assert(WINDOWSIZE >= 32)
  assert(BLOBSIZE >= 32)
-test_sums()
  do_main()
diff --git a/hashsplitmodule.c b/hashsplitmodule.c

new file mode 100644 (file)

index 0000000..cf6305d
--- /dev/null
+++ b/hashsplitmodule.c
@@ -0,0 +1,59 @@
+#include <Python.h>
+#include <assert.h>
+#include <stdint.h>
+
+#define BLOBBITS (14)
+#define BLOBSIZE (1<<(BLOBBITS-1))
+#define WINDOWBITS (7)
+#define WINDOWSIZE (1<<(WINDOWBITS-1))
+
+
+// FIXME: replace this with a not-stupid rolling checksum algorithm,
+// such as the one used in rsync (Adler32?)
+static uint32_t stupidsum_add(uint32_t old, uint8_t drop, uint8_t add)
+{
+    return ((old<<1) | (old>>31)) ^ drop ^ add;
+}
+
+
+static PyObject *splitbuf(PyObject *self, PyObject *args)
+{
+    char *buf = NULL;
+    int len = 0, count;
+
+    if (!PyArg_ParseTuple(args, "et#", "utf-8", &buf, &len))
+       return NULL;
+    
+    {
+       unsigned char window[WINDOWSIZE];
+       uint32_t sum = 0;
+       int i = 0;
+       memset(window, 0, sizeof(window));
+       
+       for (count = 0; count < len; count++)
+       {
+           sum = stupidsum_add(sum, window[i], buf[count]);
+           window[i] = buf[count];
+           i = (i + 1) % WINDOWSIZE;
+           if ((sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)))
+               goto done;
+       }
+    }
+    
+    count = -1;
+done:
+    PyMem_Free(buf);
+    return Py_BuildValue("i", count+1);
+}
+
+
+static PyMethodDef hashsplit_methods[] = {
+    { "splitbuf", splitbuf, METH_VARARGS,
+       "Split a list of strings based on a rolling checksum." },
+    { NULL, NULL, 0, NULL },  // sentinel
+};
+
+PyMODINIT_FUNC inithashsplit()
+{
+    Py_InitModule("hashsplit", hashsplit_methods);
+}
author	Avery Pennarun <apenwarr@gmail.com>
	Wed, 30 Dec 2009 07:33:35 +0000 (02:33 -0500)
committer	Avery Pennarun <apenwarr@gmail.com>
	Wed, 30 Dec 2009 07:48:09 +0000 (02:48 -0500)
.gitignore		patch \| blob \| history
Makefile		patch \| blob \| history
hashsplit.py		patch \| blob \| history
hashsplitmodule.c	[new file with mode: 0644]	patch \| blob