hashjoin
datagen
*.o
+*.so
*~
tags[12]
out[12]
-CFLAGS=-Wall -g -Werror
+CFLAGS=-Wall -g -Werror -I/usr/include/python2.5 -g -fwrapv -fPIC
default: all
-all: hashsplit hashjoin datagen
+all: hashsplit hashjoin datagen hashsplit.so
hashsplit: hashsplit.o
hashjoin: hashjoin.sh
+hashsplit.so: hashsplitmodule.o
+ $(CC) -shared -Wl,-Bsymbolic-functions -o $@ $<
+
test: hashsplit hashjoin
./hashsplit.py <testfile1 >tags1
./hashsplit.py <testfile2 >tags2
gcc -c -o $@ $^ $(CPPFLAGS) $(CFLAGS)
clean:
- rm -f *.o *~ hashsplit hashjoin hsplit hjoin datagen \
+ rm -f *.o *.so *~ hashsplit hashjoin hsplit hjoin datagen \
out[12] tags[12] .*~
#!/usr/bin/env python
import sys, os, subprocess, errno, zlib, time
+import hashsplit
from sha import sha
+# FIXME: duplicated in C module. This shouldn't really be here at all...
BLOBBITS = 14
BLOBSIZE = 1 << (BLOBBITS-1)
-WINDOWBITS = 7
-WINDOWSIZE = 1 << (WINDOWBITS-1)
def log(s):
sys.stderr.write('%s\n' % s)
-# FIXME: replace this with a not-stupid rolling checksum algorithm,
-# such as the one used in rsync (Adler32?)
-def stupidsum_add(old, drop, add):
- return (((old<<1) | ((old>>31)&0xffffffff)) & 0xffffffff) ^ drop ^ add
-
-
-def test_sums():
- sum = 0
- for i in range(WINDOWSIZE):
- sum = stupidsum_add(sum, 0, i%256)
- sum1 = sum
- for i in range(WINDOWSIZE*5):
- sum = stupidsum_add(sum, i%256, i%256)
- assert(sum == sum1)
- for i in range(WINDOWSIZE):
- sum = stupidsum_add(sum, i%256, 0)
- assert(sum == 0)
-
-
class Buf:
def __init__(self):
self.list = []
def splitbuf(buf):
#return buf.get(BLOBSIZE)
- window = [0] * WINDOWSIZE
- sum = 0
- i = 0
- count = 0
- for ent in buf.list:
- for c in ent:
- count += 1
- b = ord(c)
- sum = stupidsum_add(sum, window[i], b)
- window[i] = b
- i = (i + 1) % WINDOWSIZE
- if (sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)):
- return buf.get(count)
+ b = buf.get(buf.used())
+ try:
+ ofs = hashsplit.splitbuf(b)
+ if ofs:
+ return b[:ofs]
+ finally:
+ buf.put(b[ofs:])
return None
if blob:
ofs += len(blob)
- #log('SPLIT @ %-8d size=%-8d (%d/%d)'
- # % (ofs, len(blob), BLOBSIZE, WINDOWSIZE))
+ #log('SPLIT @ %-8d size=%-8d (blobsize=%d)'
+ # % (ofs, len(blob), BLOBSIZE))
save_blob(blob)
nv = (ofs + buf.used())/1000000
% (ofs/1024., secs, ofs/1024./secs))
-assert(WINDOWSIZE >= 32)
assert(BLOBSIZE >= 32)
-test_sums()
do_main()
--- /dev/null
+#include <Python.h>
+#include <assert.h>
+#include <stdint.h>
+
+#define BLOBBITS (14)
+#define BLOBSIZE (1<<(BLOBBITS-1))
+#define WINDOWBITS (7)
+#define WINDOWSIZE (1<<(WINDOWBITS-1))
+
+
+// FIXME: replace this with a not-stupid rolling checksum algorithm,
+// such as the one used in rsync (Adler32?)
+static uint32_t stupidsum_add(uint32_t old, uint8_t drop, uint8_t add)
+{
+ return ((old<<1) | (old>>31)) ^ drop ^ add;
+}
+
+
+static PyObject *splitbuf(PyObject *self, PyObject *args)
+{
+ char *buf = NULL;
+ int len = 0, count;
+
+ if (!PyArg_ParseTuple(args, "et#", "utf-8", &buf, &len))
+ return NULL;
+
+ {
+ unsigned char window[WINDOWSIZE];
+ uint32_t sum = 0;
+ int i = 0;
+ memset(window, 0, sizeof(window));
+
+ for (count = 0; count < len; count++)
+ {
+ sum = stupidsum_add(sum, window[i], buf[count]);
+ window[i] = buf[count];
+ i = (i + 1) % WINDOWSIZE;
+ if ((sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)))
+ goto done;
+ }
+ }
+
+ count = -1;
+done:
+ PyMem_Free(buf);
+ return Py_BuildValue("i", count+1);
+}
+
+
+static PyMethodDef hashsplit_methods[] = {
+ { "splitbuf", splitbuf, METH_VARARGS,
+ "Split a list of strings based on a rolling checksum." },
+ { NULL, NULL, 0, NULL }, // sentinel
+};
+
+PyMODINIT_FUNC inithashsplit()
+{
+ Py_InitModule("hashsplit", hashsplit_methods);
+}