Refactor splitting functions from cmd-split.py into hashsplit.py.

author Avery Pennarun <apenwarr@gmail.com>
Fri, 1 Jan 2010 02:51:12 +0000 (21:51 -0500)

committer Avery Pennarun <apenwarr@gmail.com>
Fri, 1 Jan 2010 02:51:12 +0000 (21:51 -0500)

diff --git a/Makefile b/Makefile

index d0df36c99d707de0aee2d3d9b6636e945da21308..417ee946c54c3a9bd728c75c6cf512294cc1505c 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -2,11 +2,11 @@ CFLAGS=-Wall -g -O2 -Werror -I/usr/include/python2.5 -g -fPIC
  
  default: all
  
-all: bup-split bup-join bup randomgen hashsplit.so
+all: bup-split bup-join bup randomgen chashsplit.so
  
  randomgen: randomgen.o
  
-hashsplit.so: hashsplitmodule.o
+chashsplit.so: chashsplitmodule.o
         $(CC) -shared -o $@ $<
         
  runtests: all
diff --git a/chashsplitmodule.c b/chashsplitmodule.c

new file mode 100644 (file)

index 0000000..9393f14
--- /dev/null
+++ b/chashsplitmodule.c
@@ -0,0 +1,60 @@
+#include <Python.h>
+#include <assert.h>
+#include <stdint.h>
+
+#define BLOBBITS (14)
+#define BLOBSIZE (1<<(BLOBBITS-1))
+#define WINDOWBITS (7)
+#define WINDOWSIZE (1<<(WINDOWBITS-1))
+
+
+// FIXME: replace this with a not-stupid rolling checksum algorithm,
+// such as the one used in rsync (Adler32?)
+static uint32_t stupidsum_add(uint32_t old, uint8_t drop, uint8_t add)
+{
+    return ((old<<1) | (old>>31)) ^ drop ^ add;
+}
+
+
+static int find_ofs(const unsigned char *buf, int len)
+{
+    unsigned char window[WINDOWSIZE];
+    uint32_t sum = 0;
+    int i = 0, count;
+    memset(window, 0, sizeof(window));
+    
+    for (count = 0; count < len; count++)
+    {
+       sum = stupidsum_add(sum, window[i], buf[count]);
+       window[i] = buf[count];
+       i = (i + 1) % WINDOWSIZE;
+       if ((sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)))
+           return count+1;
+    }
+    return 0;
+}
+
+
+static PyObject *splitbuf(PyObject *self, PyObject *args)
+{
+    unsigned char *buf = NULL;
+    int len = 0, out = 0;
+
+    if (!PyArg_ParseTuple(args, "t#", &buf, &len))
+       return NULL;
+    out = find_ofs(buf, len);
+    //return Py_BuildValue("i", len);//len>BLOBSIZE ? BLOBSIZE : len);
+    return Py_BuildValue("i", out);
+}
+
+
+static PyMethodDef hashsplit_methods[] = {
+    { "splitbuf", splitbuf, METH_VARARGS,
+       "Return the offset just past the first split point in a buffer, or 0." },
+    { NULL, NULL, 0, NULL },  // sentinel
+};
+
+PyMODINIT_FUNC initchashsplit()
+{
+    Py_InitModule("chashsplit", hashsplit_methods);
+}
diff --git a/cmd-split.py b/cmd-split.py

index 1150f6031ff0df40785b7b8d01f0dcd51a3e17dd..205c3f33a6379a931724a3c5f8050522431db691 100755 (executable)
--- a/cmd-split.py
+++ b/cmd-split.py
@@ -1,99 +1,8 @@
  #!/usr/bin/env python
-import sys, os, subprocess, errno, zlib, time
+import sys, time
  import hashsplit, git, options
  from helpers import *
  
-BLOB_LWM = 8192*2
-BLOB_MAX = BLOB_LWM*2
-BLOB_HWM = 1024*1024
-
-
-class Buf:
-    def __init__(self):
-        self.data = ''
-        self.start = 0
-
-    def put(self, s):
-        #log('oldsize=%d+%d adding=%d\n' % (len(self.data), self.start, len(s)))
-        if s:
-            self.data = buffer(self.data, self.start) + s
-            self.start = 0
-            
-    def peek(self, count):
-        return buffer(self.data, self.start, count)
-    
-    def eat(self, count):
-        self.start += count
-
-    def get(self, count):
-        v = buffer(self.data, self.start, count)
-        self.start += count
-        return v
-
-    def used(self):
-        return len(self.data) - self.start
-
-
-def splitbuf(buf):
-    b = buf.peek(buf.used())
-    ofs = hashsplit.splitbuf(b)
-    if ofs:
-        buf.eat(ofs)
-        return buffer(b, 0, ofs)
-    return None
-
-
-def blobiter(files):
-    for f in files:
-        b = 1
-        while b:
-            b = f.read(BLOB_HWM)
-            if b:
-                yield b
-    yield '' # EOF indicator
-
-
-def autofiles(filenames):
-    if not filenames:
-        yield sys.stdin
-    else:
-        for n in filenames:
-            yield open(n)
-            
-    
-def hashsplit_iter(f):
-    ofs = 0
-    buf = Buf()
-    fi = blobiter(f)
-    blob = 1
-
-    eof = 0
-    lv = 0
-    while blob or not eof:
-        if not eof and (buf.used() < BLOB_LWM or not blob):
-            bnew = fi.next()
-            if not bnew: eof = 1
-            #log('got %d, total %d\n' % (len(bnew), buf.used()))
-            buf.put(bnew)
-
-        blob = splitbuf(buf)
-        if eof and not blob:
-            blob = buf.get(buf.used())
-        if not blob and buf.used() >= BLOB_MAX:
-            blob = buf.get(BLOB_MAX)  # limit max blob size
-        if not blob and not eof:
-            continue
-
-        if blob:
-            yield (ofs, len(blob), git.hash_blob(blob))
-            ofs += len(blob)
-          
-        nv = (ofs + buf.used())/1000000
-        if nv != lv:
-            log('%d\t' % nv)
-            lv = nv
-
-
  optspec = """
  bup split [-t] [filenames...]
  --
@@ -111,26 +20,11 @@ if not (opt.blobs or opt.tree or opt.commit or opt.name):
      o.usage()
  
  start_time = time.time()
-shalist = []
  
-ofs = 0
-last_ofs = 0
-for (ofs, size, sha) in hashsplit_iter(autofiles(extra)):
-    #log('SPLIT @ %-8d size=%-8d\n' % (ofs, size))
-    if opt.blobs:
-        print sha
-            
-    # this silliness keeps chunk filenames "similar" when a file changes
-    # slightly.
-    bm = BLOB_MAX
-    while 1:
-        cn = ofs / bm * bm
-        #log('%x,%x,%x,%x\n' % (last_ofs,ofs,cn,bm))
-        if cn > last_ofs or ofs == last_ofs: break
-        bm /= 2
-    last_ofs = cn
-    shalist.append(('100644', 'bup.chunk.%016x' % cn, sha))
-tree = git.gen_tree(shalist)
+(shalist,tree) = hashsplit.split_to_tree(hashsplit.autofiles(extra))
+if opt.blobs:  # print the hash of every chunk, one per line
+    for (mode,name,sha) in shalist:  # 'sha', not 'sum': don't shadow the builtin
+        print sha
  if opt.tree:
      print tree
  if opt.commit or opt.name:
diff --git a/hashsplit.py b/hashsplit.py

new file mode 100644 (file)

index 0000000..76bc444
--- /dev/null
+++ b/hashsplit.py
@@ -0,0 +1,113 @@
+import sys
+import git, chashsplit
+from helpers import *
+
+BLOB_LWM = 8192*2
+BLOB_MAX = BLOB_LWM*2
+BLOB_HWM = 1024*1024
+
+
+class Buf:
+    def __init__(self):
+        self.data = ''
+        self.start = 0
+
+    def put(self, s):
+        #log('oldsize=%d+%d adding=%d\n' % (len(self.data), self.start, len(s)))
+        if s:
+            self.data = buffer(self.data, self.start) + s
+            self.start = 0
+            
+    def peek(self, count):
+        return buffer(self.data, self.start, count)
+    
+    def eat(self, count):
+        self.start += count
+
+    def get(self, count):
+        v = buffer(self.data, self.start, count)
+        self.start += count
+        return v
+
+    def used(self):
+        return len(self.data) - self.start
+
+
+def splitbuf(buf):
+    b = buf.peek(buf.used())
+    ofs = chashsplit.splitbuf(b)
+    if ofs:
+        buf.eat(ofs)
+        return buffer(b, 0, ofs)
+    return None
+
+
+def blobiter(files):
+    for f in files:
+        b = 1
+        while b:
+            b = f.read(BLOB_HWM)
+            if b:
+                yield b
+    yield '' # EOF indicator
+
+
+def autofiles(filenames):
+    if not filenames:
+        yield sys.stdin
+    else:
+        for n in filenames:
+            yield open(n)
+            
+    
+def hashsplit_iter(files):
+    ofs = 0
+    buf = Buf()
+    fi = blobiter(files)
+    blob = 1
+
+    eof = 0
+    lv = 0
+    while blob or not eof:
+        if not eof and (buf.used() < BLOB_LWM or not blob):
+            bnew = fi.next()
+            if not bnew: eof = 1
+            #log('got %d, total %d\n' % (len(bnew), buf.used()))
+            buf.put(bnew)
+
+        blob = splitbuf(buf)
+        if eof and not blob:
+            blob = buf.get(buf.used())
+        if not blob and buf.used() >= BLOB_MAX:
+            blob = buf.get(BLOB_MAX)  # limit max blob size
+        if not blob and not eof:
+            continue
+
+        if blob:
+            yield (ofs, len(blob), git.hash_blob(blob))
+            ofs += len(blob)
+          
+        nv = (ofs + buf.used())/1000000
+        if nv != lv:
+            log('%d\t' % nv)
+            lv = nv
+
+
+def split_to_tree(files):
+    shalist = []
+    ofs = 0
+    last_ofs = 0
+    for (ofs, size, sha) in hashsplit_iter(files):
+        #log('SPLIT @ %-8d size=%-8d\n' % (ofs, size))
+        # this silliness keeps chunk filenames "similar" when a file changes
+        # slightly.
+        bm = BLOB_MAX
+        while 1:
+            cn = ofs / bm * bm
+            #log('%x,%x,%x,%x\n' % (last_ofs,ofs,cn,bm))
+            if cn > last_ofs or ofs == last_ofs: break
+            bm /= 2
+        last_ofs = cn
+        shalist.append(('100644', 'bup.chunk.%016x' % cn, sha))
+    tree = git.gen_tree(shalist)
+    return (shalist, tree)
diff --git a/hashsplitmodule.c b/hashsplitmodule.c

deleted file mode 100644 (file)

index 98029a2..0000000
--- a/hashsplitmodule.c
+++ /dev/null
@@ -1,60 +0,0 @@
-#include <Python.h>
-#include <assert.h>
-#include <stdint.h>
-
-#define BLOBBITS (14)
-#define BLOBSIZE (1<<(BLOBBITS-1))
-#define WINDOWBITS (7)
-#define WINDOWSIZE (1<<(WINDOWBITS-1))
-
-
-// FIXME: replace this with a not-stupid rolling checksum algorithm,
-// such as the one used in rsync (Adler32?)
-static uint32_t stupidsum_add(uint32_t old, uint8_t drop, uint8_t add)
-{
-    return ((old<<1) | (old>>31)) ^ drop ^ add;
-}
-
-
-static int find_ofs(const unsigned char *buf, int len)
-{
-    unsigned char window[WINDOWSIZE];
-    uint32_t sum = 0;
-    int i = 0, count;
-    memset(window, 0, sizeof(window));
-    
-    for (count = 0; count < len; count++)
-    {
-       sum = stupidsum_add(sum, window[i], buf[count]);
-       window[i] = buf[count];
-       i = (i + 1) % WINDOWSIZE;
-       if ((sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)))
-           return count+1;
-    }
-    return 0;
-}
-
-
-static PyObject *splitbuf(PyObject *self, PyObject *args)
-{
-    unsigned char *buf = NULL;
-    int len = 0, out = 0;
-
-    if (!PyArg_ParseTuple(args, "t#", &buf, &len))
-       return NULL;
-    out = find_ofs(buf, len);
-    //return Py_BuildValue("i", len);//len>BLOBSIZE ? BLOBSIZE : len);
-    return Py_BuildValue("i", out);
-}
-
-
-static PyMethodDef hashsplit_methods[] = {
-    { "splitbuf", splitbuf, METH_VARARGS,
-       "Split a list of strings based on a rolling checksum." },
-    { NULL, NULL, 0, NULL },  // sentinel
-};
-
-PyMODINIT_FUNC inithashsplit()
-{
-    Py_InitModule("hashsplit", hashsplit_methods);
-}
author	Avery Pennarun <apenwarr@gmail.com>
	Fri, 1 Jan 2010 02:51:12 +0000 (21:51 -0500)
committer	Avery Pennarun <apenwarr@gmail.com>
	Fri, 1 Jan 2010 02:51:12 +0000 (21:51 -0500)
Makefile		patch | blob | history
chashsplitmodule.c	[new file with mode: 0644]	patch | blob
cmd-split.py		patch | blob | history
hashsplit.py	[new file with mode: 0644]	patch | blob
hashsplitmodule.c	[deleted file]	patch | blob | history