arthur.barton.de Git - bup.git/commitdiff
hashsplit.py is now much, much faster than before.
author Avery Pennarun <apenwarr@gmail.com>
Wed, 30 Dec 2009 00:20:35 +0000 (19:20 -0500)
committer Avery Pennarun <apenwarr@gmail.com>
Wed, 30 Dec 2009 00:27:30 +0000 (19:27 -0500)
Before: 4.8 secs; after: 0.8 secs for testfile1.

That is still vastly slower than the C version (0.17 secs, including the time
to fork git for each blob), but it is a significant improvement.

The remaining slowness seems to be entirely from:

- running git hash-object (which we can avoid by hashing the object
  ourselves)

- running the rolling checksum algorithm (which we can speed up using a C
  module)

So it's looking good.

Makefile
hashsplit.c
hashsplit.py

index 6c62c9b74a45b6db988118047f4ffacb19eef118..21c176dd1f29a6a6d5e4554f2df6e3d62844fed0 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -9,8 +9,8 @@ hashsplit: hashsplit.o
 hashjoin: hashjoin.sh
 
 test: hashsplit hashjoin
-       ./hashsplit <testfile1 >tags1
-       ./hashsplit <testfile2 >tags2
+       ./hashsplit.py <testfile1 >tags1
+       ./hashsplit.py <testfile2 >tags2
        diff -u tags1 tags2 || true
        wc -c testfile1 testfile2
        wc -l tags1 tags2
index b623d824fe4e8c511c50ad27df48be82d12b9663..038f2d0829ba681cff9622f87f5860935b41133a 100644 (file)
--- a/hashsplit.c
+++ b/hashsplit.c
@@ -70,6 +70,13 @@ int main()
        i = (i + 1) % WINDOWSIZE;
        count++;
        
+       if (!pipe)
+           pipe = popen("git hash-object --stdin -w", "w");
+           
+        // FIXME: write more than one byte at a time.  This is absurdly
+        // slow.
+       fputc(c, pipe);
+       
        if ((sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)))
        {
            fprintf(stderr, "SPLIT @ %-8d size=%-8d (%d/%d)\n",
@@ -85,13 +92,6 @@ int main()
                pipe = NULL;
            }
        }
-       
-       if (!pipe)
-           pipe = popen("git hash-object --stdin -w", "w");
-           
-        // FIXME: write more than one byte at a time.  This is absurdly
-        // slow.
-       fputc(c, pipe);
     }
     
     if (pipe)
index aa87ed912d4e38d47189280acc387ba72ede3663..7932e40cf9531e283cf0ea61d07ea4f9c762ec70 100755 (executable)
--- a/hashsplit.py
+++ b/hashsplit.py
@@ -25,41 +25,87 @@ def test_sums():
     assert(sum == 0)
 
 
-def do_main():
-    buf = [0] * WINDOWSIZE
+class Buf:
+    def __init__(self):
+        self.list = []
+        self.total = 0
+
+    def put(self, s):
+        if s:
+            self.list.append(s)
+            self.total += len(s)
+
+    def get(self, count):
+        count = count
+        out = []
+        while count > 0 and self.list:
+            n = len(self.list[0])
+            if count >= n:
+                out.append(self.list[0])
+                self.list = self.list[1:]
+            else:
+                n = count
+                out.append(self.list[0][:n])
+                self.list[0] = self.list[0][n:]
+            count -= n
+            self.total -= n
+        return ''.join(out)
+
+    def used(self):
+        return self.total
+
+
+def splitbuf(buf):
+    #return buf.get(BLOBSIZE)
+    window = [0] * WINDOWSIZE
     sum = 0
     i = 0
     count = 0
-    last_count = 0
+    for ent in buf.list:
+        for c in ent:
+            count += 1
+            b = ord(c)
+            sum = stupidsum_add(sum, window[i], b)
+            window[i] = b
+            i = (i + 1) % WINDOWSIZE
+            if (sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)):
+                return buf.get(count)
+    return None
+
+
+def save_blob(blob):
+    pipe = subprocess.Popen(['git', 'hash-object', '--stdin', '-w'],
+                            stdin=subprocess.PIPE)
+    pipe.stdin.write(blob)
+    pipe.stdin.close()
+    pipe.wait()
     pipe = None
 
-    while 1:
-        c = sys.stdin.read(1)
-        if not len(c): break
-        c = ord(c)
-        sum = stupidsum_add(sum, buf[i], c)
-        buf[i] = c
-        i = (i + 1) % WINDOWSIZE
-        count += 1
 
-        if (sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)):
-            sys.stderr.write('SPLIT @ %-8d size=%-8d (%d/%d)\n'
-                             % (count, count - last_count,
-                                BLOBSIZE, WINDOWSIZE))
-            last_count = count
-            i = 0
-            buf = [0] * WINDOWSIZE
-            sum = 0
-            if pipe:
-                pipe.stdin.close()
-                pipe.wait()
-                pipe = None
-
-        if not pipe:
-            pipe = subprocess.Popen(['git', 'hash-object', '--stdin', '-w'],
-                                    stdin=subprocess.PIPE)
-        pipe.stdin.write(chr(c))
+def do_main():
+    ofs = 0
+    buf = Buf()
+    blob = 1
 
+    eof = 0
+    while blob or not eof:
+        if not eof and (buf.used() < BLOBSIZE*2 or not blob):
+            bnew = sys.stdin.read(BLOBSIZE*4)
+            if not len(bnew): eof = 1
+            # print 'got %d, total %d' % (len(bnew), buf.used())
+            buf.put(bnew)
+
+        blob = splitbuf(buf)
+        if not blob and not eof:
+            continue
+        if eof and not blob:
+            blob = buf.get(buf.used())
+
+        if blob:
+            ofs += len(blob)
+            sys.stderr.write('SPLIT @ %-8d size=%-8d (%d/%d)\n'
+                             % (ofs, len(blob), BLOBSIZE, WINDOWSIZE))
+            save_blob(blob)
 
 assert(WINDOWSIZE >= 32)
 assert(BLOBSIZE >= 32)