From 55a2df0be0b26a64ac88f043a1a5888617287428 Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Tue, 27 Jul 2010 01:27:54 -0400
Subject: [PATCH] _hashsplit.c: replace the stupidsum algorithm with rsync's
 adler32-based one.

I've been meaning to do this for a while, but a particular test dataset that
really caused problems with stupidsum() (ie. it split things into way more
chunks than it should have) finally screwed me over.  Let's change over to a
"real" checksum algorithm.

Non-annoying datasets shouldn't be noticeably affected, but bad ones (such
as my test case from EQL Data) can be 10x more sensible.  Typical backup
sets now have about 20% fewer chunks, although this has little affect on the
overall repository size.

WARNING: After this patch, all your chunk boundaries will be different from
before!  That means your incremental backups won't be terribly incremental
and your backup repositories will jump in size.  This should only happen
once.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 lib/bup/_hashsplit.c | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/lib/bup/_hashsplit.c b/lib/bup/_hashsplit.c
index 6994f42..e3bd946 100644
--- a/lib/bup/_hashsplit.c
+++ b/lib/bup/_hashsplit.c
@@ -8,25 +8,33 @@
 #define WINDOWBITS (7)
 #define WINDOWSIZE (1<<(WINDOWBITS-1))
 
+// According to librsync/rollsum.h:
+// "We should make this something other than zero to improve the
+// checksum algorithm: tridge suggests a prime number."
+// apenwarr: I unscientifically tried 0 and 7919, and they both ended up
+// slightly worse than the librsync value of 31 for my arbitrary test data.
+#define ROLLSUM_CHAR_OFFSET 31
 
 typedef struct {
-    uint32_t sum;
+    unsigned s1, s2;
     uint8_t window[WINDOWSIZE];
     int wofs;
 } Rollsum;
 
 
-// FIXME: replace this with a not-stupid rolling checksum algorithm,
-// such as the one used in rsync (Adler32?)
+// These formulas are based on rollsum.h in the librsync project.
 static void rollsum_add(Rollsum *r, uint8_t drop, uint8_t add)
 {
-    r->sum = ((r->sum<<1) | (r->sum>>31)) ^ drop ^ add;
+    r->s1 += add - drop;
+    r->s2 += r->s1 - (WINDOWSIZE * (drop + ROLLSUM_CHAR_OFFSET));
 }
 
 
 static void rollsum_init(Rollsum *r)
 {
-    r->sum = r->wofs = 0;
+    r->s1 = WINDOWSIZE * ROLLSUM_CHAR_OFFSET;
+    r->s2 = WINDOWSIZE * (WINDOWSIZE-1) * ROLLSUM_CHAR_OFFSET;
+    r->wofs = 0;
     memset(r->window, 0, WINDOWSIZE);
 }
 
@@ -41,6 +49,12 @@ static void rollsum_init(Rollsum *r)
 } while (0)
 
 
+static uint32_t rollsum_digest(Rollsum *r)
+{
+    return (r->s1 << 16) | (r->s2 & 0xffff);
+}
+
+
 static uint32_t rollsum_sum(uint8_t *buf, size_t ofs, size_t len)
 {
     int count;
@@ -48,7 +62,7 @@ static uint32_t rollsum_sum(uint8_t *buf, size_t ofs, size_t len)
     rollsum_init(&r);
     for (count = ofs; count < len; count++)
 	rollsum_roll(&r, buf[count]);
-    return r.sum;
+    return rollsum_digest(&r);
 }
 
 
@@ -93,13 +107,14 @@ static int find_ofs(const unsigned char *buf, int len, int *bits)
     for (count = 0; count < len; count++)
     {
 	rollsum_roll(&r, buf[count]);
-	if ((r.sum & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)))
+	if ((r.s2 & (BLOBSIZE-1)) == ((~0) & (BLOBSIZE-1)))
 	{
 	    if (bits)
 	    {
+		unsigned rsum = rollsum_digest(&r);
 		*bits = BLOBBITS;
-		r.sum >>= BLOBBITS;
-		for (*bits = BLOBBITS; (r.sum >>= 1) & 1; (*bits)++)
+		rsum >>= BLOBBITS;
+		for (*bits = BLOBBITS; (rsum >>= 1) & 1; (*bits)++)
 		    ;
 	    }
 	    return count+1;
-- 
2.39.2