From: Avery Pennarun Date: Sat, 2 Jan 2010 06:46:06 +0000 (-0500) Subject: 'bup split': speed optimization for never-ending blocks. X-Git-Tag: bup-0.01~19 X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=295288b18c6cd7bcd92eaa9c73c271ad4178b2b1;p=bup.git 'bup split': speed optimization for never-ending blocks. For blocks which never got split (eg. huge endless streams of zeroes) we would constantly scan and re-scan the same sub-blocks, making things go really slowly. In such a bad situation, there's no point in being so careful; just dump the *entire* input buffer to a chunk and move on. This vastly speeds up splitting of files with lots of blank space in them, eg. VirtualBox images. Also add a cache for git.hash_raw() so it doesn't have to stat() the same blob files over and over if the same blocks (especially zeroes) occur more than once. --- diff --git a/git.py b/git.py index 61e37cd..1e1c79f 100644 --- a/git.py +++ b/git.py @@ -1,12 +1,16 @@ import os, errno, zlib, time, sha, subprocess from helpers import * - +_objcache = {} def hash_raw(type, s): + global _objcache header = '%s %d\0' % (type, len(s)) sum = sha.sha(header) sum.update(s) + bin = sum.digest() hex = sum.hexdigest() + if bin in _objcache: + return hex dir = '.git/objects/%s' % hex[0:2] fn = '%s/%s' % (dir, hex[2:]) if not os.path.exists(fn): @@ -27,6 +31,7 @@ def hash_raw(type, s): else: #log('exists %s' % fn) pass + _objcache[bin] = 1 return hex diff --git a/hashsplit.py b/hashsplit.py index a991ca7..80b54b1 100644 --- a/hashsplit.py +++ b/hashsplit.py @@ -79,7 +79,7 @@ def hashsplit_iter(files): if eof and not blob: blob = buf.get(buf.used()) if not blob and buf.used() >= BLOB_MAX: - blob = buf.get(BLOB_MAX) # limit max blob size + blob = buf.get(buf.used()) # limit max blob size if not blob and not eof: continue