From eacc2d3b1b42fdbe9ff6a49ac5627e4fc51a35ee Mon Sep 17 00:00:00 2001 From: Avery Pennarun Date: Tue, 5 Jan 2010 23:42:15 -0500 Subject: [PATCH] Split packs around 100M objects or 1G bytes. This will make pruning much easier later, plus avoids any problems with packs >= 2GB (not that we've had any of those yet, but...), plus avoids wasting RAM with an overly full MultiPackIndex.also{} dictionary. --- client.py | 59 +++++++++++++++++++++++++++++++++++++++---- cmd-save.py | 1 - cmd-server.py | 7 +++--- cmd-split.py | 7 +++++- git.py | 70 +++++++++++++++++++++++---------------------------- hashsplit.py | 4 +++ helpers.py | 2 ++ options.py | 4 ++- test-sh | 4 +-- 9 files changed, 107 insertions(+), 51 deletions(-) diff --git a/client.py b/client.py index f01d7e7..3902066 100644 --- a/client.py +++ b/client.py @@ -96,14 +96,19 @@ class Client: self._indexes_synced = 1 + def _make_objcache(self): + ob = self._busy + self._busy = None + self.sync_indexes() + self._busy = ob + return git.MultiPackIndex(self.cachedir) + def new_packwriter(self): - assert(self._indexes_synced) self.check_busy() self._busy = 'receive-objects' - self.conn.write('receive-objects\n') - objcache = git.MultiPackIndex(self.cachedir) - return git.PackWriter_Remote(self.conn, objcache = objcache, - onclose = self._not_busy) + return PackWriter_Remote(self.conn, + objcache_maker = self._make_objcache, + onclose = self._not_busy) def read_ref(self, refname): self.check_busy() @@ -133,3 +138,47 @@ class Client: yield self.conn.read(sz) self.conn.check_ok() self._not_busy() + + +class PackWriter_Remote(git.PackWriter): + def __init__(self, conn, objcache_maker=None, onclose=None): + git.PackWriter.__init__(self, objcache_maker) + self.file = conn + self.filename = 'remote socket' + self.onclose = onclose + self._packopen = False + + def _open(self): + if not self._packopen: + self._make_objcache() + self.file.write('receive-objects\n') + self._packopen = True + + def _end(self): + if self._packopen and self.file: + self.file.write('\0\0\0\0') + self._packopen = False + id = self.file.readline().strip() + self.file.check_ok() + self.objcache = None + return id + + def close(self): + id = self._end() + self.file = None + return id + + def abort(self): + raise GitError("don't know how to abort remote pack writing") + + def _raw_write(self, datalist): + assert(self.file) + if not self._packopen: + self._open() + data = ''.join(datalist) + assert(len(data)) + self.file.write(struct.pack('!I', len(data)) + data) + self.outbytes += len(data) + self.count += 1 + + diff --git a/cmd-save.py b/cmd-save.py index 69d5626..7da0429 100755 --- a/cmd-save.py +++ b/cmd-save.py @@ -119,7 +119,6 @@ refname = opt.name and 'refs/heads/%s' % opt.name or None if opt.remote: cli = client.Client(opt.remote) oldref = refname and cli.read_ref(refname) or None - cli.sync_indexes() w = cli.new_packwriter() else: cli = None diff --git a/cmd-server.py b/cmd-server.py index e29b201..1ce16a0 100755 --- a/cmd-server.py +++ b/cmd-server.py @@ -47,7 +47,9 @@ def receive_objects(conn, junk): if not n: log('bup server: received %d object%s.\n' % (w.count, w.count!=1 and "s" or '')) - w.close() + id = w.close() + conn.write('%s\n' % id) + conn.ok() return buf = conn.read(n) # object sizes in bup are reasonably small #log('read %d bytes\n' % n) @@ -56,8 +58,7 @@ def receive_objects(conn, junk): raise Exception('object read: expected %d bytes, got %d\n' % (n, len(buf))) w._raw_write([buf]) - w.close() - conn.ok() + # NOTREACHED def read_ref(conn, refname): diff --git a/cmd-split.py b/cmd-split.py index 4928733..e7ca122 100755 --- a/cmd-split.py +++ b/cmd-split.py @@ -15,6 +15,8 @@ c,commit output a commit id n,name= name of backup set to update (if any) v,verbose increase log output (can be used more than once) bench print benchmark timings to stderr +max-pack-size= maximum bytes in a single pack +max-pack-objects= maximum number of objects in a single pack """ o = options.Options('bup split', optspec) (opt, flags, extra) = o.parse(sys.argv[1:]) @@ -28,6 +30,10 @@ hashsplit.split_verbosely = opt.verbose if opt.verbose >= 2: git.verbose = opt.verbose - 1 opt.bench = 1 +if opt.max_pack_size: + hashsplit.max_pack_size = int(opt.max_pack_size) +if opt.max_pack_objects: + hashsplit.max_pack_objects = int(opt.max_pack_objects) start_time = time.time() @@ -35,7 +41,6 @@ refname = opt.name and 'refs/heads/%s' % opt.name or None if opt.remote: cli = client.Client(opt.remote) oldref = refname and cli.read_ref(refname) or None - cli.sync_indexes() w = cli.new_packwriter() else: cli = None diff --git a/git.py b/git.py index 3138f6e..ccafa5c 100644 --- a/git.py +++ b/git.py @@ -76,11 +76,12 @@ class PackIndex: class MultiPackIndex: def __init__(self, dir): - self.packs = [] + self.dir = dir self.also = {} - for f in os.listdir(dir): + self.packs = [] + for f in os.listdir(self.dir): if f.endswith('.idx'): - self.packs.append(PackIndex(os.path.join(dir, f))) + self.packs.append(PackIndex(os.path.join(self.dir, f))) def exists(self, hash): if hash in self.also: @@ -117,28 +118,37 @@ def _shalist_sort_key(ent): _typemap = dict(blob=3, tree=2, commit=1, tag=8) class PackWriter: - def __init__(self, objcache=None): + def __init__(self, objcache_maker=None): self.count = 0 + self.outbytes = 0 self.filename = None self.file = None - self.objcache = objcache or MultiPackIndex(repo('objects/pack')) + self.objcache_maker = objcache_maker + self.objcache = None def __del__(self): self.close() + def _make_objcache(self): + if not self.objcache: + if self.objcache_maker: + self.objcache = self.objcache_maker() + else: + self.objcache = MultiPackIndex(repo('objects/pack')) + def _open(self): - assert(not self.file) - self.objcache.zap_also() - self.filename = repo('objects/bup%d' % os.getpid()) - self.file = open(self.filename + '.pack', 'w+') - self.file.write('PACK\0\0\0\2\0\0\0\0') + if not self.file: + self._make_objcache() + self.filename = repo('objects/bup%d' % os.getpid()) + self.file = open(self.filename + '.pack', 'w+') + self.file.write('PACK\0\0\0\2\0\0\0\0') def _raw_write(self, datalist): - if not self.file: - self._open() + self._open() f = self.file for d in datalist: f.write(d) + self.outbytes += len(d) self.count += 1 def _write(self, bin, type, content): @@ -165,11 +175,18 @@ class PackWriter: self._raw_write(out) return bin + def breakpoint(self): + id = self._end() + self.outbytes = self.count = 0 + return id + def write(self, type, content): return self._write(calc_hash(type, content), type, content) def maybe_write(self, type, content): bin = calc_hash(type, content) + if not self.objcache: + self._make_objcache() if not self.objcache.exists(bin): self._write(bin, type, content) self.objcache.add(bin) @@ -209,7 +226,7 @@ class PackWriter: f.close() os.unlink(self.filename + '.pack') - def close(self): + def _end(self): f = self.file if not f: return None self.file = None @@ -230,6 +247,7 @@ class PackWriter: f.write(sum.digest()) f.close() + self.objcache = None p = subprocess.Popen(['git', 'index-pack', '-v', '--index-version=2', @@ -245,32 +263,8 @@ class PackWriter: os.rename(self.filename + '.idx', nameprefix + '.idx') return nameprefix - -class PackWriter_Remote(PackWriter): - def __init__(self, conn, objcache=None, onclose=None): - PackWriter.__init__(self, objcache) - self.file = conn - self.filename = 'remote socket' - self.onclose = onclose - - def _open(self): - assert(not "can't reopen a PackWriter_Remote") - def close(self): - if self.file: - self.file.write('\0\0\0\0') - if self.onclose: - self.onclose() - self.file = None - - def abort(self): - raise GitError("don't know how to abort remote pack writing") - - def _raw_write(self, datalist): - assert(self.file) - data = ''.join(datalist) - assert(len(data)) - self.file.write(struct.pack('!I', len(data)) + data) + return self._end() def _git_date(date): diff --git a/hashsplit.py b/hashsplit.py index 8995869..16f723f 100644 --- a/hashsplit.py +++ b/hashsplit.py @@ -6,6 +6,8 @@ BLOB_LWM = 8192*2 BLOB_MAX = BLOB_LWM*2 BLOB_HWM = 1024*1024 split_verbosely = 0 +max_pack_size = 1000*1000*1000 +max_pack_objects = 10*1000*1000 class Buf: def __init__(self): @@ -88,6 +90,8 @@ def hashsplit_iter(w, files): continue if blob: + if w.outbytes >= max_pack_size or w.count >= max_pack_objects: + w.breakpoint() yield (ofs, len(blob), w.new_blob(blob)) ofs += len(blob) diff --git a/helpers.py b/helpers.py index e7e70bd..f4403ef 100644 --- a/helpers.py +++ b/helpers.py @@ -71,6 +71,7 @@ class Conn: return self.inp.readline() def write(self, data): + #log('%d writing: %d bytes\n' % (os.getpid(), len(data))) self.outp.write(data) def ok(self): @@ -80,6 +81,7 @@ class Conn: self.outp.flush() rl = '' for rl in linereader(self.inp): + #log('%d got line: %r\n' % (os.getpid(), rl)) if not rl: continue elif rl == 'ok': diff --git a/options.py b/options.py index 6b67c06..eab10c5 100644 --- a/options.py +++ b/options.py @@ -1,4 +1,4 @@ -import textwrap, getopt +import textwrap, getopt, re from helpers import * class OptDict: @@ -51,7 +51,9 @@ class Options: flagl = flags.split(',') flagl_nice = [] for f in flagl: + f_nice = re.sub(r'\W', '_', f) self._aliases[f] = flagl[0] + self._aliases[f_nice] = flagl[0] self._hasparms[f] = has_parm if len(f) == 1: self._shortopts += f + (has_parm and ':' or '') diff --git a/test-sh b/test-sh index cdf9465..56ac25a 100755 --- a/test-sh +++ b/test-sh @@ -16,14 +16,14 @@ bup init bup split --bench -b tags1.tmp bup split -vvvv -b testfile2 >tags2.tmp bup split -t testfile2 >tags2t.tmp -bup split -c testfile2 >tags2c.tmp +bup split -r "$BUP_DIR" -c testfile2 >tags2c.tmp diff -u tags1.tmp tags2.tmp || true wc -c testfile1 testfile2 wc -l tags1.tmp tags2.tmp bup join $(cat tags1.tmp) >out1.tmp bup join out2.tmp bup join out2t.tmp -bup join out2c.tmp +bup join -r "$BUP_DIR" out2c.tmp diff -u testfile1 out1.tmp diff -u testfile2 out2.tmp diff -u testfile2 out2t.tmp -- 2.39.2