arthur.barton.de Git - bup.git/commitdiff
Split packs around 10M objects or 1G bytes.
author Avery Pennarun <apenwarr@gmail.com>
Wed, 6 Jan 2010 04:42:15 +0000 (23:42 -0500)
committer Avery Pennarun <apenwarr@gmail.com>
Wed, 6 Jan 2010 05:28:17 +0000 (00:28 -0500)
This will make pruning much easier later, avoid any problems with
packs >= 2GB (not that we've had any of those yet, but...), and avoid
wasting RAM with an overly full MultiPackIndex.also{} dictionary.

client.py
cmd-save.py
cmd-server.py
cmd-split.py
git.py
hashsplit.py
helpers.py
options.py
test-sh

index f01d7e71b9f2617e93c59405a54c108dfa0464da..3902066645ea60fc7db098d14b11607c0232caf5 100644 (file)
--- a/client.py
+++ b/client.py
@@ -96,14 +96,19 @@ class Client:
 
         self._indexes_synced = 1
 
+    def _make_objcache(self):
+        ob = self._busy
+        self._busy = None
+        self.sync_indexes()
+        self._busy = ob
+        return git.MultiPackIndex(self.cachedir)
+
     def new_packwriter(self):
-        assert(self._indexes_synced)
         self.check_busy()
         self._busy = 'receive-objects'
-        self.conn.write('receive-objects\n')
-        objcache = git.MultiPackIndex(self.cachedir)
-        return git.PackWriter_Remote(self.conn, objcache = objcache,
-                                     onclose = self._not_busy)
+        return PackWriter_Remote(self.conn,
+                                 objcache_maker = self._make_objcache,
+                                 onclose = self._not_busy)
 
     def read_ref(self, refname):
         self.check_busy()
@@ -133,3 +138,47 @@ class Client:
             yield self.conn.read(sz)
         self.conn.check_ok()
         self._not_busy()
+
+
+class PackWriter_Remote(git.PackWriter):
+    def __init__(self, conn, objcache_maker=None, onclose=None):
+        git.PackWriter.__init__(self, objcache_maker)
+        self.file = conn
+        self.filename = 'remote socket'
+        self.onclose = onclose
+        self._packopen = False
+
+    def _open(self):
+        if not self._packopen:
+            self._make_objcache()
+            self.file.write('receive-objects\n')
+            self._packopen = True
+
+    def _end(self):
+        if self._packopen and self.file:
+            self.file.write('\0\0\0\0')
+            self._packopen = False
+            id = self.file.readline().strip()
+            self.file.check_ok()
+            self.objcache = None
+            return id
+
+    def close(self):
+        id = self._end()
+        self.file = None
+        return id
+
+    def abort(self):
+        raise GitError("don't know how to abort remote pack writing")
+
+    def _raw_write(self, datalist):
+        assert(self.file)
+        if not self._packopen:
+            self._open()
+        data = ''.join(datalist)
+        assert(len(data))
+        self.file.write(struct.pack('!I', len(data)) + data)
+        self.outbytes += len(data)
+        self.count += 1
+
+
index 69d5626954380ba66cb90e4ba651f3e4de639703..7da0429503cff90286f0b680249151db51590aab 100755 (executable)
@@ -119,7 +119,6 @@ refname = opt.name and 'refs/heads/%s' % opt.name or None
 if opt.remote:
     cli = client.Client(opt.remote)
     oldref = refname and cli.read_ref(refname) or None
-    cli.sync_indexes()
     w = cli.new_packwriter()
 else:
     cli = None
index e29b20189bf9ef0919eb29bb2372398f4a3a5756..1ce16a0551b07562f429bac52968abdb1c05adf6 100755 (executable)
@@ -47,7 +47,9 @@ def receive_objects(conn, junk):
         if not n:
             log('bup server: received %d object%s.\n' 
                 % (w.count, w.count!=1 and "s" or ''))
-            w.close()
+            id = w.close()
+            conn.write('%s\n' % id)
+            conn.ok()
             return
         buf = conn.read(n)  # object sizes in bup are reasonably small
         #log('read %d bytes\n' % n)
@@ -56,8 +58,7 @@ def receive_objects(conn, junk):
             raise Exception('object read: expected %d bytes, got %d\n'
                             % (n, len(buf)))
         w._raw_write([buf])
-    w.close()
-    conn.ok()
+    # NOTREACHED
 
 
 def read_ref(conn, refname):
index 49287338f5bee553db0698e81455e1be9dcb95d1..e7ca12299826ed237c15c3332701968a6332ec43 100755 (executable)
@@ -15,6 +15,8 @@ c,commit   output a commit id
 n,name=    name of backup set to update (if any)
 v,verbose  increase log output (can be used more than once)
 bench      print benchmark timings to stderr
+max-pack-size=  maximum bytes in a single pack
+max-pack-objects=  maximum number of objects in a single pack
 """
 o = options.Options('bup split', optspec)
 (opt, flags, extra) = o.parse(sys.argv[1:])
@@ -28,6 +30,10 @@ hashsplit.split_verbosely = opt.verbose
 if opt.verbose >= 2:
     git.verbose = opt.verbose - 1
     opt.bench = 1
+if opt.max_pack_size:
+    hashsplit.max_pack_size = int(opt.max_pack_size)
+if opt.max_pack_objects:
+    hashsplit.max_pack_objects = int(opt.max_pack_objects)
 
 start_time = time.time()
 
@@ -35,7 +41,6 @@ refname = opt.name and 'refs/heads/%s' % opt.name or None
 if opt.remote:
     cli = client.Client(opt.remote)
     oldref = refname and cli.read_ref(refname) or None
-    cli.sync_indexes()
     w = cli.new_packwriter()
 else:
     cli = None
diff --git a/git.py b/git.py
index 3138f6eb80f5dc7056b461f7ccfdce2f9c7b6745..ccafa5c8b7e8eedd6e842d51cd70e78385386dfe 100644 (file)
--- a/git.py
+++ b/git.py
@@ -76,11 +76,12 @@ class PackIndex:
 
 class MultiPackIndex:
     def __init__(self, dir):
-        self.packs = []
+        self.dir = dir
         self.also = {}
-        for f in os.listdir(dir):
+        self.packs = []
+        for f in os.listdir(self.dir):
             if f.endswith('.idx'):
-                self.packs.append(PackIndex(os.path.join(dir, f)))
+                self.packs.append(PackIndex(os.path.join(self.dir, f)))
 
     def exists(self, hash):
         if hash in self.also:
@@ -117,28 +118,37 @@ def _shalist_sort_key(ent):
 
 _typemap = dict(blob=3, tree=2, commit=1, tag=8)
 class PackWriter:
-    def __init__(self, objcache=None):
+    def __init__(self, objcache_maker=None):
         self.count = 0
+        self.outbytes = 0
         self.filename = None
         self.file = None
-        self.objcache = objcache or MultiPackIndex(repo('objects/pack'))
+        self.objcache_maker = objcache_maker
+        self.objcache = None
 
     def __del__(self):
         self.close()
 
+    def _make_objcache(self):
+        if not self.objcache:
+            if self.objcache_maker:
+                self.objcache = self.objcache_maker()
+            else:
+                self.objcache = MultiPackIndex(repo('objects/pack'))
+
     def _open(self):
-        assert(not self.file)
-        self.objcache.zap_also()
-        self.filename = repo('objects/bup%d' % os.getpid())
-        self.file = open(self.filename + '.pack', 'w+')
-        self.file.write('PACK\0\0\0\2\0\0\0\0')
+        if not self.file:
+            self._make_objcache()
+            self.filename = repo('objects/bup%d' % os.getpid())
+            self.file = open(self.filename + '.pack', 'w+')
+            self.file.write('PACK\0\0\0\2\0\0\0\0')
 
     def _raw_write(self, datalist):
-        if not self.file:
-            self._open()
+        self._open()
         f = self.file
         for d in datalist:
             f.write(d)
+            self.outbytes += len(d)
         self.count += 1
 
     def _write(self, bin, type, content):
@@ -165,11 +175,18 @@ class PackWriter:
         self._raw_write(out)
         return bin
 
+    def breakpoint(self):
+        id = self._end()
+        self.outbytes = self.count = 0
+        return id
+
     def write(self, type, content):
         return self._write(calc_hash(type, content), type, content)
 
     def maybe_write(self, type, content):
         bin = calc_hash(type, content)
+        if not self.objcache:
+            self._make_objcache()
         if not self.objcache.exists(bin):
             self._write(bin, type, content)
             self.objcache.add(bin)
@@ -209,7 +226,7 @@ class PackWriter:
             f.close()
             os.unlink(self.filename + '.pack')
 
-    def close(self):
+    def _end(self):
         f = self.file
         if not f: return None
         self.file = None
@@ -230,6 +247,7 @@ class PackWriter:
         f.write(sum.digest())
         
         f.close()
+        self.objcache = None
 
         p = subprocess.Popen(['git', 'index-pack', '-v',
                               '--index-version=2',
@@ -245,32 +263,8 @@ class PackWriter:
         os.rename(self.filename + '.idx', nameprefix + '.idx')
         return nameprefix
 
-
-class PackWriter_Remote(PackWriter):
-    def __init__(self, conn, objcache=None, onclose=None):
-        PackWriter.__init__(self, objcache)
-        self.file = conn
-        self.filename = 'remote socket'
-        self.onclose = onclose
-
-    def _open(self):
-        assert(not "can't reopen a PackWriter_Remote")
-
     def close(self):
-        if self.file:
-            self.file.write('\0\0\0\0')
-            if self.onclose:
-                self.onclose()
-        self.file = None
-
-    def abort(self):
-        raise GitError("don't know how to abort remote pack writing")
-
-    def _raw_write(self, datalist):
-        assert(self.file)
-        data = ''.join(datalist)
-        assert(len(data))
-        self.file.write(struct.pack('!I', len(data)) + data)
+        return self._end()
 
 
 def _git_date(date):
index 89958692539c6b8b0cb3168a76ba18081a0faf0f..16f723f0ddf608cb25a6dc4411200e6268003f37 100644 (file)
@@ -6,6 +6,8 @@ BLOB_LWM = 8192*2
 BLOB_MAX = BLOB_LWM*2
 BLOB_HWM = 1024*1024
 split_verbosely = 0
+max_pack_size = 1000*1000*1000
+max_pack_objects = 10*1000*1000
 
 class Buf:
     def __init__(self):
@@ -88,6 +90,8 @@ def hashsplit_iter(w, files):
             continue
 
         if blob:
+            if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
+                w.breakpoint()
             yield (ofs, len(blob), w.new_blob(blob))
             ofs += len(blob)
           
index e7e70bda9f5dabe525640847e967f842e5396733..f4403ef0ce9c9b92d92eea7c2ec5297818821c37 100644 (file)
@@ -71,6 +71,7 @@ class Conn:
         return self.inp.readline()
 
     def write(self, data):
+        #log('%d writing: %d bytes\n' % (os.getpid(), len(data)))
         self.outp.write(data)
 
     def ok(self):
@@ -80,6 +81,7 @@ class Conn:
         self.outp.flush()
         rl = ''
         for rl in linereader(self.inp):
+            #log('%d got line: %r\n' % (os.getpid(), rl))
             if not rl:
                 continue
             elif rl == 'ok':
index 6b67c0640af910ee35c3feb0e3653935e31bd809..eab10c5198b94a3f7694aff8c0bc5c3c39d9ab5d 100644 (file)
@@ -1,4 +1,4 @@
-import textwrap, getopt
+import textwrap, getopt, re
 from helpers import *
 
 class OptDict:
@@ -51,7 +51,9 @@ class Options:
                 flagl = flags.split(',')
                 flagl_nice = []
                 for f in flagl:
+                    f_nice = re.sub(r'\W', '_', f)
                     self._aliases[f] = flagl[0]
+                    self._aliases[f_nice] = flagl[0]
                     self._hasparms[f] = has_parm
                     if len(f) == 1:
                         self._shortopts += f + (has_parm and ':' or '')
diff --git a/test-sh b/test-sh
index cdf94658bc7f3e8de0936d84dd1ecfc93fcc46c3..56ac25a9d4257668c2aedab19ecd8939afeaf1db 100755 (executable)
--- a/test-sh
+++ b/test-sh
@@ -16,14 +16,14 @@ bup init
 bup split --bench -b <testfile1 >tags1.tmp
 bup split -vvvv -b testfile2 >tags2.tmp
 bup split -t testfile2 >tags2t.tmp
-bup split -c testfile2 >tags2c.tmp
+bup split -r "$BUP_DIR" -c testfile2 >tags2c.tmp
 diff -u tags1.tmp tags2.tmp || true
 wc -c testfile1 testfile2
 wc -l tags1.tmp tags2.tmp
 bup join $(cat tags1.tmp) >out1.tmp
 bup join <tags2.tmp >out2.tmp
 bup join <tags2t.tmp >out2t.tmp
-bup join <tags2c.tmp >out2c.tmp
+bup join -r "$BUP_DIR" <tags2c.tmp >out2c.tmp
 diff -u testfile1 out1.tmp
 diff -u testfile2 out2.tmp
 diff -u testfile2 out2t.tmp