midx4: midx2 with idx backreferences
author Brandon Low <lostlogic@lostlogicx.com>
Mon, 7 Feb 2011 06:06:08 +0000 (22:06 -0800)
committer Avery Pennarun <apenwarr@gmail.com>
Mon, 7 Feb 2011 09:31:49 +0000 (01:31 -0800)
Like midx3, this adds a lookup table of 4 bytes per entry to
reference an entry in the idxnames list.  2 bytes should be plenty, but
disk is cheap and the table will only be referenced when bup server gets
an object that's already in the midx.

Signed-off-by: Brandon Low <lostlogic@lostlogicx.com>
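
For orientation, the version-4 .midx file this change produces is laid out as: the 4-byte magic 'MIDX', the version and fanout bit count as two network-order 32-bit integers, a fanout table of 2**bits 4-byte entries, the 20-byte SHAs, the new 4-byte-per-object table of indices into idxnames, and finally the NUL-joined idx basenames. The sketch below only illustrates reading that layout back; it is not part of this commit, and the helper name and path argument are made up.

    # Illustrative Python 2 reader for the midx4 layout (not bup code).
    import mmap, struct

    def read_midx4(path):       # hypothetical helper; 'path' points at a .midx file
        f = open(path, 'rb')
        m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        assert(m[0:4] == 'MIDX')
        version, bits = struct.unpack('!II', m[4:12])
        assert(version == 4)
        entries = 2**bits
        sha_ofs = 12 + entries*4                              # fanout table ends here
        nsha = struct.unpack('!I', m[sha_ofs-4:sha_ofs])[0]   # last fanout entry
        which_ofs = sha_ofs + nsha*20                         # idx backreferences
        names_ofs = which_ofs + nsha*4
        idxnames = m[names_ofs:].split('\0')
        def idxname_for(i):                   # which .idx the i-th sha came from
            w = struct.unpack('!I', m[which_ofs + i*4 : which_ofs + (i+1)*4])[0]
            return idxnames[w]
        return nsha, idxnames, idxname_for

PackMidx in lib/bup/git.py below does the same resolution through its new whichlist buffer and _get_idxname().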
cmd/midx-cmd.py
cmd/server-cmd.py
lib/bup/git.py
lib/bup/helpers.py

diff --git a/cmd/midx-cmd.py b/cmd/midx-cmd.py
index 8b2ea2b3dff9d5397fc4ac977fbb07a24bfd8218..4aa20099505ab7eabec33bb721b818ed33aa83c5 100755 (executable)
--- a/cmd/midx-cmd.py
+++ b/cmd/midx-cmd.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import sys, math, struct, glob, resource
+import tempfile, shutil
 from bup import options, git
 from bup.helpers import *
 
@@ -31,13 +32,20 @@ def max_files():
     return mf
 
 
-def merge(idxlist, bits, table):
-    count = 0
-    for e in git.idxmerge(idxlist, final_progress=False):
-        count += 1
-        prefix = git.extract_bits(e, bits)
-        table[prefix] = count
-        yield e
+def merge_into(tf_sha, tf_nmap, idxlist, bits, entries, total):
+    prefix = 0
+    it = git.idxmerge(idxlist, final_progress=False, total=total)
+    for i, (e, idx) in enumerate(it):
+        new_prefix = git.extract_bits(e, bits)
+        if new_prefix != prefix:
+            for p in xrange(prefix, new_prefix):
+                yield i
+            prefix = new_prefix
+        tf_sha.write(e)
+        tf_nmap.write(struct.pack('!I', idx))
+    i += 1
+    for p in xrange(prefix, entries):
+        yield i
 
 
 def _do_midx(outdir, outfilename, infilenames, prefixstr):
@@ -48,12 +56,12 @@ def _do_midx(outdir, outfilename, infilenames, prefixstr):
     
     inp = []
     total = 0
-    allfilenames = {}
+    allfilenames = []
     for name in infilenames:
         ix = git.open_idx(name)
+        inp.append(ix.iter_with_idx_i(len(allfilenames)))
         for n in ix.idxnames:
-            allfilenames[n] = 1
-        inp.append(ix)
+            allfilenames.append(os.path.basename(n))
         total += len(ix)
 
     log('midx: %screating from %d files (%d objects).\n'
@@ -69,25 +77,32 @@ def _do_midx(outdir, outfilename, infilenames, prefixstr):
     entries = 2**bits
     debug1('midx: table size: %d (%d bits)\n' % (entries*4, bits))
     
-    table = [0]*entries
-
     try:
         os.unlink(outfilename)
     except OSError:
         pass
     f = open(outfilename + '.tmp', 'w+')
-    f.write('MIDX\0\0\0\2')
-    f.write(struct.pack('!I', bits))
+    f.write('MIDX')
+    f.write(struct.pack('!II', git.MIDX_VERSION, bits))
     assert(f.tell() == 12)
-    f.write('\0'*4*entries)
-    
-    for e in merge(inp, bits, table):
-        f.write(e)
-        
-    f.write('\0'.join(os.path.basename(p) for p in allfilenames.keys()))
 
-    f.seek(12)
-    f.write(struct.pack('!%dI' % entries, *table))
+    tf_sha = tempfile.TemporaryFile(dir=outdir)
+    tf_nmap = tempfile.TemporaryFile(dir=outdir)
+    for t in merge_into(tf_sha, tf_nmap, inp, bits, entries, total):
+        f.write(struct.pack('!I', t))
+    assert(f.tell() == 12 + 4*entries)
+
+    tf_sha.seek(0)
+    shutil.copyfileobj(tf_sha, f)
+    tf_sha.close()
+    assert(f.tell() == 12 + 4*entries + 20*t) # t may be < total due to dupes
+
+    tf_nmap.seek(0)
+    shutil.copyfileobj(tf_nmap, f)
+    tf_nmap.close()
+    assert(f.tell() == 12 + 4*entries + 24*t) # t may be < total due to dupes
+
+    f.write('\0'.join(allfilenames))
     f.close()
     os.rename(outfilename + '.tmp', outfilename)
 
@@ -97,12 +112,11 @@ def _do_midx(outdir, outfilename, infilenames, prefixstr):
         assert(len(p.idxnames) == len(infilenames))
         print p.idxnames
         assert(len(p) == total)
         pi = iter(p)
-        for i in merge(inp, total, bits, table):
-            assert(i == pi.next())
-            assert(p.exists(i))
+        for e, idx in git.idxmerge(inp, final_progress=False):
+            assert(str(e) == str(pi.next()))
+            assert(p.exists(str(e)))
 
-    return total,outfilename
+    return total, outfilename
 
 
 def do_midx(outdir, outfilename, infilenames, prefixstr):
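
The fanout values that merge_into yields above are easier to follow with tiny numbers. The toy below (illustration only, not bup code) shrinks a "sha" to one byte but keeps the same yielding pattern: each yielded value is the running object count for one fanout slot, so slot p ends up holding the number of objects whose prefix is <= p.

    # Toy model of merge_into's fanout emission (Python 2, illustration only).
    def toy_fanout(shas, bits):
        entries = 2**bits
        prefix = 0
        i = -1                              # so an empty input yields all zeros
        for i, sha in enumerate(shas):
            new_prefix = sha >> (8 - bits)  # top 'bits' bits of a one-byte "sha"
            if new_prefix != prefix:
                for p in xrange(prefix, new_prefix):
                    yield i                 # close out the slots below new_prefix
                prefix = new_prefix
        i += 1
        for p in xrange(prefix, entries):
            yield i                         # remaining slots hold the total count

    # prefixes (bits=2) of these toy shas: 0, 0, 1, 3  ->  fanout [2, 3, 3, 4]
    print list(toy_fanout([0x10, 0x2f, 0x40, 0xff], 2))

_do_midx writes these values straight into the 4*entries-byte table that follows the 12-byte header, which is what the assert(f.tell() == 12 + 4*entries) above checks.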
diff --git a/cmd/server-cmd.py b/cmd/server-cmd.py
index 3bc998bcfe7927e70332bc0c2f3a1c8774a1eb2d..a5e9abde18de074676fb4ddbc12a3ace734434f7 100755 (executable)
--- a/cmd/server-cmd.py
+++ b/cmd/server-cmd.py
@@ -89,18 +89,9 @@ def receive_objects_v2(conn, junk):
         #debug2('read %d bytes\n' % n)
         _check(w, n, len(buf), 'object read: expected %d bytes, got %d\n')
         if not dumb_server_mode:
-            oldpack = w.exists(shar)
+            oldpack = w.exists(shar, want_source=True)
             if oldpack:
-                if oldpack == True or oldpack.endswith('.midx'):
-                    # FIXME: we shouldn't really have to know about midx files
-                    # at this layer.  But exists() on a midx doesn't return the
-                    # packname (since it doesn't know)... probably we should
-                    # just fix that deficiency of midx files eventually,
-                    # although it'll make the files bigger.  This method is
-                    # certainly not very efficient.
-                    oldpack = w.objcache.packname_containing(shar)
-                    debug2('new suggestion: %r\n' % oldpack)
-                    w.objcache.refresh()
+                assert(not oldpack == True)
                 assert(oldpack.endswith('.idx'))
                 (dir,name) = os.path.split(oldpack)
                 if not (name in suggested):
diff --git a/lib/bup/git.py b/lib/bup/git.py
index 19fe9bc6b8682f1ff49db03e6374b83cffe36245..a1a75625c7007fbef8a7e8c0ec2539f74c6d9016 100644 (file)
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -6,7 +6,7 @@ import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
 from bup.helpers import *
 from bup import _helpers, path
 
-MIDX_VERSION = 2
+MIDX_VERSION = 4
 
 """Discussion of bloom constants for bup:
 
@@ -247,9 +247,11 @@ class PackIdx:
             return self._ofs_from_idx(idx)
         return None
 
-    def exists(self, hash):
+    def exists(self, hash, want_source=False):
         """Return nonempty if the object exists in this index."""
-        return hash and (self._idx_from_hash(hash) != None) and True or None
+        if hash and (self._idx_from_hash(hash) != None):
+            return want_source and self.name or True
+        return None
 
     def __len__(self):
         return int(self.fanout[255])
@@ -275,6 +277,10 @@ class PackIdx:
                 return mid
         return None
 
+    def iter_with_idx_i(self, idx_i):
+        for e in self:
+            yield e, idx_i
+
 
 class PackIdxV1(PackIdx):
     """Object representation of a Git pack index (version 1) file."""
@@ -475,9 +481,10 @@ class PackMidx:
         self.entries = 2**self.bits
         self.fanout = buffer(self.map, 12, self.entries*4)
         shaofs = 12 + self.entries*4
-        nsha = self._fanget(self.entries-1)
+        self.nsha = nsha = self._fanget(self.entries-1)
         self.shatable = buffer(self.map, shaofs, nsha*20)
-        self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
+        self.whichlist = buffer(self.map, shaofs + nsha*20, nsha*4)
+        self.idxnames = str(self.map[shaofs + 24*nsha:]).split('\0')
 
     def _init_failed(self):
         self.bits = 0
@@ -494,7 +501,13 @@ class PackMidx:
     def _get(self, i):
         return str(self.shatable[i*20:(i+1)*20])
 
-    def exists(self, hash):
+    def _get_idx_i(self, i):
+        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
+
+    def _get_idxname(self, i):
+        return self.idxnames[self._get_idx_i(i)]
+
+    def exists(self, hash, want_source=False):
         """Return nonempty if the object exists in the index files."""
         global _total_searches, _total_steps
         _total_searches += 1
@@ -525,9 +538,13 @@ class PackMidx:
                 end = mid
                 endv = _helpers.firstword(v)
             else: # got it!
-                return True
+                return want_source and self._get_idxname(mid) or True
         return None
 
+    def iter_with_idx_i(self, ofs):
+        for i in xrange(self._fanget(self.entries-1)):
+            yield buffer(self.shatable, i*20, 20), ofs+self._get_idx_i(i)
+
     def __iter__(self):
         for i in xrange(self._fanget(self.entries-1)):
             yield buffer(self.shatable, i*20, 20)
@@ -560,7 +577,7 @@ class PackIdxList:
     def __len__(self):
         return sum(len(pack) for pack in self.packs)
 
-    def exists(self, hash):
+    def exists(self, hash, want_source=False):
         """Return nonempty if the object exists in the index files."""
         global _total_searches
         _total_searches += 1
@@ -575,10 +592,11 @@ class PackIdxList:
         for i in xrange(len(self.packs)):
             p = self.packs[i]
             _total_searches -= 1  # will be incremented by sub-pack
-            if p.exists(hash):
+            ix = p.exists(hash, want_source=want_source)
+            if ix:
                 # reorder so most recently used packs are searched first
                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
-                return p.name
+                return ix
         self.do_bloom = True
         return None
 
@@ -658,21 +676,6 @@ class PackIdxList:
         debug1('PackIdxList: using %d index%s.\n'
             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 
-    def packname_containing(self, hash):
-        # figure out which pack contains a given hash.
-        # FIXME: if the midx file format would just *store* this information,
-        # we could calculate it a lot more efficiently.  But it's not needed
-        # often, so let's do it like this.
-        for f in glob.glob(os.path.join(self.dir,'*.idx')):
-            full = os.path.join(self.dir, f)
-            try:
-                ix = open_idx(full)
-            except GitError, e:
-                add_error(e)
-                continue
-            if ix.exists(hash):
-                return full
-
     def add(self, hash):
         """Insert an additional object in the list."""
         self.also.add(hash)
@@ -715,7 +718,7 @@ def open_idx(filename):
         raise GitError('idx filenames must end with .idx or .midx')
 
 
-def idxmerge(idxlist, final_progress=True):
+def idxmerge(idxlist, final_progress=True, total=None):
     """Generate a list of all the objects reachable in a PackIdxList."""
     def pfunc(count, total):
         progress('Reading indexes: %.2f%% (%d/%d)\r'
@@ -723,7 +726,7 @@ def idxmerge(idxlist, final_progress=True):
     def pfinal(count, total):
         if final_progress:
             log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
-    return merge_iter(idxlist, 10024, pfunc, pfinal)
+    return merge_iter(idxlist, 10024, pfunc, pfinal, total=total)
 
 
 def _make_objcache():
@@ -800,10 +803,10 @@ class PackWriter:
             raise GitError(
                     "PackWriter not opened or can't check exists w/o objcache")
 
-    def exists(self, id):
+    def exists(self, id, want_source=False):
         """Return non-empty if an object is found in the object cache."""
         self._require_objcache()
-        return self.objcache.exists(id)
+        return self.objcache.exists(id, want_source=want_source)
 
     def maybe_write(self, type, content):
         """Write an object to the pack file if not present and return its id."""
diff --git a/lib/bup/helpers.py b/lib/bup/helpers.py
index 7b5eeadafaa78e051083d9119ab687e6882d0f65..fdba7dd23f50a7b29a37ff06e623f552fc7ec07e 100644 (file)
--- a/lib/bup/helpers.py
+++ b/lib/bup/helpers.py
@@ -88,13 +88,13 @@ def next(it):
         return None
 
 
-def merge_iter(iters, pfreq, pfunc, pfinal, key=None):
+def merge_iter(iters, pfreq, pfunc, pfinal, key=None, total=None):
     if key:
         samekey = lambda e, pe: getattr(e, key) == getattr(pe, key, None)
     else:
         samekey = operator.eq
     count = 0
-    total = sum(len(it) for it in iters)
+    total = total or sum(len(it) for it in iters)
     iters = (iter(it) for it in iters)
     heap = ((next(it),it) for it in iters)
     heap = [(e,it) for e,it in heap if e]