arthur.barton.de Git - bup.git/commitdiff
Split PackMidx from git.py into a new midx.py.
author: Avery Pennarun <apenwarr@gmail.com>
Thu, 17 Feb 2011 02:55:41 +0000 (18:55 -0800)
committer: Avery Pennarun <apenwarr@gmail.com>
Thu, 17 Feb 2011 02:59:30 +0000 (18:59 -0800)
git.py is definitely too big.  It still is, but this helps a bit.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
DESIGN
cmd/memtest-cmd.py
cmd/midx-cmd.py
lib/bup/git.py
lib/bup/midx.py [new file with mode: 0644]

diff --git a/DESIGN b/DESIGN
index 2ec570ced39e3b4bbb0baef6a8ea187484640a4f..3e4737a89b71b8d49ec8c99b0fb72703f621da82 100644 (file)
--- a/DESIGN
+++ b/DESIGN
@@ -281,7 +281,7 @@ they're written.
 But that leads us to our next problem.
 
 
-Huge numbers of huge packfiles (git.PackMidx, cmd/midx)
+Huge numbers of huge packfiles (midx.py, cmd/midx)
 ------------------------------
 
 Git isn't actually designed to handle super-huge repositories.  Most git
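diff --git a/cmd/memtest-cmd.py b/cmd/memtest-cmd.py
index d627caa193a09ea58787f6b988ba1fc646747d63..0e3cf0c839d6a86402685a8589a0e5c6acf5b1b9 100755 (executable)
--- a/cmd/memtest-cmd.py
+++ b/cmd/memtest-cmd.py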
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 import sys, re, struct, time, resource
-from bup import git, bloom, options, _helpers
+from bup import git, bloom, midx, options, _helpers
 from bup.helpers import *
 
 handle_ctrl_c()
@@ -107,6 +107,10 @@ if bloom._total_searches:
     print ('bloom: %d objects searched in %d steps: avg %.3f steps/object' 
            % (bloom._total_searches, bloom._total_steps,
               bloom._total_steps*1.0/bloom._total_searches))
+if midx._total_searches:
+    print ('midx: %d objects searched in %d steps: avg %.3f steps/object' 
+           % (midx._total_searches, midx._total_steps,
+              midx._total_steps*1.0/midx._total_searches))
 if git._total_searches:
     print ('idx: %d objects searched in %d steps: avg %.3f steps/object' 
            % (git._total_searches, git._total_steps,
diff --git a/cmd/midx-cmd.py b/cmd/midx-cmd.py
index b5fb77e3977b48b0945888a5367b1655ba2b6dd8..7a4940291545fcb8537e27b79793198aeade9dc7 100755 (executable)
--- a/cmd/midx-cmd.py
+++ b/cmd/midx-cmd.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 import sys, math, struct, glob, resource
 import tempfile
-from bup import options, git, _helpers
+from bup import options, git, midx, _helpers
 from bup.helpers import *
 
 PAGE_SIZE=4096
@@ -50,7 +50,7 @@ def _do_midx(outdir, outfilename, infilenames, prefixstr):
             ix.map,
             len(ix),
             ix.sha_ofs,
-            isinstance(ix, git.PackMidx) and ix.which_ofs or 0,
+            isinstance(ix, midx.PackMidx) and ix.which_ofs or 0,
             len(allfilenames),
         ))
         for n in ix.idxnames:
@@ -79,7 +79,7 @@ def _do_midx(outdir, outfilename, infilenames, prefixstr):
         pass
     f = open(outfilename + '.tmp', 'w+b')
     f.write('MIDX')
-    f.write(struct.pack('!II', git.MIDX_VERSION, bits))
+    f.write(struct.pack('!II', midx.MIDX_VERSION, bits))
     assert(f.tell() == 12)
 
     f.truncate(12 + 4*entries + 20*total + 4*total)
@@ -97,7 +97,7 @@ def _do_midx(outdir, outfilename, infilenames, prefixstr):
 
     # this is just for testing
     if 0:
-        p = git.PackMidx(outfilename)
+        p = midx.PackMidx(outfilename)
         assert(len(p.idxnames) == len(infilenames))
         print p.idxnames
         assert(len(p) == total)
diff --git a/lib/bup/git.py b/lib/bup/git.py
index f3774bcd12829cc7c8e218b94c5b403228c4cb3b..7d2d2ef2a4820cd952f1838d9cf598e099e1e1c2 100644 (file)
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -4,9 +4,8 @@ interact with the Git data structures.
 """
 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
 from bup.helpers import *
-from bup import _helpers, path, bloom
+from bup import _helpers, path, midx, bloom
 
-MIDX_VERSION = 4
 SEEK_END=2  # os.SEEK_END is not defined in python 2.4
 
 verbose = 0
@@ -270,108 +269,6 @@ class PackIdxV2(PackIdx):
             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 
 
-extract_bits = _helpers.extract_bits
-
-class PackMidx:
-    """Wrapper which contains data from multiple index files.
-    Multiple index (.midx) files constitute a wrapper around index (.idx) files
-    and make it possible for bup to expand Git's indexing capabilities to vast
-    amounts of files.
-    """
-    def __init__(self, filename):
-        self.name = filename
-        self.force_keep = False
-        assert(filename.endswith('.midx'))
-        self.map = mmap_read(open(filename))
-        if str(self.map[0:4]) != 'MIDX':
-            log('Warning: skipping: invalid MIDX header in %r\n' % filename)
-            self.force_keep = True
-            return self._init_failed()
-        ver = struct.unpack('!I', self.map[4:8])[0]
-        if ver < MIDX_VERSION:
-            log('Warning: ignoring old-style (v%d) midx %r\n' 
-                % (ver, filename))
-            self.force_keep = False  # old stuff is boring  
-            return self._init_failed()
-        if ver > MIDX_VERSION:
-            log('Warning: ignoring too-new (v%d) midx %r\n'
-                % (ver, filename))
-            self.force_keep = True  # new stuff is exciting
-            return self._init_failed()
-
-        self.bits = _helpers.firstword(self.map[8:12])
-        self.entries = 2**self.bits
-        self.fanout = buffer(self.map, 12, self.entries*4)
-        self.sha_ofs = 12 + self.entries*4
-        self.nsha = nsha = self._fanget(self.entries-1)
-        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
-        self.which_ofs = self.sha_ofs + 20*nsha
-        self.whichlist = buffer(self.map, self.which_ofs, nsha*4)
-        self.idxnames = str(self.map[self.which_ofs + 4*nsha:]).split('\0')
-
-    def _init_failed(self):
-        self.bits = 0
-        self.entries = 1
-        self.fanout = buffer('\0\0\0\0')
-        self.shatable = buffer('\0'*20)
-        self.idxnames = []
-
-    def _fanget(self, i):
-        start = i*4
-        s = self.fanout[start:start+4]
-        return _helpers.firstword(s)
-
-    def _get(self, i):
-        return str(self.shatable[i*20:(i+1)*20])
-
-    def _get_idx_i(self, i):
-        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
-
-    def _get_idxname(self, i):
-        return self.idxnames[self._get_idx_i(i)]
-
-    def exists(self, hash, want_source=False):
-        """Return nonempty if the object exists in the index files."""
-        global _total_searches, _total_steps
-        _total_searches += 1
-        want = str(hash)
-        el = extract_bits(want, self.bits)
-        if el:
-            start = self._fanget(el-1)
-            startv = el << (32-self.bits)
-        else:
-            start = 0
-            startv = 0
-        end = self._fanget(el)
-        endv = (el+1) << (32-self.bits)
-        _total_steps += 1   # lookup table is a step
-        hashv = _helpers.firstword(hash)
-        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
-        while start < end:
-            _total_steps += 1
-            #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
-            mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
-            #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
-            v = self._get(mid)
-            #print '    %08x' % self._num(v)
-            if v < want:
-                start = mid+1
-                startv = _helpers.firstword(v)
-            elif v > want:
-                end = mid
-                endv = _helpers.firstword(v)
-            else: # got it!
-                return want_source and self._get_idxname(mid) or True
-        return None
-
-    def __iter__(self):
-        for i in xrange(self._fanget(self.entries-1)):
-            yield buffer(self.shatable, i*20, 20)
-
-    def __len__(self):
-        return int(self._fanget(self.entries-1))
-
-
 _mpi_count = 0
 class PackIdxList:
     def __init__(self, dir):
@@ -402,11 +299,11 @@ class PackIdxList:
         _total_searches += 1
         if hash in self.also:
             return True
-        if self.do_bloom and self.bloom is not None:
-            _total_searches -= 1  # will be incremented by bloom
+        if self.do_bloom and self.bloom:
             if self.bloom.exists(hash):
                 self.do_bloom = False
             else:
+                _total_searches -= 1  # was counted by bloom
                 return None
         for i in xrange(len(self.packs)):
             p = self.packs[i]
@@ -435,17 +332,17 @@ class PackIdxList:
         self.do_bloom = False
         skip_midx = skip_midx or ignore_midx
         d = dict((p.name, p) for p in self.packs
-                 if not skip_midx or not isinstance(p, PackMidx))
+                 if not skip_midx or not isinstance(p, midx.PackMidx))
         if os.path.exists(self.dir):
             if not skip_midx:
                 midxl = []
                 for ix in self.packs:
-                    if isinstance(ix, PackMidx):
+                    if isinstance(ix, midx.PackMidx):
                         for name in ix.idxnames:
                             d[os.path.join(self.dir, name)] = ix
                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
                     if not d.get(full):
-                        mx = PackMidx(full)
+                        mx = midx.PackMidx(full)
                         (mxd, mxf) = os.path.split(mx.name)
                         broken = False
                         for n in mx.idxnames:
@@ -532,7 +429,7 @@ def open_idx(filename):
         else:
             raise GitError('%s: unrecognized idx file header' % filename)
     elif filename.endswith('.midx'):
-        return PackMidx(filename)
+        return midx.PackMidx(filename)
     else:
         raise GitError('idx filenames must end with .idx or .midx')
 
diff --git a/lib/bup/midx.py b/lib/bup/midx.py
new file mode 100644 (file)
index 0000000..3a06b73
--- /dev/null
+++ b/lib/bup/midx.py
@@ -0,0 +1,111 @@
+import mmap
+from bup import _helpers
+from bup.helpers import *
+
+MIDX_VERSION = 4
+
+extract_bits = _helpers.extract_bits
+_total_searches = 0
+_total_steps = 0
+
+
+class PackMidx:
+    """Wrapper which contains data from multiple index files.
+    Multiple index (.midx) files constitute a wrapper around index (.idx) files
+    and make it possible for bup to expand Git's indexing capabilities to vast
+    amounts of files.
+    """
+    def __init__(self, filename):
+        self.name = filename
+        self.force_keep = False
+        assert(filename.endswith('.midx'))
+        self.map = mmap_read(open(filename))
+        if str(self.map[0:4]) != 'MIDX':
+            log('Warning: skipping: invalid MIDX header in %r\n' % filename)
+            self.force_keep = True
+            return self._init_failed()
+        ver = struct.unpack('!I', self.map[4:8])[0]
+        if ver < MIDX_VERSION:
+            log('Warning: ignoring old-style (v%d) midx %r\n' 
+                % (ver, filename))
+            self.force_keep = False  # old stuff is boring  
+            return self._init_failed()
+        if ver > MIDX_VERSION:
+            log('Warning: ignoring too-new (v%d) midx %r\n'
+                % (ver, filename))
+            self.force_keep = True  # new stuff is exciting
+            return self._init_failed()
+
+        self.bits = _helpers.firstword(self.map[8:12])
+        self.entries = 2**self.bits
+        self.fanout = buffer(self.map, 12, self.entries*4)
+        self.sha_ofs = 12 + self.entries*4
+        self.nsha = nsha = self._fanget(self.entries-1)
+        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
+        self.which_ofs = self.sha_ofs + 20*nsha
+        self.whichlist = buffer(self.map, self.which_ofs, nsha*4)
+        self.idxnames = str(self.map[self.which_ofs + 4*nsha:]).split('\0')
+
+    def _init_failed(self):
+        self.bits = 0
+        self.entries = 1
+        self.fanout = buffer('\0\0\0\0')
+        self.shatable = buffer('\0'*20)
+        self.idxnames = []
+
+    def _fanget(self, i):
+        start = i*4
+        s = self.fanout[start:start+4]
+        return _helpers.firstword(s)
+
+    def _get(self, i):
+        return str(self.shatable[i*20:(i+1)*20])
+
+    def _get_idx_i(self, i):
+        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
+
+    def _get_idxname(self, i):
+        return self.idxnames[self._get_idx_i(i)]
+
+    def exists(self, hash, want_source=False):
+        """Return nonempty if the object exists in the index files."""
+        global _total_searches, _total_steps
+        _total_searches += 1
+        want = str(hash)
+        el = extract_bits(want, self.bits)
+        if el:
+            start = self._fanget(el-1)
+            startv = el << (32-self.bits)
+        else:
+            start = 0
+            startv = 0
+        end = self._fanget(el)
+        endv = (el+1) << (32-self.bits)
+        _total_steps += 1   # lookup table is a step
+        hashv = _helpers.firstword(hash)
+        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
+        while start < end:
+            _total_steps += 1
+            #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
+            mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
+            #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
+            v = self._get(mid)
+            #print '    %08x' % self._num(v)
+            if v < want:
+                start = mid+1
+                startv = _helpers.firstword(v)
+            elif v > want:
+                end = mid
+                endv = _helpers.firstword(v)
+            else: # got it!
+                return want_source and self._get_idxname(mid) or True
+        return None
+
+    def __iter__(self):
+        for i in xrange(self._fanget(self.entries-1)):
+            yield buffer(self.shatable, i*20, 20)
+
+    def __len__(self):
+        return int(self._fanget(self.entries-1))
+
+