From: Avery Pennarun Date: Fri, 27 Aug 2010 02:31:24 +0000 (-0700) Subject: git.py: when seeking inside a midx, use statistical guessing. X-Git-Tag: bup-0.18~11^2~2 X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2ae3f8d5f360daf55b0932b0cce7d3ccbe24c503;p=bup.git git.py: when seeking inside a midx, use statistical guessing. Instead of using a pure binary search (where we seek to the middle of the area and do a greater/lesser comparison) we now use an "interpolation search" (http://en.wikipedia.org/wiki/Interpolation_search), which means we seek to where we statistically *expect* the desired value to be. In my test data, this reduces the number of typical search steps in my test midx from 8.7 steps/object to 4.8 steps/object. This reduces memory churn when using a midx, since sometimes a given search region spans two pages, and this technique allows us to more quickly eliminate one of the two pages sometimes, allowing us to dirty one fewer page. Unfortunately the implementation requires some futzing, so this actually makes memtest run about 35% *slower*. Will try to fix that next. The original link to this algorithm came from this article: http://sna-projects.com/blog/2010/06/beating-binary-search/ Thanks, article! Signed-off-by: Avery Pennarun --- diff --git a/lib/bup/git.py b/lib/bup/git.py index f27e760..1887cc0 100644 --- a/lib/bup/git.py +++ b/lib/bup/git.py @@ -231,6 +231,12 @@ class PackMidx: s = self.fanout[start:start+4] return struct.unpack('!I', s)[0] + def _get(self, i): + return str(self.shalist[i*20:(i+1)*20]) + + def _num(self, hash): + return struct.unpack('!I', hash[:4])[0] + def exists(self, hash): """Return nonempty if the object exists in the index files.""" global _total_searches, _total_steps @@ -239,18 +245,28 @@ class PackMidx: el = extract_bits(want, self.bits) if el: start = self._fanget(el-1) + startv = el << (32-self.bits) else: start = 0 + startv = 0 end = self._fanget(el) + endv = (el+1) << (32-self.bits) _total_steps += 1 # lookup table is a step + hashv = self._num(hash) + #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv) while start < end: _total_steps += 1 - mid = start + (end-start)/2 - v = str(self.shalist[mid*20:(mid+1)*20]) + #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end) + mid = start + (hashv-startv)*(end-start-1)/(endv-startv) + #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end) + v = self._get(mid) + #print ' %08x' % self._num(v) if v < want: start = mid+1 + startv = self._num(v) elif v > want: end = mid + endv = self._num(v) else: # got it! return True return None