arthur.barton.de Git - bup.git/commitdiff
git.PackIndex: a class for quickly searching a git packfile index.
author: Avery Pennarun <apenwarr@gmail.com>
Sun, 3 Jan 2010 02:10:17 +0000 (21:10 -0500)
committer: Avery Pennarun <apenwarr@gmail.com>
Sun, 3 Jan 2010 02:29:33 +0000 (21:29 -0500)
This will allow us to generate incremental backups more efficiently, since
we can avoid rewriting already-known objects into a new pack.

git.py
t/tgit.py [new file with mode: 0644]

diff --git a/git.py b/git.py
index 1ff1b74325e54012fb767305447cb657d3c8391e..98fff3006cf8ce9fbfd2989144b88037895c1c99 100644 (file)
--- a/git.py
+++ b/git.py
@@ -1,7 +1,75 @@
-import os, errno, zlib, time, sha, subprocess, struct
+import os, errno, zlib, time, sha, subprocess, struct, mmap
 from helpers import *
 
 
 from helpers import *
 
 
+class PackIndex:
+    def __init__(self, filename):
+        self.name = filename
+        f = open(filename)
+        self.map = mmap.mmap(f.fileno(), 0,
+                             mmap.MAP_SHARED, mmap.PROT_READ)
+        f.close()  # map will persist beyond file close
+        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
+        self.fanout = list(struct.unpack('!256I', buffer(self.map, 8, 256*4)))
+        self.fanout.append(0)  # entry "-1"
+        nsha = self.fanout[255]
+        self.ofstable = buffer(self.map,
+                               8 + 256*4 + nsha*20 + nsha*4,
+                               nsha*4)
+        self.ofs64table = buffer(self.map,
+                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
+
+    def _ofs_from_idx(self, idx):
+        ofs = struct.unpack('!I', buffer(self.ofstable, idx*4, 4))[0]
+        if ofs & 0x80000000:
+            idx64 = ofs & 0x7fffffff
+            ofs = struct.unpack('!I', buffer(self.ofs64table, idx64*8, 8))[0]
+        return ofs
+
+    def _idx_from_hash(self, hash):
+        assert(len(hash) == 20)
+        b1 = ord(hash[0])
+        start = self.fanout[b1-1] # range -1..254
+        end = self.fanout[b1] # range 0..255
+        buf = buffer(self.map, 8 + 256*4, end*20)
+        want = buffer(hash)
+        while start < end:
+            mid = start + (end-start)/2
+            v = buffer(buf, mid*20, 20)
+            if v < want:
+                start = mid+1
+            elif v > want:
+                end = mid
+            else: # got it!
+                return mid
+        return None
+    def find_offset(self, hash):
+        idx = self._idx_from_hash(hash)
+        if idx != None:
+            return self._ofs_from_idx(idx)
+        return None
+
+    def exists(self, hash):
+        return (self._idx_from_hash(hash) != None) and True or None
+
+
+class MultiPackIndex:
+    def __init__(self, dir):
+        self.packs = []
+        for f in os.listdir(dir):
+            if f.endswith('.idx'):
+                self.packs.append(PackIndex(os.path.join(dir, f)))
+
+    def exists(self, hash):
+        for i in range(len(self.packs)):
+            p = self.packs[i]
+            if p.exists(hash):
+                # reorder so most recently used packs are searched first
+                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
+                return True
+        return None
+
+
 def _old_write_object(bin, type, content):
     hex = bin.encode('hex')
     header = '%s %d\0' % (type, len(content))
 def _old_write_object(bin, type, content):
     hex = bin.encode('hex')
     header = '%s %d\0' % (type, len(content))
@@ -24,6 +92,13 @@ def _old_write_object(bin, type, content):
         os.rename(tfn, fn)
 
 
         os.rename(tfn, fn)
 
 
+def calc_hash(type, content):
+    header = '%s %d\0' % (type, len(content))
+    sum = sha.sha(header)
+    sum.update(content)
+    return sum.digest()
+
+
 _typemap = dict(blob=3, tree=2, commit=1, tag=8)
 class PackWriter:
     def __init__(self):
 _typemap = dict(blob=3, tree=2, commit=1, tag=8)
 class PackWriter:
     def __init__(self):
@@ -54,6 +129,10 @@ class PackWriter:
 
         self.count += 1
         self.binlist.append(bin)
 
         self.count += 1
         self.binlist.append(bin)
+        return bin
+
+    def easy_write(self, type, content):
+        return self.write(calc_hash(type, content), type, content)
 
     def close(self):
         f = self.file
 
     def close(self):
         f = self.file
@@ -82,8 +161,11 @@ class PackWriter:
         out = p.stdout.read().strip()
         if p.wait() or not out:
             raise Exception('git index-pack returned an error')
         out = p.stdout.read().strip()
         if p.wait() or not out:
             raise Exception('git index-pack returned an error')
-        os.rename(self.filename + '.pack', '.git/objects/pack/%s.pack' % out)
-        os.rename(self.filename + '.idx', '.git/objects/pack/%s.idx' % out)
+        nameprefix = '.git/objects/pack/%s' % out
+        os.rename(self.filename + '.pack', nameprefix + '.pack')
+        os.rename(self.filename + '.idx', nameprefix + '.idx')
+        return nameprefix
+
 
 _packout = None
 def _write_object(bin, type, content):
 
 _packout = None
 def _write_object(bin, type, content):
@@ -102,11 +184,8 @@ def flush_pack():
 _objcache = {}
 def hash_raw(type, s):
     global _objcache
 _objcache = {}
 def hash_raw(type, s):
     global _objcache
-    header = '%s %d\0' % (type, len(s))
-    sum = sha.sha(header)
-    sum.update(s)
-    bin = sum.digest()
-    hex = sum.hexdigest()
+    bin = calc_hash(type, s)
+    hex = bin.encode('hex')
     if bin in _objcache:
         return hex
     else:
     if bin in _objcache:
         return hex
     else:
diff --git a/t/tgit.py b/t/tgit.py
new file mode 100644 (file)
index 0000000..8c927a1
--- /dev/null
+++ b/t/tgit.py
@@ -0,0 +1,29 @@
+import git
+from wvtest import *
+
+
+@wvtest
+def testpacks():
+    w = git.PackWriter()
+    hashes = []
+    for i in range(1000):
+        hashes.append(w.easy_write('blob', str(i)))
+    nameprefix = w.close()
+    print repr(nameprefix)
+    WVPASS(os.path.exists(nameprefix + '.pack'))
+    WVPASS(os.path.exists(nameprefix + '.idx'))
+
+    r = git.PackIndex(nameprefix + '.idx')
+    print repr(r.fanout)
+
+    for i in range(1000):
+        WVPASS(r.find_offset(hashes[i]) > 0)
+    WVPASS(r.exists(hashes[99]))
+    WVFAIL(r.exists('\0'*20))
+
+    WVFAIL(r.find_offset('\0'*20))
+
+    r = git.MultiPackIndex('.git/objects/pack')
+    WVPASS(r.exists(hashes[5]))
+    WVPASS(r.exists(hashes[6]))
+    WVFAIL(r.exists('\0'*20))