arthur.barton.de Git - bup.git/commitdiff
Write idxs directly rather than using git-index-pack.
author Brandon Low <lostlogic@lostlogicx.com>
Mon, 3 Jan 2011 03:40:51 +0000 (19:40 -0800)
committer Avery Pennarun <apenwarr@gmail.com>
Mon, 3 Jan 2011 04:31:37 +0000 (20:31 -0800)
Also add a test round trip on idx r/w.

(Rearranged by apenwarr mostly due to merge conflicts.)

Signed-off-by: Brandon Low <lostlogic@lostlogicx.com>
Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
cmd/server-cmd.py
lib/bup/client.py
lib/bup/git.py
lib/bup/t/tgit.py

index b7272f4db0efc23c6538385fa1607749889c0d47..03667a2e9825a776398478768167982387c4af14 100755 (executable)
@@ -67,15 +67,16 @@ def receive_objects(conn, junk):
             conn.ok()
             return
             
             conn.ok()
             return
             
-        sha = conn.read(20)
-        n -= 20
+        shar = conn.read(20)
+        crcr = struct.unpack('!I', conn.read(4))[0]
+        n -= 20 + 4
         buf = conn.read(n)  # object sizes in bup are reasonably small
         #debug2('read %d bytes\n' % n)
         if len(buf) < n:
             w.abort()
             raise Exception('object read: expected %d bytes, got %d\n'
                             % (n, len(buf)))
         buf = conn.read(n)  # object sizes in bup are reasonably small
         #debug2('read %d bytes\n' % n)
         if len(buf) < n:
             w.abort()
             raise Exception('object read: expected %d bytes, got %d\n'
                             % (n, len(buf)))
-        oldpack = w.exists(sha)
+        oldpack = w.exists(shar)
         # FIXME: we only suggest a single index per cycle, because the client
         # is currently too dumb to download more than one per cycle anyway.
         # Actually we should fix the client, but this is a minor optimization
         # FIXME: we only suggest a single index per cycle, because the client
         # is currently too dumb to download more than one per cycle anyway.
         # Actually we should fix the client, but this is a minor optimization
@@ -88,7 +89,7 @@ def receive_objects(conn, junk):
             # fix that deficiency of midx files eventually, although it'll
             # make the files bigger.  This method is certainly not very
             # efficient.
             # fix that deficiency of midx files eventually, although it'll
             # make the files bigger.  This method is certainly not very
             # efficient.
-            oldpack = w.objcache.packname_containing(sha)
+            oldpack = w.objcache.packname_containing(shar)
             debug2('new suggestion: %r\n' % oldpack)
             assert(oldpack)
             assert(oldpack != True)
             debug2('new suggestion: %r\n' % oldpack)
             assert(oldpack)
             assert(oldpack != True)
@@ -102,8 +103,16 @@ def receive_objects(conn, junk):
                 conn.write('index %s\n' % name)
                 suggested[name] = 1
         else:
                 conn.write('index %s\n' % name)
                 suggested[name] = 1
         else:
-            w._raw_write([buf])
+            nw, crc = w._raw_write([buf], sha=shar)
+            _check(w, crcr, crc, 'object read: expected crc %d, got %d\n')
+            _check(w, n, nw, 'object read: expected %d bytes, got %d\n')
     # NOTREACHED
     # NOTREACHED
+    
+
+def _check(w, expected, actual, msg):
+    if expected != actual:
+        w.abort()
+        raise Exception(msg % (expected, actual))
 
 
 def read_ref(conn, refname):
 
 
 def read_ref(conn, refname):
index f9d940e973eb56bc579530a384516224eff6ce16..aa9978ba5bacf4d7040285fbf9d30c7098a58c86 100644 (file)
@@ -1,4 +1,4 @@
-import re, struct, errno, time
+import re, struct, errno, time, zlib
 from bup import git, ssh
 from bup.helpers import *
 
 from bup import git, ssh
 from bup.helpers import *
 
@@ -271,18 +271,23 @@ class PackWriter_Remote(git.PackWriter):
     def abort(self):
         raise GitError("don't know how to abort remote pack writing")
 
     def abort(self):
         raise GitError("don't know how to abort remote pack writing")
 
-    def _raw_write(self, datalist, sha=''):
+    def _raw_write(self, datalist, sha):
         assert(self.file)
         if not self._packopen:
             self._open()
         if self.ensure_busy:
             self.ensure_busy()
         data = ''.join(datalist)
         assert(self.file)
         if not self._packopen:
             self._open()
         if self.ensure_busy:
             self.ensure_busy()
         data = ''.join(datalist)
-        assert(len(data))
-        outbuf = ''.join((struct.pack('!I', len(data)+len(sha)), sha, data))
+        assert(data)
+        assert(sha)
+        crc = zlib.crc32(data) & 0xffffffff
+        outbuf = ''.join((struct.pack('!I', len(data) + 20 + 4),
+                          sha,
+                          struct.pack('!I', crc),
+                          data))
         (self._bwcount, self._bwtime) = \
             _raw_write_bwlimit(self.file, outbuf, self._bwcount, self._bwtime)
         (self._bwcount, self._bwtime) = \
             _raw_write_bwlimit(self.file, outbuf, self._bwcount, self._bwtime)
-        self.outbytes += len(data)
+        self.outbytes += len(data) - 20 - 4 # Don't count sha1+crc
         self.count += 1
 
         if self.file.has_input():
         self.count += 1
 
         if self.file.has_input():
@@ -293,10 +298,4 @@ class PackWriter_Remote(git.PackWriter):
                 self.suggest_pack(idxname)
                 self.objcache.refresh()
 
                 self.suggest_pack(idxname)
                 self.objcache.refresh()
 
-    def _write(self, bin, type, content):
-        if git.verbose:
-            log('>')
-        sha = git.calc_hash(type, content)
-        enc = git._encode_packobj(type, content)
-        self._raw_write(enc, sha=sha)
-        return bin
+        return sha, crc
index 4ca31d76376f35a2166cf2fabf7185ee56dae17d..5f334133fbf68e72d2ecd073aef522136fbb0fcf 100644 (file)
@@ -2,8 +2,7 @@
 bup repositories are in Git format. This library allows us to
 interact with the Git data structures.
 """
 bup repositories are in Git format. This library allows us to
 interact with the Git data structures.
 """
-import os, zlib, time, subprocess, struct, stat, re, tempfile
-import heapq
+import os, zlib, time, subprocess, struct, stat, re, tempfile, heapq
 from bup.helpers import *
 from bup import _helpers
 
 from bup.helpers import *
 from bup import _helpers
 
@@ -140,7 +139,7 @@ def _decode_packobj(buf):
 class PackIdx:
     def __init__(self):
         assert(0)
 class PackIdx:
     def __init__(self):
         assert(0)
-    
+
     def find_offset(self, hash):
         """Get the offset of an object inside the index file."""
         idx = self._idx_from_hash(hash)
     def find_offset(self, hash):
         """Get the offset of an object inside the index file."""
         idx = self._idx_from_hash(hash)
@@ -222,7 +221,7 @@ class PackIdxV2(PackIdx):
         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
         if ofs & 0x80000000:
             idx64 = ofs & 0x7fffffff
         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
         if ofs & 0x80000000:
             idx64 = ofs & 0x7fffffff
-            ofs = struct.unpack('!I',
+            ofs = struct.unpack('!Q',
                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
         return ofs
 
                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
         return ofs
 
@@ -528,6 +527,7 @@ class PackWriter:
         self.outbytes = 0
         self.filename = None
         self.file = None
         self.outbytes = 0
         self.filename = None
         self.file = None
+        self.idx = None
         self.objcache_maker = objcache_maker
         self.objcache = None
 
         self.objcache_maker = objcache_maker
         self.objcache = None
 
@@ -549,8 +549,11 @@ class PackWriter:
             assert(name.endswith('.pack'))
             self.filename = name[:-5]
             self.file.write('PACK\0\0\0\2\0\0\0\0')
             assert(name.endswith('.pack'))
             self.filename = name[:-5]
             self.file.write('PACK\0\0\0\2\0\0\0\0')
+            self.idx = list(list() for i in xrange(256))
 
 
-    def _raw_write(self, datalist):
+    # the 'sha' parameter is used in client.py's _raw_write(), but not needed
+    # in this basic version.
+    def _raw_write(self, datalist, sha):
         self._open()
         f = self.file
         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
         self._open()
         f = self.file
         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
@@ -560,14 +563,25 @@ class PackWriter:
         # but that's okay because we'll flush it in _end().
         oneblob = ''.join(datalist)
         f.write(oneblob)
         # but that's okay because we'll flush it in _end().
         oneblob = ''.join(datalist)
         f.write(oneblob)
-        self.outbytes += len(oneblob)
+        nw = len(oneblob)
+        crc = zlib.crc32(oneblob) & 0xffffffff
+        self._update_idx(sha, crc, nw)
+        self.outbytes += nw
         self.count += 1
         self.count += 1
+        return nw, crc
+
+    def _update_idx(self, sha, crc, size):
+        assert(sha)
+        if self.idx:
+            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 
 
-    def _write(self, bin, type, content):
+    def _write(self, sha, type, content):
         if verbose:
             log('>')
         if verbose:
             log('>')
-        self._raw_write(_encode_packobj(type, content))
-        return bin
+        if not sha:
+            sha = calc_hash(type, content)
+        size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
+        return sha
 
     def breakpoint(self):
         """Clear byte and object counts and return the last processed id."""
 
     def breakpoint(self):
         """Clear byte and object counts and return the last processed id."""
@@ -587,11 +601,11 @@ class PackWriter:
 
     def maybe_write(self, type, content):
         """Write an object to the pack file if not present and return its id."""
 
     def maybe_write(self, type, content):
         """Write an object to the pack file if not present and return its id."""
-        bin = calc_hash(type, content)
-        if not self.exists(bin):
-            self._write(bin, type, content)
-            self.objcache.add(bin)
-        return bin
+        sha = calc_hash(type, content)
+        if not self.exists(sha):
+            self._write(sha, type, content)
+            self.objcache.add(sha)
+        return sha
 
     def new_blob(self, blob):
         """Create a blob object in the pack with the supplied content."""
 
     def new_blob(self, blob):
         """Create a blob object in the pack with the supplied content."""
@@ -632,6 +646,7 @@ class PackWriter:
         """Remove the pack file from disk."""
         f = self.file
         if f:
         """Remove the pack file from disk."""
         f = self.file
         if f:
+            self.idx = None
             self.file = None
             f.close()
             os.unlink(self.filename + '.pack')
             self.file = None
             f.close()
             os.unlink(self.filename + '.pack')
@@ -641,6 +656,8 @@ class PackWriter:
         if not f: return None
         self.file = None
         self.objcache = None
         if not f: return None
         self.file = None
         self.objcache = None
+        idx = self.idx
+        self.idx = None
 
         # update object count
         f.seek(8)
 
         # update object count
         f.seek(8)
@@ -653,19 +670,15 @@ class PackWriter:
         sum = Sha1()
         for b in chunkyreader(f):
             sum.update(b)
         sum = Sha1()
         for b in chunkyreader(f):
             sum.update(b)
-        f.write(sum.digest())
+        packbin = sum.digest()
+        f.write(packbin)
         f.close()
 
         f.close()
 
-        p = subprocess.Popen(['git', 'index-pack', '-v',
-                              '--index-version=2',
-                              self.filename + '.pack'],
-                             preexec_fn = _gitenv,
-                             stdout = subprocess.PIPE)
-        out = p.stdout.read().strip()
-        _git_wait('git index-pack', p)
-        if not out:
-            raise GitError('git index-pack produced no output')
-        nameprefix = repo('objects/pack/%s' % out)
+        idx_f = open(self.filename + '.idx', 'wb')
+        obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
+        idx_f.close()
+
+        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
         if os.path.exists(self.filename + '.map'):
             os.unlink(self.filename + '.map')
         os.rename(self.filename + '.pack', nameprefix + '.pack')
         if os.path.exists(self.filename + '.map'):
             os.unlink(self.filename + '.map')
         os.rename(self.filename + '.pack', nameprefix + '.pack')
@@ -678,6 +691,44 @@ class PackWriter:
         """Close the pack file and move it to its definitive path."""
         return self._end()
 
         """Close the pack file and move it to its definitive path."""
         return self._end()
 
+    def _write_pack_idx_v2(self, file, idx, packbin):
+        sum = Sha1()
+
+        def write(data):
+            file.write(data)
+            sum.update(data)
+
+        write('\377tOc\0\0\0\2')
+
+        n = 0
+        for part in idx:
+            n += len(part)
+            write(struct.pack('!i', n))
+            part.sort(key=lambda x: x[0])
+
+        obj_list_sum = Sha1()
+        for part in idx:
+            for entry in part:
+                write(entry[0])
+                obj_list_sum.update(entry[0])
+        for part in idx:
+            for entry in part:
+                write(struct.pack('!I', entry[1]))
+        ofs64_list = []
+        for part in idx:
+            for entry in part:
+                if entry[2] & 0x80000000:
+                    write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
+                    ofs64_list.append(struct.pack('!Q', entry[2]))
+                else:
+                    write(struct.pack('!i', entry[2]))
+        for ofs64 in ofs64_list:
+            write(ofs64)
+
+        write(packbin)
+        file.write(sum.digest())
+        return obj_list_sum.hexdigest()
+
 
 def _git_date(date):
     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 
 def _git_date(date):
     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
index c61b351881201b4bee4c2e479056c7a50332b92a..fad720b06e413acf961ff9ee98bda1277b845b24 100644 (file)
@@ -1,4 +1,4 @@
-import time
+import struct, os, tempfile, time
 from bup import git
 from bup.helpers import *
 from wvtest import *
 from bup import git
 from bup.helpers import *
 from wvtest import *
@@ -88,3 +88,29 @@ def testpacks():
     WVPASS(r.exists(hashes[5]))
     WVPASS(r.exists(hashes[6]))
     WVFAIL(r.exists('\0'*20))
     WVPASS(r.exists(hashes[5]))
     WVPASS(r.exists(hashes[6]))
     WVFAIL(r.exists('\0'*20))
+
+@wvtest
+def test_long_index():
+    w = git.PackWriter()
+    obj_bin = struct.pack('!IIIII',
+            0x00112233, 0x44556677, 0x88990011, 0x22334455, 0x66778899)
+    obj2_bin = struct.pack('!IIIII',
+            0x11223344, 0x55667788, 0x99001122, 0x33445566, 0x77889900)
+    obj3_bin = struct.pack('!IIIII',
+            0x22334455, 0x66778899, 0x00112233, 0x44556677, 0x88990011)
+    pack_bin = struct.pack('!IIIII',
+            0x99887766, 0x55443322, 0x11009988, 0x77665544, 0x33221100)
+    idx = list(list() for i in xrange(256))
+    idx[0].append((obj_bin, 1, 0xfffffffff))
+    idx[0x11].append((obj2_bin, 2, 0xffffffffff))
+    idx[0x22].append((obj3_bin, 3, 0xff))
+    (fd,name) = tempfile.mkstemp(suffix='.idx', dir=git.repo('objects'))
+    f = os.fdopen(fd, 'w+b')
+    r = w._write_pack_idx_v2(f, idx, pack_bin)
+    f.seek(0)
+    i = git.PackIdxV2(name, f)
+    WVPASS(i.find_offset(obj_bin)==0xfffffffff)
+    WVPASS(i.find_offset(obj2_bin)==0xffffffffff)
+    WVPASS(i.find_offset(obj3_bin)==0xff)
+    f.close()
+    os.remove(name)