]> arthur.barton.de Git - bup.git/commitdiff
Write idxs directly rather than using git-index-pack.
authorBrandon Low <lostlogic@lostlogicx.com>
Mon, 3 Jan 2011 03:40:51 +0000 (19:40 -0800)
committerAvery Pennarun <apenwarr@gmail.com>
Mon, 3 Jan 2011 04:31:37 +0000 (20:31 -0800)
Also add a test round trip on idx r/w.

(Rearranged by apenwarr mostly due to merge conflicts.)

Signed-off-by: Brandon Low <lostlogic@lostlogicx.com>
Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
cmd/server-cmd.py
lib/bup/client.py
lib/bup/git.py
lib/bup/t/tgit.py

index b7272f4db0efc23c6538385fa1607749889c0d47..03667a2e9825a776398478768167982387c4af14 100755 (executable)
@@ -67,15 +67,16 @@ def receive_objects(conn, junk):
             conn.ok()
             return
             
-        sha = conn.read(20)
-        n -= 20
+        shar = conn.read(20)
+        crcr = struct.unpack('!I', conn.read(4))[0]
+        n -= 20 + 4
         buf = conn.read(n)  # object sizes in bup are reasonably small
         #debug2('read %d bytes\n' % n)
         if len(buf) < n:
             w.abort()
             raise Exception('object read: expected %d bytes, got %d\n'
                             % (n, len(buf)))
-        oldpack = w.exists(sha)
+        oldpack = w.exists(shar)
         # FIXME: we only suggest a single index per cycle, because the client
         # is currently too dumb to download more than one per cycle anyway.
         # Actually we should fix the client, but this is a minor optimization
@@ -88,7 +89,7 @@ def receive_objects(conn, junk):
             # fix that deficiency of midx files eventually, although it'll
             # make the files bigger.  This method is certainly not very
             # efficient.
-            oldpack = w.objcache.packname_containing(sha)
+            oldpack = w.objcache.packname_containing(shar)
             debug2('new suggestion: %r\n' % oldpack)
             assert(oldpack)
             assert(oldpack != True)
@@ -102,8 +103,16 @@ def receive_objects(conn, junk):
                 conn.write('index %s\n' % name)
                 suggested[name] = 1
         else:
-            w._raw_write([buf])
+            nw, crc = w._raw_write([buf], sha=shar)
+            _check(w, crcr, crc, 'object read: expected crc %d, got %d\n')
+            _check(w, n, nw, 'object read: expected %d bytes, got %d\n')
     # NOTREACHED
+    
+
+def _check(w, expected, actual, msg):
+    if expected != actual:
+        w.abort()
+        raise Exception(msg % (expected, actual))
 
 
 def read_ref(conn, refname):
index f9d940e973eb56bc579530a384516224eff6ce16..aa9978ba5bacf4d7040285fbf9d30c7098a58c86 100644 (file)
@@ -1,4 +1,4 @@
-import re, struct, errno, time
+import re, struct, errno, time, zlib
 from bup import git, ssh
 from bup.helpers import *
 
@@ -271,18 +271,23 @@ class PackWriter_Remote(git.PackWriter):
     def abort(self):
         raise GitError("don't know how to abort remote pack writing")
 
-    def _raw_write(self, datalist, sha=''):
+    def _raw_write(self, datalist, sha):
         assert(self.file)
         if not self._packopen:
             self._open()
         if self.ensure_busy:
             self.ensure_busy()
         data = ''.join(datalist)
-        assert(len(data))
-        outbuf = ''.join((struct.pack('!I', len(data)+len(sha)), sha, data))
+        assert(data)
+        assert(sha)
+        crc = zlib.crc32(data) & 0xffffffff
+        outbuf = ''.join((struct.pack('!I', len(data) + 20 + 4),
+                          sha,
+                          struct.pack('!I', crc),
+                          data))
         (self._bwcount, self._bwtime) = \
             _raw_write_bwlimit(self.file, outbuf, self._bwcount, self._bwtime)
-        self.outbytes += len(data)
+        self.outbytes += len(data) - 20 - 4 # Don't count sha1+crc
         self.count += 1
 
         if self.file.has_input():
@@ -293,10 +298,4 @@ class PackWriter_Remote(git.PackWriter):
                 self.suggest_pack(idxname)
                 self.objcache.refresh()
 
-    def _write(self, bin, type, content):
-        if git.verbose:
-            log('>')
-        sha = git.calc_hash(type, content)
-        enc = git._encode_packobj(type, content)
-        self._raw_write(enc, sha=sha)
-        return bin
+        return sha, crc
index 4ca31d76376f35a2166cf2fabf7185ee56dae17d..5f334133fbf68e72d2ecd073aef522136fbb0fcf 100644 (file)
@@ -2,8 +2,7 @@
 bup repositories are in Git format. This library allows us to
 interact with the Git data structures.
 """
-import os, zlib, time, subprocess, struct, stat, re, tempfile
-import heapq
+import os, zlib, time, subprocess, struct, stat, re, tempfile, heapq
 from bup.helpers import *
 from bup import _helpers
 
@@ -140,7 +139,7 @@ def _decode_packobj(buf):
 class PackIdx:
     def __init__(self):
         assert(0)
-    
+
     def find_offset(self, hash):
         """Get the offset of an object inside the index file."""
         idx = self._idx_from_hash(hash)
@@ -222,7 +221,7 @@ class PackIdxV2(PackIdx):
         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
         if ofs & 0x80000000:
             idx64 = ofs & 0x7fffffff
-            ofs = struct.unpack('!I',
+            ofs = struct.unpack('!Q',
                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
         return ofs
 
@@ -528,6 +527,7 @@ class PackWriter:
         self.outbytes = 0
         self.filename = None
         self.file = None
+        self.idx = None
         self.objcache_maker = objcache_maker
         self.objcache = None
 
@@ -549,8 +549,11 @@ class PackWriter:
             assert(name.endswith('.pack'))
             self.filename = name[:-5]
             self.file.write('PACK\0\0\0\2\0\0\0\0')
+            self.idx = list(list() for i in xrange(256))
 
-    def _raw_write(self, datalist):
+    # the 'sha' parameter is used in client.py's _raw_write(), but not needed
+    # in this basic version.
+    def _raw_write(self, datalist, sha):
         self._open()
         f = self.file
         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
@@ -560,14 +563,25 @@ class PackWriter:
         # but that's okay because we'll flush it in _end().
         oneblob = ''.join(datalist)
         f.write(oneblob)
-        self.outbytes += len(oneblob)
+        nw = len(oneblob)
+        crc = zlib.crc32(oneblob) & 0xffffffff
+        self._update_idx(sha, crc, nw)
+        self.outbytes += nw
         self.count += 1
+        return nw, crc
+
+    def _update_idx(self, sha, crc, size):
+        assert(sha)
+        if self.idx:
+            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 
-    def _write(self, bin, type, content):
+    def _write(self, sha, type, content):
         if verbose:
             log('>')
-        self._raw_write(_encode_packobj(type, content))
-        return bin
+        if not sha:
+            sha = calc_hash(type, content)
+        size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
+        return sha
 
     def breakpoint(self):
         """Clear byte and object counts and return the last processed id."""
@@ -587,11 +601,11 @@ class PackWriter:
 
     def maybe_write(self, type, content):
         """Write an object to the pack file if not present and return its id."""
-        bin = calc_hash(type, content)
-        if not self.exists(bin):
-            self._write(bin, type, content)
-            self.objcache.add(bin)
-        return bin
+        sha = calc_hash(type, content)
+        if not self.exists(sha):
+            self._write(sha, type, content)
+            self.objcache.add(sha)
+        return sha
 
     def new_blob(self, blob):
         """Create a blob object in the pack with the supplied content."""
@@ -632,6 +646,7 @@ class PackWriter:
         """Remove the pack file from disk."""
         f = self.file
         if f:
+            self.idx = None
             self.file = None
             f.close()
             os.unlink(self.filename + '.pack')
@@ -641,6 +656,8 @@ class PackWriter:
         if not f: return None
         self.file = None
         self.objcache = None
+        idx = self.idx
+        self.idx = None
 
         # update object count
         f.seek(8)
@@ -653,19 +670,15 @@ class PackWriter:
         sum = Sha1()
         for b in chunkyreader(f):
             sum.update(b)
-        f.write(sum.digest())
+        packbin = sum.digest()
+        f.write(packbin)
         f.close()
 
-        p = subprocess.Popen(['git', 'index-pack', '-v',
-                              '--index-version=2',
-                              self.filename + '.pack'],
-                             preexec_fn = _gitenv,
-                             stdout = subprocess.PIPE)
-        out = p.stdout.read().strip()
-        _git_wait('git index-pack', p)
-        if not out:
-            raise GitError('git index-pack produced no output')
-        nameprefix = repo('objects/pack/%s' % out)
+        idx_f = open(self.filename + '.idx', 'wb')
+        obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
+        idx_f.close()
+
+        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
         if os.path.exists(self.filename + '.map'):
             os.unlink(self.filename + '.map')
         os.rename(self.filename + '.pack', nameprefix + '.pack')
@@ -678,6 +691,44 @@ class PackWriter:
         """Close the pack file and move it to its definitive path."""
         return self._end()
 
+    def _write_pack_idx_v2(self, file, idx, packbin):
+        sum = Sha1()
+
+        def write(data):
+            file.write(data)
+            sum.update(data)
+
+        write('\377tOc\0\0\0\2')
+
+        n = 0
+        for part in idx:
+            n += len(part)
+            write(struct.pack('!i', n))
+            part.sort(key=lambda x: x[0])
+
+        obj_list_sum = Sha1()
+        for part in idx:
+            for entry in part:
+                write(entry[0])
+                obj_list_sum.update(entry[0])
+        for part in idx:
+            for entry in part:
+                write(struct.pack('!I', entry[1]))
+        ofs64_list = []
+        for part in idx:
+            for entry in part:
+                if entry[2] & 0x80000000:
+                    write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
+                    ofs64_list.append(struct.pack('!Q', entry[2]))
+                else:
+                    write(struct.pack('!i', entry[2]))
+        for ofs64 in ofs64_list:
+            write(ofs64)
+
+        write(packbin)
+        file.write(sum.digest())
+        return obj_list_sum.hexdigest()
+
 
 def _git_date(date):
     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
index c61b351881201b4bee4c2e479056c7a50332b92a..fad720b06e413acf961ff9ee98bda1277b845b24 100644 (file)
@@ -1,4 +1,4 @@
-import time
+import struct, os, tempfile, time
 from bup import git
 from bup.helpers import *
 from wvtest import *
@@ -88,3 +88,29 @@ def testpacks():
     WVPASS(r.exists(hashes[5]))
     WVPASS(r.exists(hashes[6]))
     WVFAIL(r.exists('\0'*20))
+
+@wvtest
+def test_long_index():
+    w = git.PackWriter()
+    obj_bin = struct.pack('!IIIII',
+            0x00112233, 0x44556677, 0x88990011, 0x22334455, 0x66778899)
+    obj2_bin = struct.pack('!IIIII',
+            0x11223344, 0x55667788, 0x99001122, 0x33445566, 0x77889900)
+    obj3_bin = struct.pack('!IIIII',
+            0x22334455, 0x66778899, 0x00112233, 0x44556677, 0x88990011)
+    pack_bin = struct.pack('!IIIII',
+            0x99887766, 0x55443322, 0x11009988, 0x77665544, 0x33221100)
+    idx = list(list() for i in xrange(256))
+    idx[0].append((obj_bin, 1, 0xfffffffff))
+    idx[0x11].append((obj2_bin, 2, 0xffffffffff))
+    idx[0x22].append((obj3_bin, 3, 0xff))
+    (fd,name) = tempfile.mkstemp(suffix='.idx', dir=git.repo('objects'))
+    f = os.fdopen(fd, 'w+b')
+    r = w._write_pack_idx_v2(f, idx, pack_bin)
+    f.seek(0)
+    i = git.PackIdxV2(name, f)
+    WVPASS(i.find_offset(obj_bin)==0xfffffffff)
+    WVPASS(i.find_offset(obj2_bin)==0xffffffffff)
+    WVPASS(i.find_offset(obj3_bin)==0xff)
+    f.close()
+    os.remove(name)