From: Brandon Low Date: Mon, 3 Jan 2011 03:40:51 +0000 (-0800) Subject: Write idxs directly rather than using git-index-pack. X-Git-Tag: bup-0.21-rc1^5~1 X-Git-Url: https://arthur.barton.de/gitweb/?p=bup.git;a=commitdiff_plain;h=4cab9ab71fff18d1841fb5964ba092979f7a9ddf Write idxs directly rather than using git-index-pack. Also add a test round trip on idx r/w. (Rearranged by apenwarr mostly due to merge conflicts.) Signed-off-by: Brandon Low Signed-off-by: Avery Pennarun --- diff --git a/cmd/server-cmd.py b/cmd/server-cmd.py index b7272f4..03667a2 100755 --- a/cmd/server-cmd.py +++ b/cmd/server-cmd.py @@ -67,15 +67,16 @@ def receive_objects(conn, junk): conn.ok() return - sha = conn.read(20) - n -= 20 + shar = conn.read(20) + crcr = struct.unpack('!I', conn.read(4))[0] + n -= 20 + 4 buf = conn.read(n) # object sizes in bup are reasonably small #debug2('read %d bytes\n' % n) if len(buf) < n: w.abort() raise Exception('object read: expected %d bytes, got %d\n' % (n, len(buf))) - oldpack = w.exists(sha) + oldpack = w.exists(shar) # FIXME: we only suggest a single index per cycle, because the client # is currently too dumb to download more than one per cycle anyway. # Actually we should fix the client, but this is a minor optimization @@ -88,7 +89,7 @@ def receive_objects(conn, junk): # fix that deficiency of midx files eventually, although it'll # make the files bigger. This method is certainly not very # efficient. - oldpack = w.objcache.packname_containing(sha) + oldpack = w.objcache.packname_containing(shar) debug2('new suggestion: %r\n' % oldpack) assert(oldpack) assert(oldpack != True) @@ -102,8 +103,16 @@ def receive_objects(conn, junk): conn.write('index %s\n' % name) suggested[name] = 1 else: - w._raw_write([buf]) + nw, crc = w._raw_write([buf], sha=shar) + _check(w, crcr, crc, 'object read: expected crc %d, got %d\n') + _check(w, n, nw, 'object read: expected %d bytes, got %d\n') # NOTREACHED + + +def _check(w, expected, actual, msg): + if expected != actual: + w.abort() + raise Exception(msg % (expected, actual)) def read_ref(conn, refname): diff --git a/lib/bup/client.py b/lib/bup/client.py index f9d940e..aa9978b 100644 --- a/lib/bup/client.py +++ b/lib/bup/client.py @@ -1,4 +1,4 @@ -import re, struct, errno, time +import re, struct, errno, time, zlib from bup import git, ssh from bup.helpers import * @@ -271,18 +271,23 @@ class PackWriter_Remote(git.PackWriter): def abort(self): raise GitError("don't know how to abort remote pack writing") - def _raw_write(self, datalist, sha=''): + def _raw_write(self, datalist, sha): assert(self.file) if not self._packopen: self._open() if self.ensure_busy: self.ensure_busy() data = ''.join(datalist) - assert(len(data)) - outbuf = ''.join((struct.pack('!I', len(data)+len(sha)), sha, data)) + assert(data) + assert(sha) + crc = zlib.crc32(data) & 0xffffffff + outbuf = ''.join((struct.pack('!I', len(data) + 20 + 4), + sha, + struct.pack('!I', crc), + data)) (self._bwcount, self._bwtime) = \ _raw_write_bwlimit(self.file, outbuf, self._bwcount, self._bwtime) - self.outbytes += len(data) + self.outbytes += len(data) - 20 - 4 # Don't count sha1+crc self.count += 1 if self.file.has_input(): @@ -293,10 +298,4 @@ class PackWriter_Remote(git.PackWriter): self.suggest_pack(idxname) self.objcache.refresh() - def _write(self, bin, type, content): - if git.verbose: - log('>') - sha = git.calc_hash(type, content) - enc = git._encode_packobj(type, content) - self._raw_write(enc, sha=sha) - return bin + return sha, crc diff --git a/lib/bup/git.py b/lib/bup/git.py index 4ca31d7..5f33413 100644 --- a/lib/bup/git.py +++ b/lib/bup/git.py @@ -2,8 +2,7 @@ bup repositories are in Git format. This library allows us to interact with the Git data structures. """ -import os, zlib, time, subprocess, struct, stat, re, tempfile -import heapq +import os, zlib, time, subprocess, struct, stat, re, tempfile, heapq from bup.helpers import * from bup import _helpers @@ -140,7 +139,7 @@ def _decode_packobj(buf): class PackIdx: def __init__(self): assert(0) - + def find_offset(self, hash): """Get the offset of an object inside the index file.""" idx = self._idx_from_hash(hash) @@ -222,7 +221,7 @@ class PackIdxV2(PackIdx): ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0] if ofs & 0x80000000: idx64 = ofs & 0x7fffffff - ofs = struct.unpack('!I', + ofs = struct.unpack('!Q', str(buffer(self.ofs64table, idx64*8, 8)))[0] return ofs @@ -528,6 +527,7 @@ class PackWriter: self.outbytes = 0 self.filename = None self.file = None + self.idx = None self.objcache_maker = objcache_maker self.objcache = None @@ -549,8 +549,11 @@ class PackWriter: assert(name.endswith('.pack')) self.filename = name[:-5] self.file.write('PACK\0\0\0\2\0\0\0\0') + self.idx = list(list() for i in xrange(256)) - def _raw_write(self, datalist): + # the 'sha' parameter is used in client.py's _raw_write(), but not needed + # in this basic version. + def _raw_write(self, datalist, sha): self._open() f = self.file # in case we get interrupted (eg. KeyboardInterrupt), it's best if @@ -560,14 +563,25 @@ class PackWriter: # but that's okay because we'll flush it in _end(). oneblob = ''.join(datalist) f.write(oneblob) - self.outbytes += len(oneblob) + nw = len(oneblob) + crc = zlib.crc32(oneblob) & 0xffffffff + self._update_idx(sha, crc, nw) + self.outbytes += nw self.count += 1 + return nw, crc + + def _update_idx(self, sha, crc, size): + assert(sha) + if self.idx: + self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size)) - def _write(self, bin, type, content): + def _write(self, sha, type, content): if verbose: log('>') - self._raw_write(_encode_packobj(type, content)) - return bin + if not sha: + sha = calc_hash(type, content) + size, crc = self._raw_write(_encode_packobj(type, content), sha=sha) + return sha def breakpoint(self): """Clear byte and object counts and return the last processed id.""" @@ -587,11 +601,11 @@ class PackWriter: def maybe_write(self, type, content): """Write an object to the pack file if not present and return its id.""" - bin = calc_hash(type, content) - if not self.exists(bin): - self._write(bin, type, content) - self.objcache.add(bin) - return bin + sha = calc_hash(type, content) + if not self.exists(sha): + self._write(sha, type, content) + self.objcache.add(sha) + return sha def new_blob(self, blob): """Create a blob object in the pack with the supplied content.""" @@ -632,6 +646,7 @@ class PackWriter: """Remove the pack file from disk.""" f = self.file if f: + self.idx = None self.file = None f.close() os.unlink(self.filename + '.pack') @@ -641,6 +656,8 @@ class PackWriter: if not f: return None self.file = None self.objcache = None + idx = self.idx + self.idx = None # update object count f.seek(8) @@ -653,19 +670,15 @@ class PackWriter: sum = Sha1() for b in chunkyreader(f): sum.update(b) - f.write(sum.digest()) + packbin = sum.digest() + f.write(packbin) f.close() - p = subprocess.Popen(['git', 'index-pack', '-v', - '--index-version=2', - self.filename + '.pack'], - preexec_fn = _gitenv, - stdout = subprocess.PIPE) - out = p.stdout.read().strip() - _git_wait('git index-pack', p) - if not out: - raise GitError('git index-pack produced no output') - nameprefix = repo('objects/pack/%s' % out) + idx_f = open(self.filename + '.idx', 'wb') + obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin) + idx_f.close() + + nameprefix = repo('objects/pack/pack-%s' % obj_list_sha) if os.path.exists(self.filename + '.map'): os.unlink(self.filename + '.map') os.rename(self.filename + '.pack', nameprefix + '.pack') @@ -678,6 +691,44 @@ class PackWriter: """Close the pack file and move it to its definitive path.""" return self._end() + def _write_pack_idx_v2(self, file, idx, packbin): + sum = Sha1() + + def write(data): + file.write(data) + sum.update(data) + + write('\377tOc\0\0\0\2') + + n = 0 + for part in idx: + n += len(part) + write(struct.pack('!i', n)) + part.sort(key=lambda x: x[0]) + + obj_list_sum = Sha1() + for part in idx: + for entry in part: + write(entry[0]) + obj_list_sum.update(entry[0]) + for part in idx: + for entry in part: + write(struct.pack('!I', entry[1])) + ofs64_list = [] + for part in idx: + for entry in part: + if entry[2] & 0x80000000: + write(struct.pack('!I', 0x80000000 | len(ofs64_list))) + ofs64_list.append(struct.pack('!Q', entry[2])) + else: + write(struct.pack('!i', entry[2])) + for ofs64 in ofs64_list: + write(ofs64) + + write(packbin) + file.write(sum.digest()) + return obj_list_sum.hexdigest() + def _git_date(date): return '%d %s' % (date, time.strftime('%z', time.localtime(date))) diff --git a/lib/bup/t/tgit.py b/lib/bup/t/tgit.py index c61b351..fad720b 100644 --- a/lib/bup/t/tgit.py +++ b/lib/bup/t/tgit.py @@ -1,4 +1,4 @@ -import time +import struct, os, tempfile, time from bup import git from bup.helpers import * from wvtest import * @@ -88,3 +88,29 @@ def testpacks(): WVPASS(r.exists(hashes[5])) WVPASS(r.exists(hashes[6])) WVFAIL(r.exists('\0'*20)) + +@wvtest +def test_long_index(): + w = git.PackWriter() + obj_bin = struct.pack('!IIIII', + 0x00112233, 0x44556677, 0x88990011, 0x22334455, 0x66778899) + obj2_bin = struct.pack('!IIIII', + 0x11223344, 0x55667788, 0x99001122, 0x33445566, 0x77889900) + obj3_bin = struct.pack('!IIIII', + 0x22334455, 0x66778899, 0x00112233, 0x44556677, 0x88990011) + pack_bin = struct.pack('!IIIII', + 0x99887766, 0x55443322, 0x11009988, 0x77665544, 0x33221100) + idx = list(list() for i in xrange(256)) + idx[0].append((obj_bin, 1, 0xfffffffff)) + idx[0x11].append((obj2_bin, 2, 0xffffffffff)) + idx[0x22].append((obj3_bin, 3, 0xff)) + (fd,name) = tempfile.mkstemp(suffix='.idx', dir=git.repo('objects')) + f = os.fdopen(fd, 'w+b') + r = w._write_pack_idx_v2(f, idx, pack_bin) + f.seek(0) + i = git.PackIdxV2(name, f) + WVPASS(i.find_offset(obj_bin)==0xfffffffff) + WVPASS(i.find_offset(obj2_bin)==0xffffffffff) + WVPASS(i.find_offset(obj3_bin)==0xff) + f.close() + os.remove(name)