X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=lib%2Fbup%2Fgit.py;h=cd1682afc009b2f3b09d46fb6a8cf0b00e0c3cf7;hb=54a850308fe0f07deed13fe095c326ff8ea23151;hp=66370cacc64d6e6a967fa8220752397175ac7e82;hpb=f1f959d1f67c3c6d28cd3028bef32536ce04d658;p=bup.git diff --git a/lib/bup/git.py b/lib/bup/git.py index 66370ca..cd1682a 100644 --- a/lib/bup/git.py +++ b/lib/bup/git.py @@ -2,10 +2,9 @@ bup repositories are in Git format. This library allows us to interact with the Git data structures. """ -import os, zlib, time, subprocess, struct, stat, re, tempfile -import heapq +import os, sys, zlib, time, subprocess, struct, stat, re, tempfile from bup.helpers import * -from bup import _helpers +from bup import _helpers, path MIDX_VERSION = 2 @@ -40,9 +39,13 @@ def repo(sub = ''): def auto_midx(objdir): - main_exe = os.environ.get('BUP_MAIN_EXE') or sys.argv[0] - args = [main_exe, 'midx', '--auto', '--dir', objdir] - rv = subprocess.call(args, stdout=open('/dev/null', 'w')) + args = [path.exe(), 'midx', '--auto', '--dir', objdir] + try: + rv = subprocess.call(args, stdout=open('/dev/null', 'w')) + except OSError, e: + # make sure 'args' gets printed to help with debugging + add_error('%r: exception: %s' % (args, e)) + raise if rv: add_error('%r: returned %d' % (args, rv)) @@ -138,29 +141,22 @@ def _decode_packobj(buf): class PackIdx: - """Object representation of a Git pack index file.""" - def __init__(self, filename): - self.name = filename - self.idxnames = [self.name] - self.map = mmap_read(open(filename)) - assert(str(self.map[0:8]) == '\377tOc\0\0\0\2') - self.fanout = list(struct.unpack('!256I', - str(buffer(self.map, 8, 256*4)))) - self.fanout.append(0) # entry "-1" - nsha = self.fanout[255] - self.ofstable = buffer(self.map, - 8 + 256*4 + nsha*20 + nsha*4, - nsha*4) - self.ofs64table = buffer(self.map, - 8 + 256*4 + nsha*20 + nsha*4 + nsha*4) + def __init__(self): + assert(0) - def _ofs_from_idx(self, idx): - ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0] - if ofs & 0x80000000: - idx64 = ofs & 0x7fffffff - ofs = struct.unpack('!I', - str(buffer(self.ofs64table, idx64*8, 8)))[0] - return ofs + def find_offset(self, hash): + """Get the offset of an object inside the index file.""" + idx = self._idx_from_hash(hash) + if idx != None: + return self._ofs_from_idx(idx) + return None + + def exists(self, hash): + """Return nonempty if the object exists in this index.""" + return hash and (self._idx_from_hash(hash) != None) and True or None + + def __len__(self): + return int(self.fanout[255]) def _idx_from_hash(self, hash): global _total_searches, _total_steps @@ -169,13 +165,12 @@ class PackIdx: b1 = ord(hash[0]) start = self.fanout[b1-1] # range -1..254 end = self.fanout[b1] # range 0..255 - buf = buffer(self.map, 8 + 256*4, end*20) want = str(hash) _total_steps += 1 # lookup table is a step while start < end: _total_steps += 1 mid = start + (end-start)/2 - v = str(buf[mid*20:(mid+1)*20]) + v = self._idx_to_hash(mid) if v < want: start = mid+1 elif v > want: @@ -184,23 +179,62 @@ class PackIdx: return mid return None - def find_offset(self, hash): - """Get the offset of an object inside the index file.""" - idx = self._idx_from_hash(hash) - if idx != None: - return self._ofs_from_idx(idx) - return None - def exists(self, hash): - """Return nonempty if the object exists in this index.""" - return hash and (self._idx_from_hash(hash) != None) and True or None +class PackIdxV1(PackIdx): + """Object representation of a Git pack index (version 1) file.""" + def __init__(self, filename, f): + self.name = filename + self.idxnames = [self.name] + self.map = mmap_read(f) + self.fanout = list(struct.unpack('!256I', + str(buffer(self.map, 0, 256*4)))) + self.fanout.append(0) # entry "-1" + nsha = self.fanout[255] + self.shatable = buffer(self.map, 256*4, nsha*24) + + def _ofs_from_idx(self, idx): + return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0] + + def _idx_to_hash(self, idx): + return str(self.shatable[idx*24+4 : idx*24+24]) def __iter__(self): for i in xrange(self.fanout[255]): - yield buffer(self.map, 8 + 256*4 + 20*i, 20) + yield buffer(self.map, 256*4 + 24*i + 4, 20) - def __len__(self): - return int(self.fanout[255]) + +class PackIdxV2(PackIdx): + """Object representation of a Git pack index (version 2) file.""" + def __init__(self, filename, f): + self.name = filename + self.idxnames = [self.name] + self.map = mmap_read(f) + assert(str(self.map[0:8]) == '\377tOc\0\0\0\2') + self.fanout = list(struct.unpack('!256I', + str(buffer(self.map, 8, 256*4)))) + self.fanout.append(0) # entry "-1" + nsha = self.fanout[255] + self.shatable = buffer(self.map, 8 + 256*4, nsha*20) + self.ofstable = buffer(self.map, + 8 + 256*4 + nsha*20 + nsha*4, + nsha*4) + self.ofs64table = buffer(self.map, + 8 + 256*4 + nsha*20 + nsha*4 + nsha*4) + + def _ofs_from_idx(self, idx): + ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0] + if ofs & 0x80000000: + idx64 = ofs & 0x7fffffff + ofs = struct.unpack('!Q', + str(buffer(self.ofs64table, idx64*8, 8)))[0] + return ofs + + def _idx_to_hash(self, idx): + return str(self.shatable[idx*20:(idx+1)*20]) + + def __iter__(self): + for i in xrange(self.fanout[255]): + yield buffer(self.map, 8 + 256*4 + 20*i, 20) extract_bits = _helpers.extract_bits @@ -368,7 +402,10 @@ class PackIdxList: log(('warning: index %s missing\n' + ' used by %s\n') % (n, mxf)) broken += 1 - if not broken: + if broken: + del mx + unlink(full) + else: midxl.append(mx) midxl.sort(lambda x,y: -cmp(len(x),len(y))) for ix in midxl: @@ -389,12 +426,32 @@ class PackIdxList: for f in os.listdir(self.dir): full = os.path.join(self.dir, f) if f.endswith('.idx') and not d.get(full): - ix = PackIdx(full) + try: + ix = open_idx(full) + except GitError, e: + add_error(e) + continue d[full] = ix self.packs = list(set(d.values())) debug1('PackIdxList: using %d index%s.\n' % (len(self.packs), len(self.packs)!=1 and 'es' or '')) + def packname_containing(self, hash): + # figure out which pack contains a given hash. + # FIXME: if the midx file format would just *store* this information, + # we could calculate it a lot more efficiently. But it's not needed + # often, so let's do it like this. + for f in os.listdir(self.dir): + if f.endswith('.idx'): + full = os.path.join(self.dir, f) + try: + ix = open_idx(full) + except GitError, e: + add_error(e) + continue + if ix.exists(hash): + return full + def add(self, hash): """Insert an additional object in the list.""" self.also[hash] = 1 @@ -422,7 +479,19 @@ def _shalist_sort_key(ent): def open_idx(filename): if filename.endswith('.idx'): - return PackIdx(filename) + f = open(filename, 'rb') + header = f.read(8) + if header[0:4] == '\377tOc': + version = struct.unpack('!I', header[4:8])[0] + if version == 2: + return PackIdxV2(filename, f) + else: + raise GitError('%s: expected idx file version 2, got %d' + % (filename, version)) + elif len(header) == 8 and header[0:4] < '\377tOc': + return PackIdxV1(filename, f) + else: + raise GitError('%s: unrecognized idx file header' % filename) elif filename.endswith('.midx'): return PackMidx(filename) else: @@ -431,60 +500,44 @@ def open_idx(filename): def idxmerge(idxlist, final_progress=True): """Generate a list of all the objects reachable in a PackIdxList.""" - total = sum(len(i) for i in idxlist) - iters = (iter(i) for i in idxlist) - heap = [(next(it), it) for it in iters] - heapq.heapify(heap) - count = 0 - last = None - while heap: - if (count % 10024) == 0: - progress('Reading indexes: %.2f%% (%d/%d)\r' - % (count*100.0/total, count, total)) - (e, it) = heap[0] - if e != last: - yield e - last = e - count += 1 - e = next(it) - if e: - heapq.heapreplace(heap, (e, it)) - else: - heapq.heappop(heap) - if final_progress: - log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total)) + def pfunc(count, total): + progress('Reading indexes: %.2f%% (%d/%d)\r' + % (count*100.0/total, count, total)) + def pfinal(count, total): + if final_progress: + log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total)) + return merge_iter(idxlist, 10024, pfunc, pfinal) + +def _make_objcache(): + return PackIdxList(repo('objects/pack')) class PackWriter: """Writes Git objects insid a pack file.""" - def __init__(self, objcache_maker=None): + def __init__(self, objcache_maker=_make_objcache): self.count = 0 self.outbytes = 0 self.filename = None self.file = None + self.idx = None self.objcache_maker = objcache_maker self.objcache = None def __del__(self): self.close() - def _make_objcache(self): - if self.objcache == None: - if self.objcache_maker: - self.objcache = self.objcache_maker() - else: - self.objcache = PackIdxList(repo('objects/pack')) - def _open(self): if not self.file: - self._make_objcache() (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects')) self.file = os.fdopen(fd, 'w+b') assert(name.endswith('.pack')) self.filename = name[:-5] self.file.write('PACK\0\0\0\2\0\0\0\0') + self.idx = list(list() for i in xrange(256)) - def _raw_write(self, datalist): + # the 'sha' parameter is used in client.py's _raw_write(), but not needed + # in this basic version. + def _raw_write(self, datalist, sha): self._open() f = self.file # in case we get interrupted (eg. KeyboardInterrupt), it's best if @@ -493,15 +546,29 @@ class PackWriter: # to our hashsplit algorithm.) f.write() does its own buffering, # but that's okay because we'll flush it in _end(). oneblob = ''.join(datalist) - f.write(oneblob) - self.outbytes += len(oneblob) + try: + f.write(oneblob) + except IOError, e: + raise GitError, e, sys.exc_info()[2] + nw = len(oneblob) + crc = zlib.crc32(oneblob) & 0xffffffff + self._update_idx(sha, crc, nw) + self.outbytes += nw self.count += 1 + return nw, crc + + def _update_idx(self, sha, crc, size): + assert(sha) + if self.idx: + self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size)) - def _write(self, bin, type, content): + def _write(self, sha, type, content): if verbose: log('>') - self._raw_write(_encode_packobj(type, content)) - return bin + if not sha: + sha = calc_hash(type, content) + size, crc = self._raw_write(_encode_packobj(type, content), sha=sha) + return sha def breakpoint(self): """Clear byte and object counts and return the last processed id.""" @@ -513,19 +580,26 @@ class PackWriter: """Write an object in this pack file.""" return self._write(calc_hash(type, content), type, content) + def _require_objcache(self): + if self.objcache is None and self.objcache_maker: + self.objcache = self.objcache_maker() + if self.objcache is None: + raise GitError( + "PackWriter not opened or can't check exists w/o objcache") + def exists(self, id): """Return non-empty if an object is found in the object cache.""" - if not self.objcache: - self._make_objcache() + self._require_objcache() return self.objcache.exists(id) def maybe_write(self, type, content): """Write an object to the pack file if not present and return its id.""" - bin = calc_hash(type, content) - if not self.exists(bin): - self._write(bin, type, content) - self.objcache.add(bin) - return bin + self._require_objcache() + sha = calc_hash(type, content) + if not self.exists(sha): + self._write(sha, type, content) + self.objcache.add(sha) + return sha def new_blob(self, blob): """Create a blob object in the pack with the supplied content.""" @@ -566,15 +640,18 @@ class PackWriter: """Remove the pack file from disk.""" f = self.file if f: + self.idx = None self.file = None f.close() os.unlink(self.filename + '.pack') - def _end(self): + def _end(self, run_midx=True): f = self.file if not f: return None self.file = None self.objcache = None + idx = self.idx + self.idx = None # update object count f.seek(8) @@ -585,35 +662,67 @@ class PackWriter: # calculate the pack sha1sum f.seek(0) sum = Sha1() - while 1: - b = f.read(65536) + for b in chunkyreader(f): sum.update(b) - if not b: break - f.write(sum.digest()) - + packbin = sum.digest() + f.write(packbin) f.close() - p = subprocess.Popen(['git', 'index-pack', '-v', - '--index-version=2', - self.filename + '.pack'], - preexec_fn = _gitenv, - stdout = subprocess.PIPE) - out = p.stdout.read().strip() - _git_wait('git index-pack', p) - if not out: - raise GitError('git index-pack produced no output') - nameprefix = repo('objects/pack/%s' % out) + idx_f = open(self.filename + '.idx', 'wb') + obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin) + idx_f.close() + + nameprefix = repo('objects/pack/pack-%s' % obj_list_sha) if os.path.exists(self.filename + '.map'): os.unlink(self.filename + '.map') os.rename(self.filename + '.pack', nameprefix + '.pack') os.rename(self.filename + '.idx', nameprefix + '.idx') - auto_midx(repo('objects/pack')) + if run_midx: + auto_midx(repo('objects/pack')) return nameprefix - def close(self): + def close(self, run_midx=True): """Close the pack file and move it to its definitive path.""" - return self._end() + return self._end(run_midx=run_midx) + + def _write_pack_idx_v2(self, file, idx, packbin): + sum = Sha1() + + def write(data): + file.write(data) + sum.update(data) + + write('\377tOc\0\0\0\2') + + n = 0 + for part in idx: + n += len(part) + write(struct.pack('!i', n)) + part.sort(key=lambda x: x[0]) + + obj_list_sum = Sha1() + for part in idx: + for entry in part: + write(entry[0]) + obj_list_sum.update(entry[0]) + for part in idx: + for entry in part: + write(struct.pack('!I', entry[1])) + ofs64_list = [] + for part in idx: + for entry in part: + if entry[2] & 0x80000000: + write(struct.pack('!I', 0x80000000 | len(ofs64_list))) + ofs64_list.append(struct.pack('!Q', entry[2])) + else: + write(struct.pack('!i', entry[2])) + for ofs64 in ofs64_list: + write(ofs64) + + write(packbin) + file.write(sum.digest()) + return obj_list_sum.hexdigest() def _git_date(date): @@ -688,6 +797,33 @@ def rev_get_date(ref): raise GitError, 'no such commit %r' % ref +def rev_parse(committish): + """Resolve the full hash for 'committish', if it exists. + + Should be roughly equivalent to 'git rev-parse'. + + Returns the hex value of the hash if it is found, None if 'committish' does + not correspond to anything. + """ + head = read_ref(committish) + if head: + debug2("resolved from ref: commit = %s\n" % head.encode('hex')) + return head + + pL = PackIdxList(repo('objects/pack')) + + if len(committish) == 40: + try: + hash = committish.decode('hex') + except TypeError: + return None + + if pL.exists(hash): + return hash + + return None + + def update_ref(refname, newval, oldval): """Change the commit pointed to by a branch.""" if not oldval: @@ -718,7 +854,10 @@ def guess_repo(path=None): def init_repo(path=None): """Create the Git bare repository for bup in a given path.""" guess_repo(path) - d = repo() + d = repo() # appends a / to the path + parent = os.path.dirname(os.path.dirname(d)) + if parent and not os.path.exists(parent): + raise GitError('parent directory "%s" does not exist\n' % parent) if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')): raise GitError('"%d" exists but is not a directory\n' % d) p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr, @@ -857,6 +996,7 @@ class CatPipe: stdin=subprocess.PIPE, stdout=subprocess.PIPE, close_fds = True, + bufsize = 4096, preexec_fn = _gitenv) def _fast_get(self, id): @@ -870,11 +1010,13 @@ class CatPipe: assert(not self.inprogress) assert(id.find('\n') < 0) assert(id.find('\r') < 0) - assert(id[0] != '-') + assert(not id.startswith('-')) self.inprogress = id self.p.stdin.write('%s\n' % id) + self.p.stdin.flush() hdr = self.p.stdout.readline() if hdr.endswith(' missing\n'): + self.inprogress = None raise KeyError('blob %r is missing' % id) spl = hdr.split(' ') if len(spl) != 3 or len(spl[0]) != 40: @@ -937,3 +1079,16 @@ class CatPipe: yield d except StopIteration: log('booger!\n') + +def tags(): + """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.""" + tags = {} + for (n,c) in list_refs(): + if n.startswith('refs/tags/'): + name = n[10:] + if not c in tags: + tags[c] = [] + + tags[c].append(name) # more than one tag can point at 'c' + + return tags