"""Git interaction library.
bup repositories are in Git format. This library allows us to
interact with the Git data structures.
"""
import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
from bup.helpers import *
from bup import _helpers, path

MIDX_VERSION = 4

"""Discussion of bloom constants for bup:

There are four basic things to consider when building a bloom filter:
The size, in bits, of the filter
The capacity, in entries, of the filter
The probability of a false positive that is tolerable
The number of bits readily available to use for addressing filter bits

There is one major tunable that is not directly related to the above:
k: the number of bits set in the filter per entry

Here's a wall of numbers showing the relationship between k; the ratio between
the filter size in bits and the entries in the filter (m/n); and
pfalse_positive (as a percentage):

m/n|k=3    |k=4    |k=5    |k=6    |k=7    |k=8    |k=9    |k=10   |k=11
 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013

Here's a table showing available repository size for a given pfalse_positive
and three values of k (assuming we only use the 160-bit SHA1 for addressing
the filter and 8192 bytes per object):

pfalse|obj k=4     |cap k=4    |obj k=5  |cap k=5    |obj k=6 |cap k=6
2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB

This eliminates pretty neatly any k>6 as long as we use the raw SHA for
addressing.

Filter size scales linearly with repository size for a given k and pfalse.

Here's a table of filter sizes for a 1 TiB repository:

pfalse| k=3        | k=4        | k=5        | k=6
2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB

For bup:
 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
   per lookup will be worse than the two required for midx.
 * We want the pfalse_positive to be low enough that the cost of sometimes
   faulting on the midx doesn't overcome the benefit of the bloom filter.
 * We have readily available 160 bits for addressing the filter.
 * We want to be able to have a single bloom address entire repositories of
   reasonable size.

Based on these parameters, a combination of k=4 and k=5 provides the behavior
that bup needs. As such, I've implemented bloom addressing, adding and
checking functions in C for these two values. Because k=5 requires less space
and gives better overall pfalse_positive performance, it is preferred if a
table with k=5 can represent the repository.

None of this tells us what max_pfalse_positive to choose.

Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
"""
BLOOM_VERSION = 2
MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
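
# A minimal illustrative helper (not part of bup's API) showing where the
# "wall of numbers" in the discussion above comes from: the standard
# bloom-filter estimate pfalse = (1 - e**(-k*n/m))**k, expressed as a
# percentage and parameterized by the bits-per-entry ratio m/n.
def _example_bloom_pfalse_percent(bits_per_entry, k):
    """Estimated false-positive rate (in percent) for a bloom filter with
    'bits_per_entry' filter bits per stored entry and k hash functions."""
    return 100 * (1 - math.exp(-float(k) / bits_per_entry)) ** k
# e.g. _example_bloom_pfalse_percent(8, 3) -> ~3.05794, the first entry in
# the table above.
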
ignore_midx = 0

home_repodir = os.path.expanduser('~/.bup')
repodir = None

_typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }

_total_searches = 0
_total_steps = 0

class GitError(Exception):
    pass


def repo(sub = ''):
    """Get the path to the git repository or one of its subdirectories."""
    global repodir
    if not repodir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):
        repodir = gd

    return os.path.join(repodir, sub)

def auto_midx(objdir):
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    except OSError, e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = None
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    except OSError, e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        rv = None
    if rv:
        add_error('%r: returned %d' % (args, rv))

def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
    else:
        return name

(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of
    the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
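
# Illustrative round-trips of the mangling scheme above (octal modes; the
# chunked case applies when bup stores a regular file as a git tree of
# chunks, i.e. mode is a regular file but gitmode is not):
#   mangle_name('foo', 0100644, 040000)      -> 'foo.bup'
#   mangle_name('foo.bup', 0100644, 0100644) -> 'foo.bup.bupl'
#   demangle_name('foo.bup')      -> ('foo', BUP_CHUNKED)
#   demangle_name('foo.bup.bupl') -> ('foo.bup', BUP_NORMAL)
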
def _encode_packobj(type, content):
    szout = ''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(1)
    yield szout
    yield z.compress(content)
    yield z.flush()

def _encode_looseobj(type, content):
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()

def _decode_looseobj(buf):
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find('\0')
    assert(i > 0)
    (type, sz) = s[:i].split(' ')
    sz = int(sz)
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)

def _decode_packobj(buf):
    assert(buf)
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
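
# The two functions above implement git's pack object header: bits 4-6 of
# the first byte carry the type, the low 4 bits are the first size bits,
# and the 0x80 bit of each byte means "another size byte follows" (7 more
# bits each).  Hedged round-trip sketch:
#   packed = ''.join(_encode_packobj('blob', 'hello'))
#   assert _decode_packobj(packed) == ('blob', 'hello')
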
class PackIdx:
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx != None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and self.name or True
        return None

    def __len__(self):
        return int(self.fanout[255])

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = ord(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        want = str(hash)
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end-start)/2
            v = self._idx_to_hash(mid)
            if v < want:
                start = mid+1
            elif v > want:
                end = mid
            else: # got it!
                return mid
        return None

class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*24+4 : idx*24+24])

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
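
# (A version-1 .idx is just the 256-entry fanout followed by nsha records
# of 4-byte offset + 20-byte sha1, hence the 24-byte stride above.)
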
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
                               nsha*4)
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        return ofs

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
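
# On-disk layout of a version-2 .idx, as unpacked above (matching git's
# pack-format documentation):
#   8 bytes        magic '\377tOc' plus version (2)
#   256*4 bytes    fanout table
#   nsha*20 bytes  sorted sha1 table
#   nsha*4 bytes   crc32 table
#   nsha*4 bytes   32-bit offsets (high bit set = index into 64-bit table)
#   nsha*8 bytes   64-bit offsets (only for pack offsets >= 2**31)
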
extract_bits = _helpers.extract_bits

bloom_contains = _helpers.bloom_contains
bloom_add = _helpers.bloom_add

class ShaBloom:
    """Bloom filter over the objects in a set of index (.idx) files.
    A .bloom file lets bup answer "definitely absent" quickly for most
    missing objects; a positive answer is only probabilistic and has to
    be confirmed against the real indexes.
    """
    def __init__(self, filename, f=None, readwrite=False):
        self.name = filename
        assert(filename.endswith('.bloom'))
        if readwrite:
            self.rwfile = f or open(filename, 'r+b')
            self.map = mmap_readwrite(self.rwfile, close=False)
        else:
            self.rwfile = None
            self.map = mmap_read(f or open(filename, 'rb'))
        if str(self.map[0:4]) != 'BLOM':
            log('Warning: skipping: invalid BLOM header in %r\n' % filename)
            return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < BLOOM_VERSION:
            log('Warning: ignoring old-style (v%d) bloom %r\n'
                % (ver, filename))
            return self._init_failed()
        if ver > BLOOM_VERSION:
            log('Warning: ignoring too-new (v%d) bloom %r\n'
                % (ver, filename))
            return self._init_failed()

        self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
        idxnamestr = str(self.map[16 + 2**self.bits:])
        if idxnamestr:
            self.idxnames = idxnamestr.split('\0')
        else:
            self.idxnames = []

    def _init_failed(self):
        if self.map:
            self.map = None
        if self.rwfile:
            self.rwfile.close()
            self.rwfile = None
        self.idxnames = []
        self.bits = self.entries = 0

    def valid(self):
        return self.map and self.bits

    def __del__(self):
        self.close()

    def close(self):
        if self.map and self.rwfile:
            debug2("bloom: closing with %d entries\n" % self.entries)
            self.map[12:16] = struct.pack('!I', self.entries)
            self.map.flush()
        if self.rwfile:
            self.rwfile.seek(16 + 2**self.bits)
            if self.idxnames:
                self.rwfile.write('\0'.join(self.idxnames))
        self._init_failed()

    def pfalse_positive(self, additional=0):
        n = self.entries + additional
        m = 8*2**self.bits
        k = self.k
        return 100*(1-math.exp(-k*float(n)/m))**k

    def add_idx(self, ix):
        """Add the object to the filter, return current pfalse_positive."""
        if not self.map: raise Exception, "Cannot add to closed bloom"
        self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
        self.idxnames.append(os.path.basename(ix.name))

    def exists(self, sha):
        """Return nonempty if the object probably exists in the bloom filter."""
        global _total_searches, _total_steps
        _total_searches += 1
        if not self.map: return None
        found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
        _total_steps += steps
        return found

    @classmethod
    def create(cls, name, f=None, readwrite=False, expected=100000, k=None):
        """Create and return a bloom filter for `expected` entries."""
        bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
        k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
        if bits > MAX_BLOOM_BITS[k]:
            log('bloom: warning, max bits exceeded, non-optimal\n')
            bits = MAX_BLOOM_BITS[k]
        debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
        f = f or open(name, 'w+b')
        f.write('BLOM')
        f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
        assert(f.tell() == 16)
        f.write('\0'*2**bits)
        f.seek(0)
        return cls(name, f=f, readwrite=readwrite)

    def __len__(self):
        return self.entries
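
# Hedged usage sketch (paths hypothetical): size a filter for the expected
# object count, feed it each .idx, then use it as a fast "definitely not
# here" test in front of the real index lookups:
#   b = ShaBloom.create('objects/pack/bup.bloom', readwrite=True,
#                       expected=500000)
#   b.add_idx(open_idx('objects/pack/pack-....idx'))
#   if not b.exists(sha):
#       pass  # certainly absent from every idx added above
#   b.close()
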
class PackMidx:
    """Wrapper which contains data from multiple index files.
    Multiple index (.midx) files constitute a wrapper around index (.idx) files
    and make it possible for bup to expand Git's indexing capabilities to vast
    amounts of files.
    """
    def __init__(self, filename):
        self.name = filename
        self.force_keep = False
        assert(filename.endswith('.midx'))
        self.map = mmap_read(open(filename))
        if str(self.map[0:4]) != 'MIDX':
            log('Warning: skipping: invalid MIDX header in %r\n' % filename)
            self.force_keep = True
            return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < MIDX_VERSION:
            log('Warning: ignoring old-style (v%d) midx %r\n'
                % (ver, filename))
            self.force_keep = False # old stuff is boring
            return self._init_failed()
        if ver > MIDX_VERSION:
            log('Warning: ignoring too-new (v%d) midx %r\n'
                % (ver, filename))
            self.force_keep = True # new stuff is exciting
            return self._init_failed()

        self.bits = _helpers.firstword(self.map[8:12])
        self.entries = 2**self.bits
        self.fanout = buffer(self.map, 12, self.entries*4)
        self.sha_ofs = 12 + self.entries*4
        self.nsha = nsha = self._fanget(self.entries-1)
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.whichlist = buffer(self.map, self.sha_ofs + nsha*20, nsha*4)
        self.idxname_ofs = self.sha_ofs + 24*nsha
        self.idxnames = str(self.map[self.idxname_ofs:]).split('\0')

    def _init_failed(self):
        self.bits = 0
        self.entries = 1
        self.fanout = buffer('\0\0\0\0')
        self.shatable = buffer('\0'*20)
        self.idxnames = []

    def _fanget(self, i):
        start = i*4
        s = self.fanout[start:start+4]
        return _helpers.firstword(s)

    def _get(self, i):
        return str(self.shatable[i*20:(i+1)*20])

    def _get_idx_i(self, i):
        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]

    def _get_idxname(self, i):
        return self.idxnames[self._get_idx_i(i)]

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches, _total_steps
        _total_searches += 1
        want = str(hash)
        el = extract_bits(want, self.bits)
        if el:
            start = self._fanget(el-1)
            startv = el << (32-self.bits)
        else:
            start = 0
            startv = 0
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1  # lookup table is a step
        hashv = _helpers.firstword(hash)
        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
        while start < end:
            _total_steps += 1
            #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
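            # Interpolation (not plain binary) search: estimate where a
            # hash beginning with 'hashv' should sit between the first-word
            # values at the current search bounds, and probe there first.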
            mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
            #print '  %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
            v = self._get(mid)
            #print '    %08x' % self._num(v)
            if v < want:
                start = mid+1
                startv = _helpers.firstword(v)
            elif v > want:
                end = mid
                endv = _helpers.firstword(v)
            else: # got it!
                return want_source and self._get_idxname(mid) or True
        return None

    def __iter__(self):
        for i in xrange(self._fanget(self.entries-1)):
            yield buffer(self.shatable, i*20, 20)

    def __len__(self):
        return int(self._fanget(self.entries-1))


_mpi_count = 0
class PackIdxList:
    def __init__(self, dir):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom is not None:
            _total_searches -= 1 # will be incremented by bloom
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                return None
        for i in xrange(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of their
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                for ix in self.packs:
                    if isinstance(ix, PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    if not d.get(full):
                        mx = PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n' +
                                     '  used by %s\n') % (n, mxf))
                                broken = True
                        if broken:
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % os.path.basename(ix.name))
                        unlink(ix.name)
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError, e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object into the list."""
        self.also.add(hash)

def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = '%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
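
# calc_hash matches 'git hash-object'; e.g. calc_hash('blob', '') is the
# binary form of the well-known empty-blob id,
# e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.
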
def _shalist_sort_key(ent):
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):
        return name + '/'
    else:
        return name

def open_idx(filename):
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')

def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        progress('Reading indexes: %.2f%% (%d/%d)\r'
                 % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)

def _make_objcache():
    return PackIdxList(repo('objects/pack'))

class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache):
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.file = None
        self.idx = None
        self.objcache_maker = objcache_maker
        self.objcache = None

    def __del__(self):
        self.close()

    def _open(self):
        if not self.file:
            (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
            self.file = os.fdopen(fd, 'w+b')
            assert(name.endswith('.pack'))
            self.filename = name[:-5]
            self.file.write('PACK\0\0\0\2\0\0\0\0')
            self.idx = list(list() for i in xrange(256))

    # the 'sha' parameter is used in client.py's _raw_write(), but not needed
    # in this basic version.
    def _raw_write(self, datalist, sha):
        self._open()
        f = self.file
        # in case we get interrupted (e.g. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        try:
            f.write(oneblob)
        except IOError, e:
            raise GitError, e, sys.exc_info()[2]
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        assert(sha)
        if self.idx:
            self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end()
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        self._require_objcache()
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            self.objcache.add(sha)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        shalist = sorted(shalist, key = _shalist_sort_key)
        l = []
        for (mode,name,bin) in shalist:
            assert(mode)
            assert(mode != '0')
            assert(mode[0] != '0')
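            # (git stores tree modes as octal with no leading zero:
            # '100644', never '0100644', hence the asserts)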
            assert(name)
            assert(len(bin) == 20)
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))

    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        l = []
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        l.append('')
        l.append(msg)
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
                                  msg)
        return commit

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            self.idx = None
            self.file = None
            f.close()
            os.unlink(self.filename + '.pack')

    def _end(self, run_midx=True):
        f = self.file
        if not f: return None
        self.file = None
        self.objcache = None
        idx = self.idx
        self.idx = None

        # update object count
        f.seek(8)
        cp = struct.pack('!i', self.count)
        assert(len(cp) == 4)
        f.write(cp)

        # calculate the pack sha1sum
        f.seek(0)
        sum = Sha1()
        for b in chunkyreader(f):
            sum.update(b)
        packbin = sum.digest()
        f.write(packbin)
        f.close()

        idx_f = open(self.filename + '.idx', 'wb')
        obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
        idx_f.close()

        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')

        if run_midx:
            auto_midx(repo('objects/pack'))
        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)

    def _write_pack_idx_v2(self, file, idx, packbin):
        sum = Sha1()

        def write(data):
            file.write(data)
            sum.update(data)

        write('\377tOc\0\0\0\2')

        n = 0
        for part in idx:
            n += len(part)
            write(struct.pack('!i', n))
            part.sort(key=lambda x: x[0])

        obj_list_sum = Sha1()
        for part in idx:
            for entry in part:
                write(entry[0])
                obj_list_sum.update(entry[0])
        for part in idx:
            for entry in part:
                write(struct.pack('!I', entry[1]))
        ofs64_list = []
        for part in idx:
            for entry in part:
                if entry[2] & 0x80000000:
                    write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
                    ofs64_list.append(struct.pack('!Q', entry[2]))
                else:
                    write(struct.pack('!i', entry[2]))
        for ofs64 in ofs64_list:
            write(ofs64)

        write(packbin)
        file.write(sum.digest())
        return obj_list_sum.hexdigest()


def _git_date(date):
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))

def _gitenv():
    os.environ['GIT_DIR'] = os.path.abspath(repo())

def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    if refname:
        argv += [refname]
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))

def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    else:
        return None

def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date,hash).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    assert(not ref.startswith('-'))
    opts = []
    if count:
        opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
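    # rev-list output alternates 'commit <hex>' header lines with our
    # '--pretty' payload, %ct: the commit time as a unix timestamp.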
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    commit = None
    for row in p.stdout:
        s = row.strip()
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        else:
            date = int(s)
            yield (date, commit)
    rv = p.wait() # not fatal
    if rv:
        raise GitError, 'git rev-list returned error %d' % rv

def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    for (date, commit) in rev_list(ref, count=1):
        return date
    raise GitError, 'no such commit %r' % ref

def rev_parse(committish):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the binary hash if it is found, None if 'committish' does not
    correspond to anything.
    """
    head = read_ref(committish)
    if head:
        debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
        return head

    pL = PackIdxList(repo('objects/pack'))

    if len(committish) == 40:
        try:
            hash = committish.decode('hex')
        except TypeError:
            return None

        if pL.exists(hash):
            return hash

    return None

def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    if not oldval:
        oldval = ''
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)

def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = os.environ.get('BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser('~/.bup')

def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)

def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    guess_repo(path)
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            init_repo()
        else:
            log('error: %r is not a bup/git repository\n' % repo())
            sys.exit(15)

def treeparse(buf):
    """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
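    # Each raw tree entry is '<octal mode> <name>\0' followed immediately
    # by a 20-byte binary sha1; the slicing below unpacks those fields.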
    ofs = 0
    while ofs < len(buf):
        z = buf[ofs:].find('\0')
        assert(z > 0)
        spl = buf[ofs:ofs+z].split(' ', 1)
        assert(len(spl) == 2)
        sha = buf[ofs+z+1:ofs+z+1+20]
        ofs += z+1+20
        yield (spl[0], spl[1], sha)

_ver = None
def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')
    """
    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        m = re.match(r'git version (\S+.\S+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    if _ver < needed:
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver

def _git_wait(cmd, p):
    rv = p.wait()
    if rv != 0:
        raise GitError('%s returned %d' % (cmd, rv))

def _git_capture(argv):
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r

class _AbortableIter:
    def __init__(self, it, onabort = None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def next(self):
        try:
            return self.it.next()
        except StopIteration, e:
            self.done = True
            raise
        except:
            self.abort()
            raise

    def abort(self):
        """Abort iteration and call the onabort callback, if needed."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()


class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self):
        wanted = ('1','5','6')
        if ver() < wanted:
            log('warning: git version < %s; bup will be slow.\n'
                % '.'.join(wanted))
            self.get = self._slow_get
        else:
            self.p = self.inprogress = None
            self.get = self._fast_get

    def _abort(self):
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def _restart(self):
        self._abort()
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds = True,
                                  bufsize = 4096,
                                  preexec_fn = _gitenv)

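    # 'git cat-file --batch' protocol, as used by _fast_get below: one
    # object name per line on stdin; git replies '<sha> <type> <size>\n',
    # then <size> bytes of content, then a terminating newline.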
    def _fast_get(self, id):
        if not self.p or self.p.poll() != None:
            self._restart()
        assert(self.p)
        assert(self.p.poll() == None)
        if self.inprogress:
            log('_fast_get: opening %r while %r is open'
                % (id, self.inprogress))
        assert(not self.inprogress)
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        try:
            yield type
            for blob in it:
                yield blob
            assert(self.p.stdout.readline() == '\n')
            self.inprogress = None
        except Exception, e:
            it.abort()
            raise

    def _slow_get(self, id):
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        yield type

        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
            yield blob
        _git_wait('git cat-file', p)

    def _join(self, it):
        type = it.next()
        if type == 'blob':
            for blob in it:
                yield blob
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
                    yield blob
        elif type == 'commit':
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % type)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object. The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        try:
            for d in self._join(self.get(id)):
                yield d
        except StopIteration:
            log('booger!\n')


def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for (n,c) in list_refs():
        if n.startswith('refs/tags/'):
            name = n[10:]
            if not c in tags:
                tags[c] = []
            tags[c].append(name)  # more than one tag can point at 'c'
    return tags