1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
11 """Discussion of bloom constants for bup:
13 There are four basic things to consider when building a bloom filter:
14 The size, in bits, of the filter
15 The capacity, in entries, of the filter
16 The probability of a false positive that is tolerable
17 The number of bits readily available to use for addressing filter bits
19 There is one major tunable that is not directly related to the above:
20 k: the number of bits set in the filter per entry
22 Here's a wall of numbers showing the relationship between k; the ratio between
23 the filter size in bits and the entries in the filter; and pfalse_positive:
25 mn|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
26 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
27 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
28 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
29 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
30 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
31 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
32 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
33 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
34 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
35 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
36 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
37 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
38 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
39 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
40 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
41 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
42 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
43 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
44 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
45 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
46 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
47 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
48 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
49 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
50 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
52 Here's a table showing available repository size for a given pfalse_positive
53 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
54 filter and 8192bytes per object):
56 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
57 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
58 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
59 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
61 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
64 filter size scales linearly with repository size for a given k and pfalse.
66 Here's a table of filter sizes for a 1 TiB repository:
68 pfalse| k=3 | k=4 | k=5 | k=6
69 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
70 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
71 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
74 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
75 per lookup will be worse than the two required for midx.
76 * We want the pfalse_positive to be low enough that the cost of sometimes
77 faulting on the midx doesn't overcome the benefit of the bloom filter.
78 * We have readily available 160 bits for addressing the filter.
79 * We want to be able to have a single bloom address entire repositories of
82 Based on these parameters, a combination of k=4 and k=5 provides the behavior
83 that bup needs. As such, I've implemented bloom addressing, adding and
84 checking functions in C for these two values. Because k=5 requires less space
85 and gives better overall pfalse_positive performance, it is preferred if a
86 table with k=5 can represent the repository.
88 None of this tells us what max_pfalse_positive to choose.
90 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
# Bloom-filter tuning constants; see the discussion in the module docstring.
MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking

# Default repository location used when no explicit path/BUP_DIR is given.
home_repodir = os.path.expanduser('~/.bup')

# Git object type name <-> pack-format numeric type code, both directions.
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
# Base exception for repository-related failures raised by this module.
class GitError(Exception):
    # NOTE(review): class body (likely just 'pass') is elided in this listing.


# NOTE(review): the "def repo(sub='')" header line is elided in this
# listing; the indented lines below are that function's body.
    """Get the path to the git repository or one of its subdirectories."""
    # Reached when the module-global 'repodir' has not been set yet.
        raise GitError('You should call check_repo_or_die()')
    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):
        # NOTE(review): the line switching repodir to the .git subdir is
        # elided in this listing.
    return os.path.join(repodir, sub)
def auto_midx(objdir):
    # Opportunistically rebuild the .midx and bloom files for 'objdir' by
    # invoking bup's own 'midx' and 'bloom' subcommands; failures are
    # recorded via add_error() rather than raised.
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    # NOTE(review): the try/except wrapper lines are elided in this listing.
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        # non-zero exit status: report but keep going
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
    # NOTE(review): the try/except wrapper lines are elided in this listing.
        rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        # non-zero exit status: report but keep going
        add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.

    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # The original was a regular file but it is stored as something
        # else (a chunked tree): tag it ".bup" so readers reassemble it.
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        # The name could be mistaken for a mangled one; tag it ".bupl"
        # to mark it as a normal, unsegmented file.
        return name + '.bupl'
    else:
        return name
# Modes returned by demangle_name(): read as-is vs. reassemble from chunks.
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is
    one of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        # ".bupl" marks a normal file whose name looked like a mangled one.
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        # ".bup" marks a segmented (chunked) file.
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
# Encode 'content' as a pack-format object of the given git type; yields
# the variable-length size/type header followed by a zlib stream.
def _encode_packobj(type, content):
    # First header byte: low 4 bits of the size plus the 3-bit type code.
    # NOTE(review): the surrounding size/loop lines are elided in this
    # listing; remaining size bits are emitted 7 per byte below.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
        if sz: szbits |= 0x80  # continuation flag: more size bytes follow
    # Compression level 1: bup favors speed over ratio here.
    z = zlib.compressobj(1)
    yield z.compress(content)
# Encode 'content' as a loose git object: "<type> <len>\0" header plus
# the body, all inside a single zlib stream.
def _encode_looseobj(type, content):
    z = zlib.compressobj(1)  # level 1: fast compression
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    # NOTE(review): the final 'yield z.flush()' appears to be elided in
    # this listing — without it the stream would be truncated; confirm.
# Decode a loose git object; returns (type, content).
def _decode_looseobj(buf):
    s = zlib.decompress(buf)
    # NOTE(review): the lines parsing the "<type> <size>\0" header out of
    # 's' (setting type, sz, content) are elided in this listing.
    assert(type in _typemap)
    assert(sz == len(content))  # declared size must match actual payload
    return (type, content)
# Decode a pack-format object; returns (type, content).
def _decode_packobj(buf):
    # The type code lives in bits 4-6 of the first header byte.
    type = _typermap[(c & 0x70) >> 4]
    # Remaining size bits arrive 7 per byte, low-order first.
    # NOTE(review): the surrounding header-walk loop is elided here.
        sz |= (c & 0x7f) << shift
    # Everything after the header is a plain zlib stream.
    return (type, zlib.decompress(buf[i+1:]))
# NOTE(review): the 'class PackIdx' header is elided in this listing; the
# methods below are the shared base of PackIdxV1/PackIdxV2.
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        # NOTE(review): a not-found guard line appears to be elided here.
        return self._ofs_from_idx(idx)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) != None):
            # Either this index's filename (for reporting) or plain True.
            return want_source and self.name or True

        # NOTE(review): the '__len__' method header is elided; the total
        # object count is the last (cumulative) fanout entry.
        return int(self.fanout[255])

    def _idx_from_hash(self, hash):
        # Binary search for 'hash' within the fanout bucket selected by
        # its first byte; elided lines handle the not-found case.
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1  # lookup table is a step
        # NOTE(review): the 'while start < end:' loop header is elided.
            mid = start + (end-start)/2
            v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        # NOTE(review): the 'self.name = filename' line is elided here.
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # 256-entry fanout: cumulative object counts per first SHA byte.
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]
        # V1 layout: 24-byte entries, each 4-byte offset + 20-byte SHA.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        # The pack offset is the first 4 bytes of the 24-byte entry.
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        # The SHA is the remaining 20 bytes of the entry.
        return str(self.shatable[idx*24+4 : idx*24+24])

        # NOTE(review): the '__iter__' method header is elided here;
        # yields each 20-byte SHA in index order.
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        # NOTE(review): the 'self.name = filename' line is elided here.
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # V2 magic '\377tOc' followed by the version number 2.
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # V2 stores SHAs, CRCs, and offsets in separate parallel tables.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        # NOTE(review): the closing argument of this buffer() call is
        # elided in this listing.
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
        # Spill table of 8-byte offsets for pack offsets >= 2**31.
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        # NOTE(review): the high-bit guard line is elided; a set high bit
        # means the value indexes the 64-bit offset table instead.
            idx64 = ofs & 0x7fffffff
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        # NOTE(review): the 'return ofs' line is elided in this listing.

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

        # NOTE(review): the '__iter__' method header is elided here;
        # yields each 20-byte SHA in index order.
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
# Re-export the C-implemented helpers used by the bloom/midx code below.
extract_bits = _helpers.extract_bits

bloom_contains = _helpers.bloom_contains
bloom_add = _helpers.bloom_add
347 """Wrapper which contains data from multiple index files.
349 def __init__(self, filename, f=None, readwrite=False, expected=-1):
353 assert(filename.endswith('.bloom'))
356 self.rwfile = f = f or open(filename, 'r+b')
359 # Decide if we want to mmap() the pages as writable ('immediate'
360 # write) or else map them privately for later writing back to
361 # the file ('delayed' write). A bloom table's write access
362 # pattern is such that we dirty almost all the pages after adding
363 # very few entries. But the table is so big that dirtying
364 # *all* the pages often exceeds Linux's default
365 # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
366 # thus causing it to start flushing the table before we're
367 # finished... even though there's more than enough space to
368 # store the bloom table in RAM.
370 # To work around that behaviour, if we calculate that we'll
371 # probably end up touching the whole table anyway (at least
372 # one bit flipped per memory page), let's use a "private" mmap,
373 # which defeats Linux's ability to flush it to disk. Then we'll
374 # flush it as one big lump during close().
375 pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
376 self.delaywrite = expected > pages
377 debug1('bloom: delaywrite=%r\n' % self.delaywrite)
379 self.map = mmap_readwrite_private(self.rwfile, close=False)
381 self.map = mmap_readwrite(self.rwfile, close=False)
384 f = f or open(filename, 'rb')
385 self.map = mmap_read(f)
386 got = str(self.map[0:4])
388 log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
389 return self._init_failed()
390 ver = struct.unpack('!I', self.map[4:8])[0]
391 if ver < BLOOM_VERSION:
392 log('Warning: ignoring old-style (v%d) bloom %r\n'
394 return self._init_failed()
395 if ver > BLOOM_VERSION:
396 log('Warning: ignoring too-new (v%d) bloom %r\n'
398 return self._init_failed()
400 self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
401 idxnamestr = str(self.map[16 + 2**self.bits:])
403 self.idxnames = idxnamestr.split('\0')
407 def _init_failed(self):
414 self.bits = self.entries = 0
417 return self.map and self.bits
423 if self.map and self.rwfile:
424 debug2("bloom: closing with %d entries\n" % self.entries)
425 self.map[12:16] = struct.pack('!I', self.entries)
428 self.rwfile.write(self.map)
431 self.rwfile.seek(16 + 2**self.bits)
433 self.rwfile.write('\0'.join(self.idxnames))
436 def pfalse_positive(self, additional=0):
437 n = self.entries + additional
440 return 100*(1-math.exp(-k*float(n)/m))**k
442 def add_idx(self, ix):
443 """Add the object to the filter, return current pfalse_positive."""
444 if not self.map: raise Exception, "Cannot add to closed bloom"
445 self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
446 self.idxnames.append(os.path.basename(ix.name))
448 def exists(self, sha):
449 """Return nonempty if the object probably exists in the bloom filter."""
450 global _total_searches, _total_steps
452 if not self.map: return None
453 found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
454 _total_steps += steps
458 def create(cls, name, expected, delaywrite=None, f=None, k=None):
459 """Create and return a bloom filter for `expected` entries."""
460 bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
461 k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
462 if bits > MAX_BLOOM_BITS[k]:
463 log('bloom: warning, max bits exceeded, non-optimal\n')
464 bits = MAX_BLOOM_BITS[k]
465 debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
466 f = f or open(name, 'w+b')
468 f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
469 assert(f.tell() == 16)
470 # NOTE: On some systems this will not extend+zerofill, but it does on
471 # darwin, linux, bsd and solaris.
472 f.truncate(16+2**bits)
474 if delaywrite != None and not delaywrite:
475 # tell it to expect very few objects, forcing a direct mmap
477 return cls(name, f=f, readwrite=True, expected=expected)
484 """Wrapper which contains data from multiple index files.
485 Multiple index (.midx) files constitute a wrapper around index (.idx) files
486 and make it possible for bup to expand Git's indexing capabilities to vast
489 def __init__(self, filename):
491 self.force_keep = False
492 assert(filename.endswith('.midx'))
493 self.map = mmap_read(open(filename))
494 if str(self.map[0:4]) != 'MIDX':
495 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
496 self.force_keep = True
497 return self._init_failed()
498 ver = struct.unpack('!I', self.map[4:8])[0]
499 if ver < MIDX_VERSION:
500 log('Warning: ignoring old-style (v%d) midx %r\n'
502 self.force_keep = False # old stuff is boring
503 return self._init_failed()
504 if ver > MIDX_VERSION:
505 log('Warning: ignoring too-new (v%d) midx %r\n'
507 self.force_keep = True # new stuff is exciting
508 return self._init_failed()
510 self.bits = _helpers.firstword(self.map[8:12])
511 self.entries = 2**self.bits
512 self.fanout = buffer(self.map, 12, self.entries*4)
513 self.sha_ofs = 12 + self.entries*4
514 self.nsha = nsha = self._fanget(self.entries-1)
515 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
516 self.whichlist = buffer(self.map, self.sha_ofs + nsha*20, nsha*4)
517 self.idxname_ofs = self.sha_ofs + 24*nsha
518 self.idxnames = str(self.map[self.idxname_ofs:]).split('\0')
520 def _init_failed(self):
523 self.fanout = buffer('\0\0\0\0')
524 self.shatable = buffer('\0'*20)
527 def _fanget(self, i):
529 s = self.fanout[start:start+4]
530 return _helpers.firstword(s)
533 return str(self.shatable[i*20:(i+1)*20])
535 def _get_idx_i(self, i):
536 return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
538 def _get_idxname(self, i):
539 return self.idxnames[self._get_idx_i(i)]
541 def exists(self, hash, want_source=False):
542 """Return nonempty if the object exists in the index files."""
543 global _total_searches, _total_steps
546 el = extract_bits(want, self.bits)
548 start = self._fanget(el-1)
549 startv = el << (32-self.bits)
553 end = self._fanget(el)
554 endv = (el+1) << (32-self.bits)
555 _total_steps += 1 # lookup table is a step
556 hashv = _helpers.firstword(hash)
557 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
560 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
561 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
562 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
564 #print ' %08x' % self._num(v)
567 startv = _helpers.firstword(v)
570 endv = _helpers.firstword(v)
572 return want_source and self._get_idxname(mid) or True
576 for i in xrange(self._fanget(self.entries-1)):
577 yield buffer(self.shatable, i*20, 20)
580 return int(self._fanget(self.entries-1))
# NOTE(review): the 'class PackIdxList' header is elided in this listing;
# the methods below belong to that class.
    def __init__(self, dir):
        # NOTE(review): attribute initialization lines are elided here.
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        self.do_bloom = False

        # NOTE(review): the '__del__' method header is elided here.
        assert(_mpi_count == 0)

        # NOTE(review): the '__iter__' method header is elided here.
        return iter(idxmerge(self.packs))

        # NOTE(review): the '__len__' method header is elided here.
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        # 'also' is an in-memory overlay of recently-added hashes.
        if hash in self.also:
        # Consult the bloom filter first; a bloom miss is definitive.
        if self.do_bloom and self.bloom is not None:
            _total_searches -= 1 # will be incremented by bloom
            if self.bloom.exists(hash):
                self.do_bloom = False
        for i in xrange(len(self.packs)):
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]

    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            # Map each .idx filename to the midx that already covers it.
            for ix in self.packs:
                if isinstance(ix, PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            for full in glob.glob(os.path.join(self.dir,'*.midx')):
                    (mxd, mxf) = os.path.split(mx.name)
                    for n in mx.idxnames:
                        if not os.path.exists(os.path.join(mxd, n)):
                            log(('warning: index %s missing\n' +
                                 '  used by %s\n') % (n, mxf))
            # Prefer larger midxes: each covers more idx files.
            midxl.sort(lambda x,y: -cmp(len(x),len(y)))
                for sub in ix.idxnames:
                    found = d.get(os.path.join(self.dir, sub))
                    if not found or isinstance(found, PackIdx):
                        # doesn't exist, or exists but not in a midx
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
                elif not ix.force_keep:
                    debug1('midx: removing redundant: %s\n'
                           % os.path.basename(ix.name))
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
            # Reopen the on-disk bloom filter, if one exists.
            bfull = os.path.join(self.dir, 'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = ShaBloom(bfull)
            self.packs = list(set(d.values()))
            # Largest packs first: most likely to contain a given object.
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

        # NOTE(review): the 'add' method header is elided here.
        """Insert an additional object in the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes "<type> <len>\0" + content, not the raw content alone.
    header = '%s %d\0' % (type, len(content))
    # NOTE(review): the Sha1 update/digest lines are elided in this listing.
# Sort key reproducing git's tree-entry ordering, where directories sort
# as if their name had a trailing '/'.
def _shalist_sort_key(ent):
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):
        # NOTE(review): the directory and non-directory return lines are
        # elided in this listing.
# Open a pack index file, dispatching on extension and header: .idx files
# become PackIdxV1/PackIdxV2 and .midx files become PackMidx.
def open_idx(filename):
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        # NOTE(review): the line reading the 8-byte header is elided here.
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            # NOTE(review): the 'version == 2' guard and the 'else:' lines
            # around the following return/raise are elided in this listing.
                return PackIdxV2(filename, f)
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            # V1 indexes have no magic; they start directly with the fanout.
            return PackIdxV1(filename, f)
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return PackMidx(filename)
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        # Progress callback invoked during the merge.
        progress('Reading indexes: %.2f%% (%d/%d)\r'
                 % (count*100.0/total, count, total))
    def pfinal(count, total):
        # NOTE(review): the 'if final_progress:' guard is elided here.
        log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
    # 10024-entry read-ahead buffer per input iterator.
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default objcache factory: an index list over this repo's packs."""
    packdir = repo('objects/pack')
    return PackIdxList(packdir)
763 """Writes Git objects insid a pack file."""
764 def __init__(self, objcache_maker=_make_objcache):
770 self.objcache_maker = objcache_maker
778 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
779 self.file = os.fdopen(fd, 'w+b')
780 assert(name.endswith('.pack'))
781 self.filename = name[:-5]
782 self.file.write('PACK\0\0\0\2\0\0\0\0')
783 self.idx = list(list() for i in xrange(256))
785 def _raw_write(self, datalist, sha):
788 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
789 # the file never has a *partial* blob. So let's make sure it's
790 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
791 # to our hashsplit algorithm.) f.write() does its own buffering,
792 # but that's okay because we'll flush it in _end().
793 oneblob = ''.join(datalist)
797 raise GitError, e, sys.exc_info()[2]
799 crc = zlib.crc32(oneblob) & 0xffffffff
800 self._update_idx(sha, crc, nw)
805 def _update_idx(self, sha, crc, size):
808 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
810 def _write(self, sha, type, content):
814 sha = calc_hash(type, content)
815 size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
818 def breakpoint(self):
819 """Clear byte and object counts and return the last processed id."""
821 self.outbytes = self.count = 0
824 def _require_objcache(self):
825 if self.objcache is None and self.objcache_maker:
826 self.objcache = self.objcache_maker()
827 if self.objcache is None:
829 "PackWriter not opened or can't check exists w/o objcache")
831 def exists(self, id, want_source=False):
832 """Return non-empty if an object is found in the object cache."""
833 self._require_objcache()
834 return self.objcache.exists(id, want_source=want_source)
836 def maybe_write(self, type, content):
837 """Write an object to the pack file if not present and return its id."""
838 self._require_objcache()
839 sha = calc_hash(type, content)
840 if not self.exists(sha):
841 self._write(sha, type, content)
842 self.objcache.add(sha)
845 def new_blob(self, blob):
846 """Create a blob object in the pack with the supplied content."""
847 return self.maybe_write('blob', blob)
849 def new_tree(self, shalist):
850 """Create a tree object in the pack."""
851 shalist = sorted(shalist, key = _shalist_sort_key)
853 for (mode,name,bin) in shalist:
856 assert(mode[0] != '0')
858 assert(len(bin) == 20)
859 l.append('%s %s\0%s' % (mode,name,bin))
860 return self.maybe_write('tree', ''.join(l))
862 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
864 if tree: l.append('tree %s' % tree.encode('hex'))
865 if parent: l.append('parent %s' % parent.encode('hex'))
866 if author: l.append('author %s %s' % (author, _git_date(adate)))
867 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
870 return self.maybe_write('commit', '\n'.join(l))
872 def new_commit(self, parent, tree, date, msg):
873 """Create a commit object in the pack."""
874 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
875 commit = self._new_commit(tree, parent,
876 userline, date, userline, date,
881 """Remove the pack file from disk."""
887 os.unlink(self.filename + '.pack')
889 def _end(self, run_midx=True):
891 if not f: return None
897 # update object count
899 cp = struct.pack('!i', self.count)
903 # calculate the pack sha1sum
906 for b in chunkyreader(f):
908 packbin = sum.digest()
912 idx_f = open(self.filename + '.idx', 'wb')
913 obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
916 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
917 if os.path.exists(self.filename + '.map'):
918 os.unlink(self.filename + '.map')
919 os.rename(self.filename + '.pack', nameprefix + '.pack')
920 os.rename(self.filename + '.idx', nameprefix + '.idx')
923 auto_midx(repo('objects/pack'))
926 def close(self, run_midx=True):
927 """Close the pack file and move it to its definitive path."""
928 return self._end(run_midx=run_midx)
930 def _write_pack_idx_v2(self, file, idx, packbin):
937 write('\377tOc\0\0\0\2')
942 write(struct.pack('!i', n))
943 part.sort(key=lambda x: x[0])
945 obj_list_sum = Sha1()
949 obj_list_sum.update(entry[0])
952 write(struct.pack('!I', entry[1]))
956 if entry[2] & 0x80000000:
957 write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
958 ofs64_list.append(struct.pack('!Q', entry[2]))
960 write(struct.pack('!i', entry[2]))
961 for ofs64 in ofs64_list:
965 file.write(sum.digest())
966 return obj_list_sum.hexdigest()
# NOTE(review): the "def _git_date(date):" header is elided in this listing.
    # Git's date format: "<epoch-seconds> <timezone-offset>".
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))


# NOTE(review): the "def _gitenv():" header is elided in this listing.
    # Point git at the bup repository (used as a Popen preexec_fn).
    os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    # NOTE(review): the lines appending 'refname' to argv are elided here.
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    # NOTE(review): error/empty-output guard lines are elided here.
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            # show-ref prints "<hex-sha> <refname>" per line.
            yield (name, sha.decode('hex'))
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    # NOTE(review): the lines validating the result and returning the
    # hash (or None) are elided in this listing.
def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date, commit).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    assert(not ref.startswith('-'))  # don't let the ref parse as an option
    # NOTE(review): opts initialization and the count guard are elided.
        opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    # Output alternates 'commit <hex>' lines with '%ct' timestamp lines.
    for row in p.stdout:
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
            yield (date, commit)
    rv = p.wait()  # not fatal
    # NOTE(review): the 'if rv:' guard before this raise is elided.
    raise GitError, 'git rev-list returned error %d' % rv
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    for (date, commit) in rev_list(ref, count=1):
        # NOTE(review): the 'return date' line is elided in this listing.
    raise GitError, 'no such commit %r' % ref
def rev_parse(committish):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # First, try to resolve it as a ref name...
    head = read_ref(committish)
    # NOTE(review): the guard around the debug/return is elided here.
        debug2("resolved from ref: commit = %s\n" % head.encode('hex'))

    # ...otherwise try it as a raw 40-char hex id in the local packs.
    pL = PackIdxList(repo('objects/pack'))

    if len(committish) == 40:
        # NOTE(review): a try/except guarding the hex decode (and the
        # subsequent pack lookup) is elided in this listing.
            hash = committish.decode('hex')
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    # NOTE(review): one or more validation lines are elided here.
    assert(refname.startswith('refs/heads/'))  # only branch heads allowed
    # Let git do the compare-and-swap: it fails if the ref != oldval.
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # NOTE(review): the explicit-path branch and surrounding guards are
    # elided here; precedence is: explicit path, then $BUP_DIR, then ~/.bup.
        repodir = os.environ.get('BUP_DIR')
            repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the parent directory does not exist or if the
    target exists but is not a directory.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # Fixed: this format string used '%d' (integer) for the path 'd',
        # which raised TypeError instead of the intended GitError message.
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    # NOTE(review): the call establishing 'repodir' is elided here.
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            # NOTE(review): the auto-initialization of the default
            # repository is elided in this listing.
            log('error: %r is not a bup/git repository\n' % repo())
            # NOTE(review): the process-exit line is elided here.
1129 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1131 while ofs < len(buf):
1132 z = buf[ofs:].find('\0')
1134 spl = buf[ofs:ofs+z].split(' ', 1)
1135 assert(len(spl) == 2)
1136 sha = buf[ofs+z+1:ofs+z+1+20]
1138 yield (spl[0], spl[1], sha)
1143 """Get Git's version and ensure a usable version is installed.
1145 The returned version is formatted as an ordered tuple with each position
1146 representing a digit in the version tag. For example, the following tuple
1147 would represent version 1.6.6.9:
1149 ('1', '6', '6', '9')
1153 p = subprocess.Popen(['git', '--version'],
1154 stdout=subprocess.PIPE)
1155 gvs = p.stdout.read()
1156 _git_wait('git --version', p)
1157 m = re.match(r'git version (\S+.\S+)', gvs)
1159 raise GitError('git --version weird output: %r' % gvs)
1160 _ver = tuple(m.group(1).split('.'))
1161 needed = ('1','5', '3', '1')
1163 raise GitError('git version %s or higher is required; you have %s'
1164 % ('.'.join(needed), '.'.join(_ver)))
def _git_wait(cmd, p):
    # Wait for subprocess 'p' and raise GitError on non-zero exit.
    # NOTE(review): the 'rv = p.wait()' line and its guard are elided.
        raise GitError('%s returned %d' % (cmd, rv))


def _git_capture(argv):
    # Run a git command and return its stdout, raising on failure.
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    # NOTE(review): the line reading p.stdout (and the return) is elided.
    _git_wait(repr(argv), p)
# Iterator wrapper that invokes an 'onabort' callback when iteration is
# abandoned early (keeps the cat-file pipe in a consistent state).
class _AbortableIter:
    def __init__(self, it, onabort = None):
        # NOTE(review): lines storing 'it' and the done flag are elided.
        self.onabort = onabort

        # NOTE(review): the '__iter__'/'next' method headers and the try
        # line are elided in this listing.
            return self.it.next()
        except StopIteration, e:
            # NOTE(review): completion bookkeeping lines are elided here.

        # NOTE(review): the 'abort' method header is elided here.
        """Abort iteration and call the abortion callback, if needed."""
1213 """Link to 'git cat-file' that is used to retrieve blob data."""
1216 wanted = ('1','5','6')
1219 log('warning: git version < %s; bup will be slow.\n'
1222 self.get = self._slow_get
1224 self.p = self.inprogress = None
1225 self.get = self._fast_get
1229 self.p.stdout.close()
1230 self.p.stdin.close()
1232 self.inprogress = None
1236 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1237 stdin=subprocess.PIPE,
1238 stdout=subprocess.PIPE,
1241 preexec_fn = _gitenv)
1243 def _fast_get(self, id):
1244 if not self.p or self.p.poll() != None:
1247 assert(self.p.poll() == None)
1249 log('_fast_get: opening %r while %r is open'
1250 % (id, self.inprogress))
1251 assert(not self.inprogress)
1252 assert(id.find('\n') < 0)
1253 assert(id.find('\r') < 0)
1254 assert(not id.startswith('-'))
1255 self.inprogress = id
1256 self.p.stdin.write('%s\n' % id)
1257 self.p.stdin.flush()
1258 hdr = self.p.stdout.readline()
1259 if hdr.endswith(' missing\n'):
1260 self.inprogress = None
1261 raise KeyError('blob %r is missing' % id)
1262 spl = hdr.split(' ')
1263 if len(spl) != 3 or len(spl[0]) != 40:
1264 raise GitError('expected blob, got %r' % spl)
1265 (hex, type, size) = spl
1267 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1268 onabort = self._abort)
1273 assert(self.p.stdout.readline() == '\n')
1274 self.inprogress = None
1275 except Exception, e:
1279 def _slow_get(self, id):
1280 assert(id.find('\n') < 0)
1281 assert(id.find('\r') < 0)
1282 assert(id[0] != '-')
1283 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1286 p = subprocess.Popen(['git', 'cat-file', type, id],
1287 stdout=subprocess.PIPE,
1288 preexec_fn = _gitenv)
1289 for blob in chunkyreader(p.stdout):
1291 _git_wait('git cat-file', p)
1293 def _join(self, it):
1298 elif type == 'tree':
1299 treefile = ''.join(it)
1300 for (mode, name, sha) in treeparse(treefile):
1301 for blob in self.join(sha.encode('hex')):
1303 elif type == 'commit':
1304 treeline = ''.join(it).split('\n')[0]
1305 assert(treeline.startswith('tree '))
1306 for blob in self.join(treeline[5:]):
1309 raise GitError('invalid object type %r: expected blob/tree/commit'
1313 """Generate a list of the content of all blobs that can be reached
1314 from an object. The hash given in 'id' must point to a blob, a tree
1315 or a commit. The content of all blobs that can be seen from trees or
1316 commits will be added to the list.
1319 for d in self._join(self.get(id)):
1321 except StopIteration:
1325 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1327 for (n,c) in list_refs():
1328 if n.startswith('refs/tags/'):
1333 tags[c].append(name) # more than one tag can point at 'c'