1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
11 """Discussion of bloom constants for bup:
13 There are four basic things to consider when building a bloom filter:
14 The size, in bits, of the filter
15 The capacity, in entries, of the filter
16 The probability of a false positive that is tolerable
17 The number of bits readily available to use for addressing filter bits
19 There is one major tunable that is not directly related to the above:
20 k: the number of bits set in the filter per entry
22 Here's a wall of numbers showing the relationship between k; the ratio between
23 the filter size in bits and the entries in the filter; and pfalse_positive:
25 mn|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
26 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
27 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
28 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
29 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
30 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
31 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
32 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
33 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
34 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
35 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
36 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
37 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
38 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
39 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
40 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
41 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
42 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
43 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
44 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
45 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
46 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
47 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
48 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
49 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
50 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
52 Here's a table showing available repository size for a given pfalse_positive
53 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
54 filter and 8192bytes per object):
56 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
57 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
58 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
59 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
61 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
64 filter size scales linearly with repository size for a given k and pfalse.
66 Here's a table of filter sizes for a 1 TiB repository:
68 pfalse| k=3 | k=4 | k=5 | k=6
69 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
70 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
71 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
74 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
75 per lookup will be worse than the two required for midx.
76 * We want the pfalse_positive to be low enough that the cost of sometimes
77 faulting on the midx doesn't overcome the benefit of the bloom filter.
78 * We have readily available 160 bits for addressing the filter.
79 * We want to be able to have a single bloom address entire repositories of
82 Based on these parameters, a combination of k=4 and k=5 provides the behavior
83 that bup needs. As such, I've implemented bloom addressing, adding and
84 checking functions in C for these two values. Because k=5 requires less space
85 and gives better overall pfalse_positive performance, it is preferred if a
86 table with k=5 can represent the repository.
88 None of this tells us what max_pfalse_positive to choose.
90 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
# Bloom-filter tuning constants (see the discussion block above) plus the
# git object-type <-> pack-header-nibble maps used by the pack codecs below.
93 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
94 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
95 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
# Default repository location when no explicit path/BUP_DIR is given.
99 home_repodir = os.path.expanduser('~/.bup')
# git object type name -> pack type number, and the inverse mapping.
102 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
103 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
109 class GitError(Exception):
114 """Get the path to the git repository or one of its subdirectories."""
117 raise GitError('You should call check_repo_or_die()')
119 # If there's a .git subdirectory, then the actual repo is in there.
120 gd = os.path.join(repodir, '.git')
121 if os.path.exists(gd):
124 return os.path.join(repodir, sub)
# Run 'bup midx --auto' and then 'bup bloom' over objdir as subprocesses.
# Failures are recorded via add_error() instead of being raised.
# NOTE(review): this listing elides lines (original numbering skips); the
# try/except that binds 'e' around each subprocess.call is not visible here.
127 def auto_midx(objdir):
128 args = [path.exe(), 'midx', '--auto', '--dir', objdir]
130 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
132 # make sure 'args' gets printed to help with debugging
133 add_error('%r: exception: %s' % (args, e))
136 add_error('%r: returned %d' % (args, rv))
138 args = [path.exe(), 'bloom', '--dir', objdir]
140 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
142 # make sure 'args' gets printed to help with debugging
143 add_error('%r: exception: %s' % (args, e))
146 add_error('%r: returned %d' % (args, rv))
149 def mangle_name(name, mode, gitmode):
150 """Mangle a file name to present an abstract name for segmented files.
151 Mangled file names will have the ".bup" extension added to them. If a
152 file's name already ends with ".bup", a ".bupl" extension is added to
153 disambiguate normal files from segmented ones.
# NOTE(review): the return for the chunked branch (regular file stored as a
# non-regular git object) and the final fall-through return are elided.
155 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
157 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
158 return name + '.bupl'
# Mode tags returned by demangle_name() describing how to read the file back.
163 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
164 def demangle_name(name):
165 """Remove name mangling from a file name, if necessary.
167 The return value is a tuple (demangled_filename,mode), where mode is one of
170 * BUP_NORMAL : files that should be read as-is from the repository
171 * BUP_CHUNKED : files that were chunked and need to be assembled
173 For more information on the name mangling algorithm, see mangle_name()
175 if name.endswith('.bupl'):
176 return (name[:-5], BUP_NORMAL)
177 elif name.endswith('.bup'):
178 return (name[:-4], BUP_CHUNKED)
# NOTE(review): the 'else:' introducing the next line is elided in this listing.
180 return (name, BUP_NORMAL)
# Generator: yields a git pack-object header (low type nibble from _typemap,
# size as a 7-bit varint with 0x80 continuation) then zlib-compressed content.
# NOTE(review): the 'sz' setup, header yields, varint loop and the final
# z.flush() are elided from this listing.
183 def _encode_packobj(type, content):
186 szbits = (sz & 0x0f) | (_typemap[type]<<4)
189 if sz: szbits |= 0x80
195 z = zlib.compressobj(1)
197 yield z.compress(content)
# Generator: a zlib-compressed git loose object, i.e. '<type> <len>\0' header
# followed by the raw content.
# NOTE(review): a trailing 'yield z.flush()' appears to be elided — confirm
# against the full source before relying on the compressed stream being
# complete.
201 def _encode_looseobj(type, content):
202 z = zlib.compressobj(1)
203 yield z.compress('%s %d\0' % (type, len(content)))
204 yield z.compress(content)
# Inverse of _encode_looseobj: decompress buf and split it into
# (type, content) from the '<type> <size>\0<content>' layout.
# NOTE(review): the header-parsing lines that bind type/sz/content between
# the decompress and the asserts are elided here.
208 def _decode_looseobj(buf):
210 s = zlib.decompress(buf)
217 assert(type in _typemap)
218 assert(sz == len(content))
219 return (type, content)
# Inverse of _encode_packobj: read the type nibble and the 7-bit varint size
# from the header, then decompress the payload that follows byte i.
# NOTE(review): the bindings of c/i/shift and the varint loop are elided.
222 def _decode_packobj(buf):
225 type = _typermap[(c & 0x70) >> 4]
232 sz |= (c & 0x7f) << shift
236 return (type, zlib.decompress(buf[i+1:]))
243 def find_offset(self, hash):
244 """Get the offset of an object inside the index file."""
245 idx = self._idx_from_hash(hash)
# NOTE(review): a guard between these lines (presumably 'if idx != None:')
# and the not-found return path are elided.
247 return self._ofs_from_idx(idx)
250 def exists(self, hash, want_source=False):
251 """Return nonempty if the object exists in this index."""
# Returns self.name (the idx filename) when want_source is true, else True;
# the not-found return path is elided from this listing.
252 if hash and (self._idx_from_hash(hash) != None):
253 return want_source and self.name or True
257 return int(self.fanout[255])
# Binary-search the sorted sha table inside the fanout bucket selected by
# the hash's first byte; the search loop, comparisons, and return paths are
# elided from this listing.
259 def _idx_from_hash(self, hash):
260 global _total_searches, _total_steps
262 assert(len(hash) == 20)
264 start = self.fanout[b1-1] # range -1..254
265 end = self.fanout[b1] # range 0..255
267 _total_steps += 1 # lookup table is a step
270 mid = start + (end-start)/2
271 v = self._idx_to_hash(mid)
281 class PackIdxV1(PackIdx):
282 """Object representation of a Git pack index (version 1) file."""
283 def __init__(self, filename, f):
285 self.idxnames = [self.name]
286 self.map = mmap_read(f)
# v1 layout: 256-entry big-endian fanout table, then 24-byte rows of
# (4-byte offset, 20-byte sha).
287 self.fanout = list(struct.unpack('!256I',
288 str(buffer(self.map, 0, 256*4))))
289 self.fanout.append(0) # entry "-1"
290 nsha = self.fanout[255]
# NOTE(review): the line assigning self.sha_ofs is elided here.
292 self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
294 def _ofs_from_idx(self, idx):
295 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
297 def _idx_to_hash(self, idx):
298 return str(self.shatable[idx*24+4 : idx*24+24])
# NOTE(review): the 'def __iter__(self):' header for the next two lines is
# elided in this listing.
301 for i in xrange(self.fanout[255]):
302 yield buffer(self.map, 256*4 + 24*i + 4, 20)
305 class PackIdxV2(PackIdx):
306 """Object representation of a Git pack index (version 2) file."""
307 def __init__(self, filename, f):
309 self.idxnames = [self.name]
310 self.map = mmap_read(f)
# v2 magic '\377tOc' plus version 2, then fanout / shas / crcs / offsets /
# 64-bit offset table.
311 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
312 self.fanout = list(struct.unpack('!256I',
313 str(buffer(self.map, 8, 256*4))))
314 self.fanout.append(0) # entry "-1"
315 nsha = self.fanout[255]
316 self.sha_ofs = 8 + 256*4
317 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
# ofstable skips past the nsha*4 crc table that follows the sha table.
318 self.ofstable = buffer(self.map,
319 self.sha_ofs + nsha*20 + nsha*4,
321 self.ofs64table = buffer(self.map,
322 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
324 def _ofs_from_idx(self, idx):
325 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
# NOTE(review): the 'if ofs & 0x80000000:' guard and final 'return ofs' are
# elided; high bit set means the low 31 bits index the 64-bit table.
327 idx64 = ofs & 0x7fffffff
328 ofs = struct.unpack('!Q',
329 str(buffer(self.ofs64table, idx64*8, 8)))[0]
332 def _idx_to_hash(self, idx):
333 return str(self.shatable[idx*20:(idx+1)*20])
# NOTE(review): the 'def __iter__(self):' header for the next two lines is
# elided in this listing.
336 for i in xrange(self.fanout[255]):
337 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
# Re-export the C-accelerated helpers used by the bloom/midx code below.
340 extract_bits = _helpers.extract_bits
342 bloom_contains = _helpers.bloom_contains
343 bloom_add = _helpers.bloom_add
347 """Wrapper which contains data from multiple index files.
# Open an existing .bloom file, read-write (mmap, possibly private/delayed —
# see comment below) or read-only; bad magic/version bails via _init_failed().
# NOTE(review): the if/else structure selecting between the readwrite and
# read-only paths, and the header-check conditional, are elided here.
349 def __init__(self, filename, f=None, readwrite=False, expected=-1):
353 assert(filename.endswith('.bloom'))
356 self.rwfile = f = f or open(filename, 'r+b')
359 # Decide if we want to mmap() the pages as writable ('immediate'
360 # write) or else map them privately for later writing back to
361 # the file ('delayed' write). A bloom table's write access
362 # pattern is such that we dirty almost all the pages after adding
363 # very few entries. But the table is so big that dirtying
364 # *all* the pages often exceeds Linux's default
365 # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
366 # thus causing it to start flushing the table before we're
367 # finished... even though there's more than enough space to
368 # store the bloom table in RAM.
370 # To work around that behaviour, if we calculate that we'll
371 # probably end up touching the whole table anyway (at least
372 # one bit flipped per memory page), let's use a "private" mmap,
373 # which defeats Linux's ability to flush it to disk. Then we'll
374 # flush it as one big lump during close().
375 pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
376 self.delaywrite = expected > pages
377 debug1('bloom: delaywrite=%r\n' % self.delaywrite)
379 self.map = mmap_readwrite_private(self.rwfile, close=False)
381 self.map = mmap_readwrite(self.rwfile, close=False)
384 f = f or open(filename, 'rb')
385 self.map = mmap_read(f)
386 got = str(self.map[0:4])
388 log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
389 return self._init_failed()
390 ver = struct.unpack('!I', self.map[4:8])[0]
391 if ver < BLOOM_VERSION:
392 log('Warning: ignoring old-style (v%d) bloom %r\n'
394 return self._init_failed()
395 if ver > BLOOM_VERSION:
396 log('Warning: ignoring too-new (v%d) bloom %r\n'
398 return self._init_failed()
# Header tail: bits (table is 2**bits bytes), hash count k, entry count;
# idx names are stored NUL-separated after the table.
400 self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
401 idxnamestr = str(self.map[16 + 2**self.bits:])
403 self.idxnames = idxnamestr.split('\0')
407 def _init_failed(self):
# Reset state so the filter reads as invalid; other attribute resets in
# this method are elided from this listing.
414 self.bits = self.entries = 0
417 return self.map and self.bits
423 if self.map and self.rwfile:
424 debug2("bloom: closing with %d entries\n" % self.entries)
425 self.map[12:16] = struct.pack('!I', self.entries)
428 self.rwfile.write(self.map)
431 self.rwfile.seek(16 + 2**self.bits)
433 self.rwfile.write('\0'.join(self.idxnames))
436 def pfalse_positive(self, additional=0):
# Standard bloom false-positive estimate (1 - e^(-kn/m))^k as a percentage,
# optionally pretending 'additional' more entries were added.
437 n = self.entries + additional
# NOTE(review): the bindings of k and m (total filter bits) are elided.
440 return 100*(1-math.exp(-k*float(n)/m))**k
442 def add_idx(self, ix):
443 """Add the object to the filter, return current pfalse_positive."""
444 if not self.map: raise Exception, "Cannot add to closed bloom"
# bloom_add processes the idx's whole shatable in one C call and returns
# how many entries it added.
445 self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
446 self.idxnames.append(os.path.basename(ix.name))
448 def exists(self, sha):
449 """Return nonempty if the object probably exists in the bloom filter."""
450 global _total_searches, _total_steps
452 if not self.map: return None
453 found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
454 _total_steps += steps
# NOTE(review): the final 'return found' line is elided from this listing.
# NOTE(review): the @classmethod decorator line is elided in this listing.
458 def create(cls, name, expected, delaywrite=None, f=None, k=None):
459 """Create and return a bloom filter for `expected` entries."""
# Size the table so each expected entry gets MAX_BITS_EACH bits, capped by
# the addressable bits for the chosen k.
460 bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
461 k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
462 if bits > MAX_BLOOM_BITS[k]:
463 log('bloom: warning, max bits exceeded, non-optimal\n')
464 bits = MAX_BLOOM_BITS[k]
465 debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
466 f = f or open(name, 'w+b')
# NOTE(review): the write of the 'BLOM' magic (making the header total 16
# bytes, per the assert) is elided before this struct.pack write.
468 f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
469 assert(f.tell() == 16)
470 # NOTE: On some systems this will not extend+zerofill, but it does on
471 # darwin, linux, bsd and solaris.
472 f.truncate(16+2**bits)
474 if delaywrite != None and not delaywrite:
475 # tell it to expect very few objects, forcing a direct mmap
477 return cls(name, f=f, readwrite=True, expected=expected)
484 """Wrapper which contains data from multiple index files.
485 Multiple index (.midx) files constitute a wrapper around index (.idx) files
486 and make it possible for bup to expand Git's indexing capabilities to vast
# Open and validate a .midx file; on bad magic or version mismatch, bail
# via _init_failed() with force_keep set to control later cleanup.
489 def __init__(self, filename):
491 self.force_keep = False
492 assert(filename.endswith('.midx'))
493 self.map = mmap_read(open(filename))
494 if str(self.map[0:4]) != 'MIDX':
495 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
496 self.force_keep = True
497 return self._init_failed()
498 ver = struct.unpack('!I', self.map[4:8])[0]
499 if ver < MIDX_VERSION:
500 log('Warning: ignoring old-style (v%d) midx %r\n'
502 self.force_keep = False # old stuff is boring
503 return self._init_failed()
504 if ver > MIDX_VERSION:
505 log('Warning: ignoring too-new (v%d) midx %r\n'
507 self.force_keep = True # new stuff is exciting
508 return self._init_failed()
# Layout after the 12-byte header: 2**bits fanout entries, nsha*20 shas,
# nsha*4 which-idx indices, then NUL-separated idx names.
510 self.bits = _helpers.firstword(self.map[8:12])
511 self.entries = 2**self.bits
512 self.fanout = buffer(self.map, 12, self.entries*4)
513 self.sha_ofs = 12 + self.entries*4
514 self.nsha = nsha = self._fanget(self.entries-1)
515 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
516 self.whichlist = buffer(self.map, self.sha_ofs + nsha*20, nsha*4)
517 self.idxname_ofs = self.sha_ofs + 24*nsha
518 self.idxnames = str(self.map[self.idxname_ofs:]).split('\0')
520 def _init_failed(self):
# Substitute tiny dummy tables so later accessors don't crash; other
# resets in this method are elided from this listing.
523 self.fanout = buffer('\0\0\0\0')
524 self.shatable = buffer('\0'*20)
527 def _fanget(self, i):
# Read fanout entry i as a big-endian word via the C helper.
# NOTE(review): the binding of 'start' (presumably i*4) is elided.
529 s = self.fanout[start:start+4]
530 return _helpers.firstword(s)
533 return str(self.shatable[i*20:(i+1)*20])
535 def _get_idx_i(self, i):
# Which source .idx (by position in idxnames) entry i came from.
536 return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
538 def _get_idxname(self, i):
# Filename of the source .idx for entry i.
539 return self.idxnames[self._get_idx_i(i)]
541 def exists(self, hash, want_source=False):
542 """Return nonempty if the object exists in the index files."""
543 global _total_searches, _total_steps
# Interpolation search within the fanout bucket picked by the top self.bits
# bits of the hash; the surrounding while loop, the v binding, and the
# comparison branches that narrow start/end are elided from this listing.
546 el = extract_bits(want, self.bits)
548 start = self._fanget(el-1)
549 startv = el << (32-self.bits)
553 end = self._fanget(el)
554 endv = (el+1) << (32-self.bits)
555 _total_steps += 1 # lookup table is a step
556 hashv = _helpers.firstword(hash)
557 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
560 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
561 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
562 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
564 #print ' %08x' % self._num(v)
567 startv = _helpers.firstword(v)
570 endv = _helpers.firstword(v)
572 return want_source and self._get_idxname(mid) or True
576 for i in xrange(self._fanget(self.entries-1)):
577 yield buffer(self.shatable, i*20, 20)
580 return int(self._fanget(self.entries-1))
585 def __init__(self, dir):
587 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
# NOTE(review): the other attribute initializations (dir, packs, bloom,
# also-set, _mpi_count bump) are elided from this listing.
592 self.do_bloom = False
599 assert(_mpi_count == 0)
602 return iter(idxmerge(self.packs))
605 return sum(len(pack) for pack in self.packs)
607 def exists(self, hash, want_source=False):
608 """Return nonempty if the object exists in the index files."""
609 global _total_searches
# Fast paths first: the 'also' set, then the bloom filter (a bloom miss is
# definitive); otherwise fall back to scanning the per-pack indexes with a
# move-to-front heuristic. Several guard/return lines are elided here.
611 if hash in self.also:
613 if self.do_bloom and self.bloom is not None:
614 _total_searches -= 1 # will be incremented by bloom
615 if self.bloom.exists(hash):
616 self.do_bloom = False
619 for i in xrange(len(self.packs)):
621 _total_searches -= 1 # will be incremented by sub-pack
622 ix = p.exists(hash, want_source=want_source)
624 # reorder so most recently used packs are searched first
625 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
630 def refresh(self, skip_midx = False):
631 """Refresh the index list.
632 This method verifies if .midx files were superseded (e.g. all of its
633 contents are in another, bigger .midx file) and removes the superseded
636 If skip_midx is True, all work on .midx files will be skipped and .midx
637 files will be removed from the list.
639 The module-global variable 'ignore_midx' can force this function to
640 always act as if skip_midx was True.
642 self.bloom = None # Always reopen the bloom as it may have been replaced
643 self.do_bloom = False
644 skip_midx = skip_midx or ignore_midx
# Keep already-open packs (except midxes when skipping them) keyed by path.
645 d = dict((p.name, p) for p in self.packs
646 if not skip_midx or not isinstance(p, PackMidx))
647 if os.path.exists(self.dir):
650 for ix in self.packs:
651 if isinstance(ix, PackMidx):
652 for name in ix.idxnames:
653 d[os.path.join(self.dir, name)] = ix
# Open any .midx not seen yet and check its constituent .idx files exist.
# NOTE(review): several lines in this loop (the midxl accumulation, the
# 'broken' flagging, unlink/keep decisions) are elided from this listing.
654 for full in glob.glob(os.path.join(self.dir,'*.midx')):
657 (mxd, mxf) = os.path.split(mx.name)
659 for n in mx.idxnames:
660 if not os.path.exists(os.path.join(mxd, n)):
661 log(('warning: index %s missing\n' +
662 ' used by %s\n') % (n, mxf))
# Prefer bigger midxes; drop midxes fully covered by ones already chosen.
669 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
672 for sub in ix.idxnames:
673 found = d.get(os.path.join(self.dir, sub))
674 if not found or isinstance(found, PackIdx):
675 # doesn't exist, or exists but not in a midx
680 for name in ix.idxnames:
681 d[os.path.join(self.dir, name)] = ix
682 elif not ix.force_keep:
683 debug1('midx: removing redundant: %s\n'
684 % os.path.basename(ix.name))
686 for full in glob.glob(os.path.join(self.dir,'*.idx')):
694 bfull = os.path.join(self.dir, 'bup.bloom')
695 if self.bloom is None and os.path.exists(bfull):
696 self.bloom = ShaBloom(bfull)
697 self.packs = list(set(d.values()))
698 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
# Only trust the bloom filter if it is valid and covers every object we have.
699 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
703 debug1('PackIdxList: using %d index%s.\n'
704 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
707 """Insert an additional object in the list."""
711 def calc_hash(type, content):
712 """Calculate some content's hash in the Git fashion."""
# Git hashes '<type> <len>\0' + content; the hashing and return lines are
# elided from this listing.
713 header = '%s %d\0' % (type, len(content))
# Sort key for tree entries. The octal-mode string is parsed to detect
# directories; the per-branch return values are elided from this listing
# (presumably directory names are adjusted to match git's tree sort order —
# TODO confirm against full source).
719 def _shalist_sort_key(ent):
720 (mode, name, id) = ent
721 if stat.S_ISDIR(int(mode, 8)):
# Open a pack index by extension: v2 .idx (by magic), v1 .idx (no magic, so
# the first 4 bytes sort below '\377tOc'), or a .midx wrapper.
# NOTE(review): the header read and the version==2 check lines are elided.
727 def open_idx(filename):
728 if filename.endswith('.idx'):
729 f = open(filename, 'rb')
731 if header[0:4] == '\377tOc':
732 version = struct.unpack('!I', header[4:8])[0]
734 return PackIdxV2(filename, f)
736 raise GitError('%s: expected idx file version 2, got %d'
737 % (filename, version))
738 elif len(header) == 8 and header[0:4] < '\377tOc':
739 return PackIdxV1(filename, f)
741 raise GitError('%s: unrecognized idx file header' % filename)
742 elif filename.endswith('.midx'):
743 return PackMidx(filename)
745 raise GitError('idx filenames must end with .idx or .midx')
748 def idxmerge(idxlist, final_progress=True):
749 """Generate a list of all the objects reachable in a PackIdxList."""
# Progress callbacks passed to merge_iter; pfinal only logs when
# final_progress is set (its guard line is elided here).
750 def pfunc(count, total):
751 progress('Reading indexes: %.2f%% (%d/%d)\r'
752 % (count*100.0/total, count, total))
753 def pfinal(count, total):
755 log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
756 return merge_iter(idxlist, 10024, pfunc, pfinal)
# Default objcache factory for PackWriter: index of all local packs.
759 def _make_objcache():
760 return PackIdxList(repo('objects/pack'))
763 """Writes Git objects inside a pack file."""
764 def __init__(self, objcache_maker=_make_objcache):
# Factory is called lazily by _require_objcache(); the other attribute
# initializations in this method are elided from this listing.
770 self.objcache_maker = objcache_maker
778 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
779 self.file = os.fdopen(fd, 'w+b')
780 assert(name.endswith('.pack'))
781 self.filename = name[:-5]
782 self.file.write('PACK\0\0\0\2\0\0\0\0')
783 self.idx = list(list() for i in xrange(256))
785 # the 'sha' parameter is used in client.py's _raw_write(), but not needed
786 # in this basic version.
787 def _raw_write(self, datalist, sha):
790 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
791 # the file never has a *partial* blob. So let's make sure it's
792 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
793 # to our hashsplit algorithm.) f.write() does its own buffering,
794 # but that's okay because we'll flush it in _end().
795 oneblob = ''.join(datalist)
# NOTE(review): the try/except around the write (which binds 'e' and 'nw')
# and the byte/count bookkeeping + return are elided from this listing.
799 raise GitError, e, sys.exc_info()[2]
801 crc = zlib.crc32(oneblob) & 0xffffffff
802 self._update_idx(sha, crc, nw)
807 def _update_idx(self, sha, crc, size):
# Record (sha, crc, pack offset) bucketed by the sha's first byte, matching
# the fanout layout _write_pack_idx_v2 emits.
810 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
812 def _write(self, sha, type, content):
# NOTE(review): the guard deciding when to (re)compute sha and the final
# return are elided from this listing.
816 sha = calc_hash(type, content)
817 size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
820 def breakpoint(self):
821 """Clear byte and object counts and return the last processed id."""
# NOTE(review): the line capturing the id to return, and the return itself,
# are elided from this listing.
823 self.outbytes = self.count = 0
826 def _require_objcache(self):
# Lazily build the object cache via the configured factory.
827 if self.objcache is None and self.objcache_maker:
828 self.objcache = self.objcache_maker()
829 if self.objcache is None:
# NOTE(review): the 'raise GitError(' opening of this statement is elided
# before the message line below.
831 "PackWriter not opened or can't check exists w/o objcache")
833 def exists(self, id, want_source=False):
834 """Return non-empty if an object is found in the object cache."""
835 self._require_objcache()
836 return self.objcache.exists(id, want_source=want_source)
838 def maybe_write(self, type, content):
839 """Write an object to the pack file if not present and return its id."""
840 self._require_objcache()
841 sha = calc_hash(type, content)
842 if not self.exists(sha):
843 self._write(sha, type, content)
844 self.objcache.add(sha)
# NOTE(review): the final 'return sha' is elided from this listing.
847 def new_blob(self, blob):
848 """Create a blob object in the pack with the supplied content."""
849 return self.maybe_write('blob', blob)
851 def new_tree(self, shalist):
852 """Create a tree object in the pack."""
# Entries are sorted git-style, then serialized as '<mode> <name>\0<sha>'.
# NOTE(review): the 'l = []' initialization and the mode-normalization lines
# inside the loop are elided from this listing.
853 shalist = sorted(shalist, key = _shalist_sort_key)
855 for (mode,name,bin) in shalist:
858 assert(mode[0] != '0')
860 assert(len(bin) == 20)
861 l.append('%s %s\0%s' % (mode,name,bin))
862 return self.maybe_write('tree', ''.join(l))
864 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
# Assemble the commit object line by line; the 'l = []' initialization and
# the blank-line/msg append are elided from this listing.
866 if tree: l.append('tree %s' % tree.encode('hex'))
867 if parent: l.append('parent %s' % parent.encode('hex'))
868 if author: l.append('author %s %s' % (author, _git_date(adate)))
869 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
872 return self.maybe_write('commit', '\n'.join(l))
874 def new_commit(self, parent, tree, date, msg):
875 """Create a commit object in the pack."""
# The current user acts as both author and committer with the same date.
876 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
877 commit = self._new_commit(tree, parent,
878 userline, date, userline, date,
# NOTE(review): the trailing 'msg)' argument line and the return are elided.
883 """Remove the pack file from disk."""
889 os.unlink(self.filename + '.pack')
891 def _end(self, run_midx=True):
# Finalize the pack: patch the object count into the header, append the
# pack's SHA-1 trailer, write the .idx, then rename both files to their
# content-addressed names. Several bookkeeping lines (f binding/seeks,
# sum updates, idx flattening, return value) are elided from this listing.
893 if not f: return None
899 # update object count
901 cp = struct.pack('!i', self.count)
905 # calculate the pack sha1sum
908 for b in chunkyreader(f):
910 packbin = sum.digest()
914 idx_f = open(self.filename + '.idx', 'wb')
915 obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
918 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
919 if os.path.exists(self.filename + '.map'):
920 os.unlink(self.filename + '.map')
921 os.rename(self.filename + '.pack', nameprefix + '.pack')
922 os.rename(self.filename + '.idx', nameprefix + '.idx')
925 auto_midx(repo('objects/pack'))
928 def close(self, run_midx=True):
929 """Close the pack file and move it to its definitive path."""
930 return self._end(run_midx=run_midx)
932 def _write_pack_idx_v2(self, file, idx, packbin):
# Emit a v2 pack index: magic+version, 256-entry fanout, shas, crcs,
# 32-bit offsets (high bit -> index into trailing 64-bit offset table),
# then the pack checksum and the idx's own SHA-1 trailer. Returns the
# hex SHA-1 over the sorted object list. Several loop-structure and
# checksum lines are elided from this listing.
939 write('\377tOc\0\0\0\2')
944 write(struct.pack('!i', n))
945 part.sort(key=lambda x: x[0])
947 obj_list_sum = Sha1()
951 obj_list_sum.update(entry[0])
954 write(struct.pack('!I', entry[1]))
958 if entry[2] & 0x80000000:
959 write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
960 ofs64_list.append(struct.pack('!Q', entry[2]))
962 write(struct.pack('!i', entry[2]))
963 for ofs64 in ofs64_list:
967 file.write(sum.digest())
968 return obj_list_sum.hexdigest()
972 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
976 os.environ['GIT_DIR'] = os.path.abspath(repo())
979 def list_refs(refname = None):
980 """Generate a list of tuples in the form (refname,hash).
981 If a ref name is specified, list only this particular ref.
# Shells out to 'git show-ref --'; a nonzero exit (e.g. no refs) is treated
# as an empty listing. The refname argv append and the rv guard are elided.
983 argv = ['git', 'show-ref', '--']
986 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
987 out = p.stdout.read().strip()
988 rv = p.wait() # not fatal
992 for d in out.split('\n'):
993 (sha, name) = d.split(' ', 1)
994 yield (name, sha.decode('hex'))
997 def read_ref(refname):
998 """Get the commit id of the most recent commit made on a given ref."""
999 l = list(list_refs(refname))
# NOTE(review): the lines selecting from 'l' and returning a result (or
# None) are elided from this listing.
1007 def rev_list(ref, count=None):
1008 """Generate a list of reachable commits in reverse chronological order.
1010 This generator walks through commits, from child to parent, that are
1011 reachable via the specified ref and yields a series of tuples of the form
1014 If count is a non-zero integer, limit the number of commits to "count"
# Parses 'git rev-list --pretty=format:%ct' output: 'commit <hex>' lines
# alternate with timestamp lines. The opts initialization, the timestamp
# parsing that binds 'date', and the rv guard structure are elided here.
1017 assert(not ref.startswith('-'))
1020 opts += ['-n', str(atoi(count))]
1021 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
1022 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
1024 for row in p.stdout:
1026 if s.startswith('commit '):
1027 commit = s[7:].decode('hex')
1030 yield (date, commit)
1031 rv = p.wait() # not fatal
1033 raise GitError, 'git rev-list returned error %d' % rv
1036 def rev_get_date(ref):
1037 """Get the date of the latest commit on the specified ref."""
# The 'return date' inside the loop is elided; an empty rev_list means the
# ref doesn't resolve to any commit.
1038 for (date, commit) in rev_list(ref, count=1):
1040 raise GitError, 'no such commit %r' % ref
1043 def rev_parse(committish):
1044 """Resolve the full hash for 'committish', if it exists.
1046 Should be roughly equivalent to 'git rev-parse'.
1048 Returns the hex value of the hash if it is found, None if 'committish' does
1049 not correspond to anything.
# First try it as a ref; failing that, treat a 40-char string as a raw hex
# sha and look it up in the local pack indexes. The return statements and
# the existence check against pL are elided from this listing.
1051 head = read_ref(committish)
1053 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1056 pL = PackIdxList(repo('objects/pack'))
1058 if len(committish) == 40:
1060 hash = committish.decode('hex')
1070 def update_ref(refname, newval, oldval):
1071 """Change the commit pointed to by a branch."""
# Only branch refs are accepted; delegates the compare-and-swap (oldval ->
# newval) to 'git update-ref'.
1074 assert(refname.startswith('refs/heads/'))
1075 p = subprocess.Popen(['git', 'update-ref', refname,
1076 newval.encode('hex'), oldval.encode('hex')],
1077 preexec_fn = _gitenv)
1078 _git_wait('git update-ref', p)
1081 def guess_repo(path=None):
1082 """Set the path value in the global variable "repodir".
1083 This makes bup look for an existing bup repository, but not fail if a
1084 repository doesn't exist. Usually, if you are interacting with a bup
1085 repository, you would not be calling this function but using
1086 check_repo_or_die().
# Resolution order (guards elided in this listing): explicit path argument,
# then $BUP_DIR, then ~/.bup as the last resort.
1092 repodir = os.environ.get('BUP_DIR')
1094 repodir = os.path.expanduser('~/.bup')
1097 def init_repo(path=None):
1098 """Create the Git bare repository for bup in a given path."""
1100 d = repo() # appends a / to the path
1101 parent = os.path.dirname(os.path.dirname(d))
1102 if parent and not os.path.exists(parent):
1103 raise GitError('parent directory "%s" does not exist\n' % parent)
1104 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
# NOTE(review): bug — '%d' formats the string path 'd'; this raise would
# itself fail with a TypeError. Should be '%s'.
1105 raise GitError('"%d" exists but is not a directory\n' % d)
1106 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1107 preexec_fn = _gitenv)
1108 _git_wait('git init', p)
1109 # Force the index version configuration in order to ensure bup works
1110 # regardless of the version of the installed Git binary.
1111 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1112 stdout=sys.stderr, preexec_fn = _gitenv)
1113 _git_wait('git config', p)
1116 def check_repo_or_die(path=None):
1117 """Make sure a bup repository exists, and abort if not.
1118 If the path to a particular repository was not specified, this function
1119 initializes the default repository automatically.
# The guess_repo() call before this check and the exit/init lines inside
# the branches are elided from this listing.
1122 if not os.path.isdir(repo('objects/pack/.')):
1123 if repodir == home_repodir:
1126 log('error: %r is not a bup/git repository\n' % repo())
1131 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1133 while ofs < len(buf):
1134 z = buf[ofs:].find('\0')
1136 spl = buf[ofs:ofs+z].split(' ', 1)
1137 assert(len(spl) == 2)
1138 sha = buf[ofs+z+1:ofs+z+1+20]
1140 yield (spl[0], spl[1], sha)
1145 """Get Git's version and ensure a usable version is installed.
1147 The returned version is formatted as an ordered tuple with each position
1148 representing a digit in the version tag. For example, the following tuple
1149 would represent version 1.6.6.9:
1151 ('1', '6', '6', '9')
1155 p = subprocess.Popen(['git', '--version'],
1156 stdout=subprocess.PIPE)
1157 gvs = p.stdout.read()
1158 _git_wait('git --version', p)
1159 m = re.match(r'git version (\S+.\S+)', gvs)
1161 raise GitError('git --version weird output: %r' % gvs)
1162 _ver = tuple(m.group(1).split('.'))
1163 needed = ('1','5', '3', '1')
1165 raise GitError('git version %s or higher is required; you have %s'
1166 % ('.'.join(needed), '.'.join(_ver)))
1170 def _git_wait(cmd, p):
# NOTE(review): the 'rv = p.wait()' binding and its guard are elided before
# this raise.
1173 raise GitError('%s returned %d' % (cmd, rv))
1176 def _git_capture(argv):
# Run a git command with the repo environment and capture stdout; the
# stdout read and return are elided from this listing.
1177 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1179 _git_wait(repr(argv), p)
1183 class _AbortableIter:
# Iterator wrapper that invokes an optional callback when iteration is
# aborted or exhausted (used to keep the cat-file pipe in a sane state).
1184 def __init__(self, it, onabort = None):
1186 self.onabort = onabort
# NOTE(review): the __iter__/next method headers, the try, and the
# StopIteration cleanup body are elided from this listing.
1194 return self.it.next()
1195 except StopIteration, e:
1203 """Abort iteration and call the abortion callback, if needed."""
1215 """Link to 'git cat-file' that is used to retrieve blob data."""
1218 wanted = ('1','5','6')
1221 log('warning: git version < %s; bup will be slow.\n'
1224 self.get = self._slow_get
1226 self.p = self.inprogress = None
1227 self.get = self._fast_get
1231 self.p.stdout.close()
1232 self.p.stdin.close()
1234 self.inprogress = None
1238 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1239 stdin=subprocess.PIPE,
1240 stdout=subprocess.PIPE,
1243 preexec_fn = _gitenv)
# Fetch one object through the long-running 'git cat-file --batch' pipe.
# Writes the id, reads the '<sha> <type> <size>' header, then yields the
# type and a chunked reader over exactly <size> bytes.
# NOTE(review): the _restart() call, the yields of type/it, the enclosing
# try, and the except-body (abort/re-raise) are elided from this listing.
1245 def _fast_get(self, id):
1246 if not self.p or self.p.poll() != None:
1249 assert(self.p.poll() == None)
1251 log('_fast_get: opening %r while %r is open'
1252 % (id, self.inprogress))
1253 assert(not self.inprogress)
1254 assert(id.find('\n') < 0)
1255 assert(id.find('\r') < 0)
1256 assert(not id.startswith('-'))
1257 self.inprogress = id
1258 self.p.stdin.write('%s\n' % id)
1259 self.p.stdin.flush()
1260 hdr = self.p.stdout.readline()
1261 if hdr.endswith(' missing\n'):
1262 self.inprogress = None
1263 raise KeyError('blob %r is missing' % id)
1264 spl = hdr.split(' ')
1265 if len(spl) != 3 or len(spl[0]) != 40:
1266 raise GitError('expected blob, got %r' % spl)
1267 (hex, type, size) = spl
1269 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1270 onabort = self._abort)
# After the object body, batch mode emits a terminating newline.
1275 assert(self.p.stdout.readline() == '\n')
1276 self.inprogress = None
1277 except Exception, e:
# Fallback for old git: one 'git cat-file' subprocess per object. Yields
# the object type, then the content in chunks.
# NOTE(review): the 'yield type' line and the 'yield blob' loop body are
# elided from this listing.
1281 def _slow_get(self, id):
1282 assert(id.find('\n') < 0)
1283 assert(id.find('\r') < 0)
1284 assert(id[0] != '-')
1285 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1288 p = subprocess.Popen(['git', 'cat-file', type, id],
1289 stdout=subprocess.PIPE,
1290 preexec_fn = _gitenv)
1291 for blob in chunkyreader(p.stdout):
1293 _git_wait('git cat-file', p)
# Recursively flatten an object stream into blob contents: blobs pass
# through, trees recurse into each entry, commits recurse into their tree.
# NOTE(review): the initial '(type, size) = ...' unpacking, the blob branch,
# and the yield statements inside each branch are elided from this listing.
1295 def _join(self, it):
1300 elif type == 'tree':
1301 treefile = ''.join(it)
1302 for (mode, name, sha) in treeparse(treefile):
1303 for blob in self.join(sha.encode('hex')):
1305 elif type == 'commit':
1306 treeline = ''.join(it).split('\n')[0]
1307 assert(treeline.startswith('tree '))
1308 for blob in self.join(treeline[5:]):
1311 raise GitError('invalid object type %r: expected blob/tree/commit'
1315 """Generate a list of the content of all blobs that can be reached
1316 from an object. The hash given in 'id' must point to a blob, a tree
1317 or a commit. The content of all blobs that can be seen from trees or
1318 commits will be added to the list.
1321 for d in self._join(self.get(id)):
1323 except StopIteration:
1327 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1329 for (n,c) in list_refs():
1330 if n.startswith('refs/tags/'):
1335 tags[c].append(name) # more than one tag can point at 'c'