1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
11 """Discussion of bloom constants for bup:
13 There are four basic things to consider when building a bloom filter:
14 The size, in bits, of the filter
15 The capacity, in entries, of the filter
16 The probability of a false positive that is tolerable
17 The number of bits readily available to use for addressing filter bits
19 There is one major tunable that is not directly related to the above:
20 k: the number of bits set in the filter per entry
22 Here's a wall of numbers showing the relationship between k; the ratio m/n between
23 the filter size in bits (m) and the entries in the filter (n); and pfalse_positive:
25 m/n|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
26 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
27 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
28 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
29 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
30 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
31 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
32 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
33 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
34 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
35 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
36 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
37 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
38 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
39 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
40 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
41 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
42 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
43 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
44 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
45 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
46 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
47 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
48 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
49 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
50 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
52 Here's a table showing available repository size for a given pfalse_positive
53 and three values of k (assuming we only use the 160-bit SHA1 for addressing the
54 filter and 8192 bytes per object):
56 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
57 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
58 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
59 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
61 This pretty neatly eliminates any k>6 as long as we use the raw SHA for
64 filter size scales linearly with repository size for a given k and pfalse.
66 Here's a table of filter sizes for a 1 TiB repository:
68 pfalse| k=3 | k=4 | k=5 | k=6
69 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
70 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
71 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
74 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
75 per lookup will be worse than the two required for midx.
76 * We want the pfalse_positive to be low enough that the cost of sometimes
77 faulting on the midx doesn't overcome the benefit of the bloom filter.
78 * We have readily available 160 bits for addressing the filter.
79 * We want to be able to have a single bloom address entire repositories of
82 Based on these parameters, a combination of k=4 and k=5 provides the behavior
83 that bup needs. As such, I've implemented bloom addressing, adding and
84 checking functions in C for these two values. Because k=5 requires less space
85 and gives better overall pfalse_positive performance, it is preferred if a
86 table with k=5 can represent the repository.
88 None of this tells us what max_pfalse_positive to choose.
90 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
93 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
94 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
95 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
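# All of the numbers above come from the standard bloom-filter estimate
#
#     pfalse = (1 - e**(-k*n/m)) ** k
#
# where m is the filter size in bits, n the number of entries and k the number
# of hashes per entry.  The helper below is a minimal sketch of that formula
# (illustrative only; ShaBloom.pfalse_positive() further down computes the same
# thing, as a percentage, from a live filter's own bits/k/entries):

def _bloom_pfalse_estimate(m_bits, k, n_entries):
    """Expected bloom false-positive rate; illustrative sketch, unused by bup."""
    return (1 - math.exp(-k * float(n_entries) / m_bits)) ** k

# e.g. _bloom_pfalse_estimate(8 * 2**29, 5, 436090254) ~= 0.01, matching the
# 1.000% row for k=5 in the capacity table above.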
99 home_repodir = os.path.expanduser('~/.bup')
102 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
103 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
109 class GitError(Exception):
114 """Get the path to the git repository or one of its subdirectories."""
117 raise GitError('You should call check_repo_or_die()')
119 # If there's a .git subdirectory, then the actual repo is in there.
120 gd = os.path.join(repodir, '.git')
121 if os.path.exists(gd):
124 return os.path.join(repodir, sub)
127 def auto_midx(objdir):
128 args = [path.exe(), 'midx', '--auto', '--dir', objdir]
130 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
132 # make sure 'args' gets printed to help with debugging
133 add_error('%r: exception: %s' % (args, e))
136 add_error('%r: returned %d' % (args, rv))
138 args = [path.exe(), 'bloom', '--dir', objdir]
140 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
142 # make sure 'args' gets printed to help with debugging
143 add_error('%r: exception: %s' % (args, e))
146 add_error('%r: returned %d' % (args, rv))
149 def mangle_name(name, mode, gitmode):
150 """Mangle a file name to present an abstract name for segmented files.
151 Mangled file names will have the ".bup" extension added to them. If a
152 file's name already ends with ".bup", a ".bupl" extension is added to
153 disambiguate normal files from segmented ones.
155 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
157 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
158 return name + '.bupl'
163 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
164 def demangle_name(name):
165 """Remove name mangling from a file name, if necessary.
167 The return value is a tuple (demangled_filename,mode), where mode is one of
170 * BUP_NORMAL : files that should be read as-is from the repository
171 * BUP_CHUNKED : files that were chunked and need to be assembled
173 For more information on the name mangling algorithm, see mangle_name()
175 if name.endswith('.bupl'):
176 return (name[:-5], BUP_NORMAL)
177 elif name.endswith('.bup'):
178 return (name[:-4], BUP_CHUNKED)
180 return (name, BUP_NORMAL)
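# Quick illustration of the two helpers above (the chunked branch of
# mangle_name() appends '.bup', per its docstring; modes here are just
# example octal values):
#
#   mangle_name('foo', 0100644, 040000)       # chunked regular file -> 'foo.bup'
#   mangle_name('foo.bup', 0100644, 0100644)  # avoid a collision    -> 'foo.bup.bupl'
#   demangle_name('foo.bup')                  # -> ('foo', BUP_CHUNKED)
#   demangle_name('foo.bup.bupl')             # -> ('foo.bup', BUP_NORMAL)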
183 def _encode_packobj(type, content):
186 szbits = (sz & 0x0f) | (_typemap[type]<<4)
189 if sz: szbits |= 0x80
195 z = zlib.compressobj(1)
197 yield z.compress(content)
201 def _encode_looseobj(type, content):
202 z = zlib.compressobj(1)
203 yield z.compress('%s %d\0' % (type, len(content)))
204 yield z.compress(content)
208 def _decode_looseobj(buf):
210 s = zlib.decompress(buf)
217 assert(type in _typemap)
218 assert(sz == len(content))
219 return (type, content)
222 def _decode_packobj(buf):
225 type = _typermap[(c & 0x70) >> 4]
232 sz |= (c & 0x7f) << shift
236 return (type, zlib.decompress(buf[i+1:]))
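# Round trip of the loose-object helpers above (a sketch, not called by bup):
#
#   buf = ''.join(_encode_looseobj('blob', 'hello'))
#   _decode_looseobj(buf)          # -> ('blob', 'hello')
#
# _encode_packobj/_decode_packobj do the same job for in-pack objects: the
# type id sits in bits 4-6 of the first header byte and the size is stored as
# 4 bits plus 7 bits per continuation byte, which is what the shifting above
# implements.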
243 def find_offset(self, hash):
244 """Get the offset of an object inside the index file."""
245 idx = self._idx_from_hash(hash)
247 return self._ofs_from_idx(idx)
250 def exists(self, hash, want_source=False):
251 """Return nonempty if the object exists in this index."""
252 if hash and (self._idx_from_hash(hash) != None):
253 return want_source and self.name or True
257 return int(self.fanout[255])
259 def _idx_from_hash(self, hash):
260 global _total_searches, _total_steps
262 assert(len(hash) == 20)
264 start = self.fanout[b1-1] # range -1..254
265 end = self.fanout[b1] # range 0..255
267 _total_steps += 1 # lookup table is a step
270 mid = start + (end-start)/2
271 v = self._idx_to_hash(mid)
280 def iter_with_idx_i(self, idx_i):
285 class PackIdxV1(PackIdx):
286 """Object representation of a Git pack index (version 1) file."""
287 def __init__(self, filename, f):
289 self.idxnames = [self.name]
290 self.map = mmap_read(f)
291 self.fanout = list(struct.unpack('!256I',
292 str(buffer(self.map, 0, 256*4))))
293 self.fanout.append(0) # entry "-1"
294 nsha = self.fanout[255]
295 self.shatable = buffer(self.map, 256*4, nsha*24)
297 def _ofs_from_idx(self, idx):
298 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
300 def _idx_to_hash(self, idx):
301 return str(self.shatable[idx*24+4 : idx*24+24])
304 for i in xrange(self.fanout[255]):
305 yield buffer(self.map, 256*4 + 24*i + 4, 20)
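# A version 1 pack .idx is just a 256-entry fanout table (4 bytes per entry)
# followed by one 24-byte record per object: a 4-byte pack offset plus the
# 20-byte SHA.  That's why the reads above use a 24-byte stride and why
# fanout[255] is the total object count.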
308 class PackIdxV2(PackIdx):
309 """Object representation of a Git pack index (version 2) file."""
310 def __init__(self, filename, f):
312 self.idxnames = [self.name]
313 self.map = mmap_read(f)
314 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
315 self.fanout = list(struct.unpack('!256I',
316 str(buffer(self.map, 8, 256*4))))
317 self.fanout.append(0) # entry "-1"
318 nsha = self.fanout[255]
319 self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
320 self.ofstable = buffer(self.map,
321 8 + 256*4 + nsha*20 + nsha*4,
323 self.ofs64table = buffer(self.map,
324 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
326 def _ofs_from_idx(self, idx):
327 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
329 idx64 = ofs & 0x7fffffff
330 ofs = struct.unpack('!Q',
331 str(buffer(self.ofs64table, idx64*8, 8)))[0]
334 def _idx_to_hash(self, idx):
335 return str(self.shatable[idx*20:(idx+1)*20])
338 for i in xrange(self.fanout[255]):
339 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
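# A version 2 pack .idx, as parsed above, is laid out as an 8-byte
# magic/version ('\377tOc' + 2), 256 4-byte fanout entries, nsha 20-byte SHAs,
# nsha 4-byte CRCs, and nsha 4-byte offsets; an offset with its high bit set
# is an index into a trailing table of 8-byte offsets for packs over 2 GiB.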
342 extract_bits = _helpers.extract_bits
344 bloom_contains = _helpers.bloom_contains
345 bloom_add = _helpers.bloom_add
349 """Wrapper which contains data from multiple index files.
350 Multiple index (.midx) files constitute a wrapper around index (.idx) files
351 and make it possible for bup to expand Git's indexing capabilities to vast
354 def __init__(self, filename, f=None, readwrite=False):
356 assert(filename.endswith('.bloom'))
358 self.rwfile = f or open(filename, 'r+b')
359 self.map = mmap_readwrite(self.rwfile, close=False)
362 self.map = mmap_read(f or open(filename, 'rb'))
363 if str(self.map[0:4]) != 'BLOM':
364 log('Warning: skipping: invalid BLOM header in %r\n' % filename)
365 return self._init_failed()
366 ver = struct.unpack('!I', self.map[4:8])[0]
367 if ver < BLOOM_VERSION:
368 log('Warning: ignoring old-style (v%d) bloom %r\n'
370 return self._init_failed()
371 if ver > BLOOM_VERSION:
372 log('Warning: ignoring too-new (v%d) bloom %r\n'
374 return self._init_failed()
376 self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
377 idxnamestr = str(self.map[16 + 2**self.bits:])
379 self.idxnames = idxnamestr.split('\0')
383 def _init_failed(self):
391 self.bits = self.entries = 0
394 return self.map and self.bits
402 debug2("bloom: closing with %d entries\n" % self.entries)
403 self.map[12:16] = struct.pack('!I', self.entries)
406 self.rwfile.seek(16 + 2**self.bits)
408 self.rwfile.write('\0'.join(self.idxnames))
411 def pfalse_positive(self, additional=0):
412 n = self.entries + additional
415 return 100*(1-math.exp(-k*float(n)/m))**k
417 def add_idx(self, ix):
418 """Add the object to the filter, return current pfalse_positive."""
419 if not self.map: raise Exception, "Cannot add to closed bloom"
420 self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
421 self.idxnames.append(os.path.basename(ix.name))
423 def exists(self, sha):
424 """Return nonempty if the object probably exists in the bloom filter."""
425 global _total_searches, _total_steps
427 if not self.map: return None
428 found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
429 _total_steps += steps
433 def create(cls, name, f=None, readwrite=False, expected=100000, k=None):
434 """Create and return a bloom filter for `expected` entries."""
435 bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
436 k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
437 if bits > MAX_BLOOM_BITS[k]:
438 log('bloom: warning, max bits exceeded, non-optimal\n')
439 bits = MAX_BLOOM_BITS[k]
440 debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
441 f = f or open(name, 'w+b')
443 f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
444 assert(f.tell() == 16)
445 f.write('\0'*2**bits)
447 return cls(name, f=f, readwrite=readwrite)
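# Sketch of typical ShaBloom use (illustrative file names; PackIdxList.refresh()
# below is what actually maintains 'bup.bloom' in the pack directory):
#
#   b = ShaBloom.create('bup.bloom', readwrite=True, expected=1000000)
#   b.add_idx(open_idx('pack-1234.idx'))   # add every SHA from one .idx
#   if not b.exists(sha):
#       ...   # definitely absent from the indexes added so far
#   else:
#       ...   # probably present; confirm against the real .idx/.midx files
#   b.close()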
454 """Wrapper which contains data from multiple index files.
455 Multiple index (.midx) files constitute a wrapper around index (.idx) files
456 and make it possible for bup to expand Git's indexing capabilities to vast
459 def __init__(self, filename):
461 self.force_keep = False
462 assert(filename.endswith('.midx'))
463 self.map = mmap_read(open(filename))
464 if str(self.map[0:4]) != 'MIDX':
465 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
466 self.force_keep = True
467 return self._init_failed()
468 ver = struct.unpack('!I', self.map[4:8])[0]
469 if ver < MIDX_VERSION:
470 log('Warning: ignoring old-style (v%d) midx %r\n'
472 self.force_keep = False # old stuff is boring
473 return self._init_failed()
474 if ver > MIDX_VERSION:
475 log('Warning: ignoring too-new (v%d) midx %r\n'
477 self.force_keep = True # new stuff is exciting
478 return self._init_failed()
480 self.bits = _helpers.firstword(self.map[8:12])
481 self.entries = 2**self.bits
482 self.fanout = buffer(self.map, 12, self.entries*4)
483 shaofs = 12 + self.entries*4
484 self.nsha = nsha = self._fanget(self.entries-1)
485 self.shatable = buffer(self.map, shaofs, nsha*20)
486 self.whichlist = buffer(self.map, shaofs + nsha*20, nsha*4)
487 self.idxnames = str(self.map[shaofs + 24*nsha:]).split('\0')
489 def _init_failed(self):
492 self.fanout = buffer('\0\0\0\0')
493 self.shatable = buffer('\0'*20)
496 def _fanget(self, i):
498 s = self.fanout[start:start+4]
499 return _helpers.firstword(s)
502 return str(self.shatable[i*20:(i+1)*20])
504 def _get_idx_i(self, i):
505 return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
507 def _get_idxname(self, i):
508 return self.idxnames[self._get_idx_i(i)]
510 def exists(self, hash, want_source=False):
511 """Return nonempty if the object exists in the index files."""
512 global _total_searches, _total_steps
515 el = extract_bits(want, self.bits)
517 start = self._fanget(el-1)
518 startv = el << (32-self.bits)
522 end = self._fanget(el)
523 endv = (el+1) << (32-self.bits)
524 _total_steps += 1 # lookup table is a step
525 hashv = _helpers.firstword(hash)
526 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
529 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
530 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
531 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
533 #print ' %08x' % self._num(v)
536 startv = _helpers.firstword(v)
539 endv = _helpers.firstword(v)
541 return want_source and self._get_idxname(mid) or True
544 def iter_with_idx_i(self, ofs):
545 for i in xrange(self._fanget(self.entries-1)):
546 yield buffer(self.shatable, i*20, 20), ofs+self._get_idx_i(i)
549 for i in xrange(self._fanget(self.entries-1)):
550 yield buffer(self.shatable, i*20, 20)
553 return int(self._fanget(self.entries-1))
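# On-disk layout of a .midx, matching the reads above: 'MIDX', a 4-byte
# version, a 4-byte bit count, 2**bits 4-byte fanout entries, nsha 20-byte
# SHAs, nsha 4-byte indexes into the name list, and finally the NUL-separated
# names of the .idx files this midx covers.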
558 def __init__(self, dir):
560 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
565 self.do_bloom = False
572 assert(_mpi_count == 0)
575 return iter(idxmerge(self.packs))
578 return sum(len(pack) for pack in self.packs)
580 def exists(self, hash, want_source=False):
581 """Return nonempty if the object exists in the index files."""
582 global _total_searches
584 if hash in self.also:
586 if self.do_bloom and self.bloom is not None:
587 _total_searches -= 1 # will be incremented by bloom
588 if self.bloom.exists(hash):
589 self.do_bloom = False
592 for i in xrange(len(self.packs)):
594 _total_searches -= 1 # will be incremented by sub-pack
595 ix = p.exists(hash, want_source=want_source)
597 # reorder so most recently used packs are searched first
598 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
603 def refresh(self, skip_midx = False):
604 """Refresh the index list.
605 This method checks whether any .midx files have been superseded (e.g. all of
606 their contents are in another, bigger .midx file) and removes the superseded
609 If skip_midx is True, all work on .midx files will be skipped and .midx
610 files will be removed from the list.
612 The module-global variable 'ignore_midx' can force this function to
613 always act as if skip_midx was True.
615 self.bloom = None # Always reopen the bloom as it may have been replaced
616 self.do_bloom = False
617 skip_midx = skip_midx or ignore_midx
618 d = dict((p.name, p) for p in self.packs
619 if not skip_midx or not isinstance(p, PackMidx))
620 if os.path.exists(self.dir):
623 for ix in self.packs:
624 if isinstance(ix, PackMidx):
625 for name in ix.idxnames:
626 d[os.path.join(self.dir, name)] = ix
627 for full in glob.glob(os.path.join(self.dir,'*.midx')):
630 (mxd, mxf) = os.path.split(mx.name)
632 for n in mx.idxnames:
633 if not os.path.exists(os.path.join(mxd, n)):
634 log(('warning: index %s missing\n' +
635 ' used by %s\n') % (n, mxf))
642 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
645 for sub in ix.idxnames:
646 found = d.get(os.path.join(self.dir, sub))
647 if not found or isinstance(found, PackIdx):
648 # doesn't exist, or exists but not in a midx
653 for name in ix.idxnames:
654 d[os.path.join(self.dir, name)] = ix
655 elif not ix.force_keep:
656 debug1('midx: removing redundant: %s\n'
657 % os.path.basename(ix.name))
659 for full in glob.glob(os.path.join(self.dir,'*.idx')):
667 bfull = os.path.join(self.dir, 'bup.bloom')
668 if self.bloom is None and os.path.exists(bfull):
669 self.bloom = ShaBloom(bfull)
670 self.packs = list(set(d.values()))
671 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
672 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
676 debug1('PackIdxList: using %d index%s.\n'
677 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
680 """Insert an additional object in the list."""
684 def calc_hash(type, content):
685 """Calculate some content's hash in the Git fashion."""
686 header = '%s %d\0' % (type, len(content))
692 def _shalist_sort_key(ent):
693 (mode, name, id) = ent
694 if stat.S_ISDIR(int(mode, 8)):
700 def open_idx(filename):
701 if filename.endswith('.idx'):
702 f = open(filename, 'rb')
704 if header[0:4] == '\377tOc':
705 version = struct.unpack('!I', header[4:8])[0]
707 return PackIdxV2(filename, f)
709 raise GitError('%s: expected idx file version 2, got %d'
710 % (filename, version))
711 elif len(header) == 8 and header[0:4] < '\377tOc':
712 return PackIdxV1(filename, f)
714 raise GitError('%s: unrecognized idx file header' % filename)
715 elif filename.endswith('.midx'):
716 return PackMidx(filename)
718 raise GitError('idx filenames must end with .idx or .midx')
721 def idxmerge(idxlist, final_progress=True, total=None):
722 """Generate a list of all the objects reachable in a PackIdxList."""
723 def pfunc(count, total):
724 progress('Reading indexes: %.2f%% (%d/%d)\r'
725 % (count*100.0/total, count, total))
726 def pfinal(count, total):
728 log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
729 return merge_iter(idxlist, 10024, pfunc, pfinal, total=total)
732 def _make_objcache():
733 return PackIdxList(repo('objects/pack'))
736 """Writes Git objects insid a pack file."""
737 def __init__(self, objcache_maker=_make_objcache):
743 self.objcache_maker = objcache_maker
751 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
752 self.file = os.fdopen(fd, 'w+b')
753 assert(name.endswith('.pack'))
754 self.filename = name[:-5]
755 self.file.write('PACK\0\0\0\2\0\0\0\0')
756 self.idx = list(list() for i in xrange(256))
758 # the 'sha' parameter is used in client.py's _raw_write(), but not needed
759 # in this basic version.
760 def _raw_write(self, datalist, sha):
763 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
764 # the file never has a *partial* blob. So let's make sure it's
765 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
766 # to our hashsplit algorithm.) f.write() does its own buffering,
767 # but that's okay because we'll flush it in _end().
768 oneblob = ''.join(datalist)
772 raise GitError, e, sys.exc_info()[2]
774 crc = zlib.crc32(oneblob) & 0xffffffff
775 self._update_idx(sha, crc, nw)
780 def _update_idx(self, sha, crc, size):
783 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
785 def _write(self, sha, type, content):
789 sha = calc_hash(type, content)
790 size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
793 def breakpoint(self):
794 """Clear byte and object counts and return the last processed id."""
796 self.outbytes = self.count = 0
799 def _require_objcache(self):
800 if self.objcache is None and self.objcache_maker:
801 self.objcache = self.objcache_maker()
802 if self.objcache is None:
804 "PackWriter not opened or can't check exists w/o objcache")
806 def exists(self, id, want_source=False):
807 """Return non-empty if an object is found in the object cache."""
808 self._require_objcache()
809 return self.objcache.exists(id, want_source=want_source)
811 def maybe_write(self, type, content):
812 """Write an object to the pack file if not present and return its id."""
813 self._require_objcache()
814 sha = calc_hash(type, content)
815 if not self.exists(sha):
816 self._write(sha, type, content)
817 self.objcache.add(sha)
820 def new_blob(self, blob):
821 """Create a blob object in the pack with the supplied content."""
822 return self.maybe_write('blob', blob)
824 def new_tree(self, shalist):
825 """Create a tree object in the pack."""
826 shalist = sorted(shalist, key = _shalist_sort_key)
828 for (mode,name,bin) in shalist:
831 assert(mode[0] != '0')
833 assert(len(bin) == 20)
834 l.append('%s %s\0%s' % (mode,name,bin))
835 return self.maybe_write('tree', ''.join(l))
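# The joined entries above form a Git tree object verbatim: entries sorted in
# Git's tree order (directory names compare as if they ended in '/', which is
# what _shalist_sort_key arranges), each encoded as '<octal mode> <name>\0'
# followed by the entry's 20-byte binary SHA.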
837 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
839 if tree: l.append('tree %s' % tree.encode('hex'))
840 if parent: l.append('parent %s' % parent.encode('hex'))
841 if author: l.append('author %s %s' % (author, _git_date(adate)))
842 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
845 return self.maybe_write('commit', '\n'.join(l))
847 def new_commit(self, parent, tree, date, msg):
848 """Create a commit object in the pack."""
849 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
850 commit = self._new_commit(tree, parent,
851 userline, date, userline, date,
856 """Remove the pack file from disk."""
862 os.unlink(self.filename + '.pack')
864 def _end(self, run_midx=True):
866 if not f: return None
872 # update object count
874 cp = struct.pack('!i', self.count)
878 # calculate the pack sha1sum
881 for b in chunkyreader(f):
883 packbin = sum.digest()
887 idx_f = open(self.filename + '.idx', 'wb')
888 obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
891 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
892 if os.path.exists(self.filename + '.map'):
893 os.unlink(self.filename + '.map')
894 os.rename(self.filename + '.pack', nameprefix + '.pack')
895 os.rename(self.filename + '.idx', nameprefix + '.idx')
898 auto_midx(repo('objects/pack'))
901 def close(self, run_midx=True):
902 """Close the pack file and move it to its definitive path."""
903 return self._end(run_midx=run_midx)
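# Typical PackWriter use, as a sketch (error handling omitted):
#
#   w = PackWriter()
#   blob = w.new_blob('file contents')
#   tree = w.new_tree([('100644', 'file.bupl', blob)])
#   commit = w.new_commit(None, tree, time.time(), 'back up some stuff\n')
#   w.close()   # renames the .pack/.idx into objects/pack/ and may run midx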
905 def _write_pack_idx_v2(self, file, idx, packbin):
912 write('\377tOc\0\0\0\2')
917 write(struct.pack('!i', n))
918 part.sort(key=lambda x: x[0])
920 obj_list_sum = Sha1()
924 obj_list_sum.update(entry[0])
927 write(struct.pack('!I', entry[1]))
931 if entry[2] & 0x80000000:
932 write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
933 ofs64_list.append(struct.pack('!Q', entry[2]))
935 write(struct.pack('!i', entry[2]))
936 for ofs64 in ofs64_list:
940 file.write(sum.digest())
941 return obj_list_sum.hexdigest()
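# The idx written above follows Git's version 2 layout: magic/version, a
# 256-entry cumulative fanout, the SHAs in sorted order, their CRC32s, their
# 4-byte pack offsets (entries flagged with the high bit point into an
# overflow table of 8-byte offsets), then the pack's SHA-1 and the idx's own
# SHA-1 trailer.  The hex digest returned is the SHA-1 of just the sorted
# object list; it is what names the final 'pack-%s' files.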
945 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
949 os.environ['GIT_DIR'] = os.path.abspath(repo())
952 def list_refs(refname = None):
953 """Generate a list of tuples in the form (refname,hash).
954 If a ref name is specified, list only this particular ref.
956 argv = ['git', 'show-ref', '--']
959 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
960 out = p.stdout.read().strip()
961 rv = p.wait() # not fatal
965 for d in out.split('\n'):
966 (sha, name) = d.split(' ', 1)
967 yield (name, sha.decode('hex'))
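# 'git show-ref' prints one '<40-hex-sha> <refname>' line per ref, which is
# what the split(' ', 1) above expects; the sha is yielded in binary form.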
970 def read_ref(refname):
971 """Get the commit id of the most recent commit made on a given ref."""
972 l = list(list_refs(refname))
980 def rev_list(ref, count=None):
981 """Generate a list of reachable commits in reverse chronological order.
983 This generator walks through commits, from child to parent, that are
984 reachable via the specified ref and yields a series of tuples of the form
987 If count is a non-zero integer, limit the number of commits to "count"
990 assert(not ref.startswith('-'))
993 opts += ['-n', str(atoi(count))]
994 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
995 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
999 if s.startswith('commit '):
1000 commit = s[7:].decode('hex')
1003 yield (date, commit)
1004 rv = p.wait() # not fatal
1006 raise GitError, 'git rev-list returned error %d' % rv
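# With '--pretty=format:%ct', rev-list alternates a 'commit <40-hex-sha>'
# header line with a line carrying the commit timestamp, e.g. (made-up sha):
#
#   commit 5891b5b522d5df086d0ff0b110fbd9d21bb4fc71
#   1296969600
#
# which the loop above turns into (date, commit) tuples.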
1009 def rev_get_date(ref):
1010 """Get the date of the latest commit on the specified ref."""
1011 for (date, commit) in rev_list(ref, count=1):
1013 raise GitError, 'no such commit %r' % ref
1016 def rev_parse(committish):
1017 """Resolve the full hash for 'committish', if it exists.
1019 Should be roughly equivalent to 'git rev-parse'.
1021 Returns the hex value of the hash if it is found, None if 'committish' does
1022 not correspond to anything.
1024 head = read_ref(committish)
1026 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1029 pL = PackIdxList(repo('objects/pack'))
1031 if len(committish) == 40:
1033 hash = committish.decode('hex')
1043 def update_ref(refname, newval, oldval):
1044 """Change the commit pointed to by a branch."""
1047 assert(refname.startswith('refs/heads/'))
1048 p = subprocess.Popen(['git', 'update-ref', refname,
1049 newval.encode('hex'), oldval.encode('hex')],
1050 preexec_fn = _gitenv)
1051 _git_wait('git update-ref', p)
1054 def guess_repo(path=None):
1055 """Set the path value in the global variable "repodir".
1056 This makes bup look for an existing bup repository, but not fail if a
1057 repository doesn't exist. Usually, if you are interacting with a bup
1058 repository, you would not be calling this function but using
1059 check_repo_or_die().
1065 repodir = os.environ.get('BUP_DIR')
1067 repodir = os.path.expanduser('~/.bup')
1070 def init_repo(path=None):
1071 """Create the Git bare repository for bup in a given path."""
1073 d = repo() # appends a / to the path
1074 parent = os.path.dirname(os.path.dirname(d))
1075 if parent and not os.path.exists(parent):
1076 raise GitError('parent directory "%s" does not exist\n' % parent)
1077 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1078 raise GitError('"%s" exists but is not a directory\n' % d)
1079 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1080 preexec_fn = _gitenv)
1081 _git_wait('git init', p)
1082 # Force the index version configuration in order to ensure bup works
1083 # regardless of the version of the installed Git binary.
1084 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1085 stdout=sys.stderr, preexec_fn = _gitenv)
1086 _git_wait('git config', p)
1089 def check_repo_or_die(path=None):
1090 """Make sure a bup repository exists, and abort if not.
1091 If the path to a particular repository was not specified, this function
1092 initializes the default repository automatically.
1095 if not os.path.isdir(repo('objects/pack/.')):
1096 if repodir == home_repodir:
1099 log('error: %r is not a bup/git repository\n' % repo())
1104 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1106 while ofs < len(buf):
1107 z = buf[ofs:].find('\0')
1109 spl = buf[ofs:ofs+z].split(' ', 1)
1110 assert(len(spl) == 2)
1111 sha = buf[ofs+z+1:ofs+z+1+20]
1113 yield (spl[0], spl[1], sha)
1118 """Get Git's version and ensure a usable version is installed.
1120 The returned version is formatted as an ordered tuple with each position
1121 representing one component of the version string. For example, the following tuple
1122 would represent version 1.6.6.9:
1124 ('1', '6', '6', '9')
1128 p = subprocess.Popen(['git', '--version'],
1129 stdout=subprocess.PIPE)
1130 gvs = p.stdout.read()
1131 _git_wait('git --version', p)
1132 m = re.match(r'git version (\S+.\S+)', gvs)
1134 raise GitError('git --version weird output: %r' % gvs)
1135 _ver = tuple(m.group(1).split('.'))
1136 needed = ('1','5', '3', '1')
1138 raise GitError('git version %s or higher is required; you have %s'
1139 % ('.'.join(needed), '.'.join(_ver)))
1143 def _git_wait(cmd, p):
1146 raise GitError('%s returned %d' % (cmd, rv))
1149 def _git_capture(argv):
1150 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1152 _git_wait(repr(argv), p)
1156 class _AbortableIter:
1157 def __init__(self, it, onabort = None):
1159 self.onabort = onabort
1167 return self.it.next()
1168 except StopIteration, e:
1176 """Abort iteration and call the abortion callback, if needed."""
1188 """Link to 'git cat-file' that is used to retrieve blob data."""
1191 wanted = ('1','5','6')
1194 log('warning: git version < %s; bup will be slow.\n'
1197 self.get = self._slow_get
1199 self.p = self.inprogress = None
1200 self.get = self._fast_get
1204 self.p.stdout.close()
1205 self.p.stdin.close()
1207 self.inprogress = None
1211 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1212 stdin=subprocess.PIPE,
1213 stdout=subprocess.PIPE,
1216 preexec_fn = _gitenv)
1218 def _fast_get(self, id):
1219 if not self.p or self.p.poll() != None:
1222 assert(self.p.poll() == None)
1224 log('_fast_get: opening %r while %r is open'
1225 % (id, self.inprogress))
1226 assert(not self.inprogress)
1227 assert(id.find('\n') < 0)
1228 assert(id.find('\r') < 0)
1229 assert(not id.startswith('-'))
1230 self.inprogress = id
1231 self.p.stdin.write('%s\n' % id)
1232 self.p.stdin.flush()
1233 hdr = self.p.stdout.readline()
1234 if hdr.endswith(' missing\n'):
1235 self.inprogress = None
1236 raise KeyError('blob %r is missing' % id)
1237 spl = hdr.split(' ')
1238 if len(spl) != 3 or len(spl[0]) != 40:
1239 raise GitError('expected blob, got %r' % spl)
1240 (hex, type, size) = spl
1242 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1243 onabort = self._abort)
1248 assert(self.p.stdout.readline() == '\n')
1249 self.inprogress = None
1250 except Exception, e:
1254 def _slow_get(self, id):
1255 assert(id.find('\n') < 0)
1256 assert(id.find('\r') < 0)
1257 assert(id[0] != '-')
1258 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1261 p = subprocess.Popen(['git', 'cat-file', type, id],
1262 stdout=subprocess.PIPE,
1263 preexec_fn = _gitenv)
1264 for blob in chunkyreader(p.stdout):
1266 _git_wait('git cat-file', p)
1268 def _join(self, it):
1273 elif type == 'tree':
1274 treefile = ''.join(it)
1275 for (mode, name, sha) in treeparse(treefile):
1276 for blob in self.join(sha.encode('hex')):
1278 elif type == 'commit':
1279 treeline = ''.join(it).split('\n')[0]
1280 assert(treeline.startswith('tree '))
1281 for blob in self.join(treeline[5:]):
1284 raise GitError('invalid object type %r: expected blob/tree/commit'
1288 """Generate a list of the content of all blobs that can be reached
1289 from an object. The hash given in 'id' must point to a blob, a tree
1290 or a commit. The content of all blobs that can be seen from trees or
1291 commits will be added to the list.
1294 for d in self._join(self.get(id)):
1296 except StopIteration:
1300 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1302 for (n,c) in list_refs():
1303 if n.startswith('refs/tags/'):
1308 tags[c].append(name) # more than one tag can point at 'c'