1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
11 """Discussion of bloom constants for bup:
13 There are four basic things to consider when building a bloom filter:
14 The size, in bits, of the filter
15 The capacity, in entries, of the filter
16 The probability of a false positive that is tolerable
17 The number of bits readily available to use for addressing filter bits
19 There is one major tunable that is not directly related to the above:
20 k: the number of bits set in the filter per entry
22 Here's a wall of numbers showing the relationship between k; the ratio between
23 the filter size in bits and the entries in the filter; and pfalse_positive:
25 mn|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
26 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
27 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
28 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
29 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
30 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
31 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
32 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
33 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
34 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
35 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
36 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
37 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
38 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
39 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
40 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
41 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
42 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
43 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
44 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
45 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
46 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
47 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
48 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
49 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
50 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
52 Here's a table showing available repository size for a given pfalse_positive
53 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
54 filter and 8192bytes per object):
56 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
57 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
58 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
59 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
61 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
64 filter size scales linearly with repository size for a given k and pfalse.
66 Here's a table of filter sizes for a 1 TiB repository:
68 pfalse| k=3 | k=4 | k=5 | k=6
69 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
70 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
71 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
74 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
75 per lookup will be worse than the two required for midx.
76 * We want the pfalse_positive to be low enough that the cost of sometimes
77 faulting on the midx doesn't overcome the benefit of the bloom filter.
78 * We have readily available 160 bits for addressing the filter.
79 * We want to be able to have a single bloom address entire repositories of
82 Based on these parameters, a combination of k=4 and k=5 provides the behavior
83 that bup needs. As such, I've implemented bloom addressing, adding and
84 checking functions in C for these two values. Because k=5 requires less space
85 and gives better overall pfalse_positive performance, it is preferred if a
86 table with k=5 can represent the repository.
88 None of this tells us what max_pfalse_positive to choose.
90 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
# NOTE(review): this file is a numbered listing of bup's git.py with many
# source lines dropped; the leading integers are original line numbers, not
# code.  Comments below annotate what survives.
# Bloom-filter tuning constants (see the design discussion above).
93 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
94 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
95 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
# Default repository location when BUP_DIR is not set.
99 home_repodir = os.path.expanduser('~/.bup')
# Git pack object-type codes <-> type-name maps (pack format spec).
102 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
103 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
# Base exception for all errors raised by this module.
# NOTE(review): the class body (originally just 'pass') was dropped from
# this listing, as was the 'def repo(sub='')' header for the lines below.
109 class GitError(Exception):
114 """Get the path to the git repository or one of its subdirectories."""
# repo() requires the module-global 'repodir' set by check_repo_or_die().
117 raise GitError('You should call check_repo_or_die()')
119 # If there's a .git subdirectory, then the actual repo is in there.
120 gd = os.path.join(repodir, '.git')
121 if os.path.exists(gd):
124 return os.path.join(repodir, sub)
# Run 'bup midx --auto' and 'bup bloom' over objdir to keep the auxiliary
# indexes up to date.  Errors are reported via add_error(), not raised.
# NOTE(review): the surrounding try/except OSError and 'if rv:' guard lines
# were dropped from this listing; the add_error() calls below originally sat
# inside them.
127 def auto_midx(objdir):
128 args = [path.exe(), 'midx', '--auto', '--dir', objdir]
130 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
132 # make sure 'args' gets printed to help with debugging
133 add_error('%r: exception: %s' % (args, e))
136 add_error('%r: returned %d' % (args, rv))
138 args = [path.exe(), 'bloom', '--dir', objdir]
140 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
142 # make sure 'args' gets printed to help with debugging
143 add_error('%r: exception: %s' % (args, e))
146 add_error('%r: returned %d' % (args, rv))
149 def mangle_name(name, mode, gitmode):
150 """Mangle a file name to present an abstract name for segmented files.
151 Mangled file names will have the ".bup" extension added to them. If a
152 file's name already ends with ".bup", a ".bupl" extension is added to
153 disambiguate normal files from semgmented ones.
155 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
157 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
158 return name + '.bupl'
# File-content modes returned by demangle_name().
(BUP_NORMAL, BUP_CHUNKED) = (0,1)

def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one
    of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        # '.bupl' marks a plain file whose real name ends in '.bup'.
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        # '.bup' marks a segmented (chunked) file.
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
# Encode/decode helpers for Git pack-format and loose-format objects.
# NOTE(review): many interior lines were dropped from this listing (the
# varint size loop in _encode_packobj, the header parse in _decode_looseobj,
# the byte loop in _decode_packobj).
183 def _encode_packobj(type, content):
# Pack header: low 4 bits of size plus the object-type code.
186 szbits = (sz & 0x0f) | (_typemap[type]<<4)
# High bit set means "more size bytes follow" (varint encoding).
189 if sz: szbits |= 0x80
195 z = zlib.compressobj(1)
197 yield z.compress(content)
# NOTE(review): no 'yield z.flush()' is visible in either generator below;
# without it the zlib stream is truncated — presumably those lines were
# dropped from the listing too.  Verify against upstream bup.
201 def _encode_looseobj(type, content):
202 z = zlib.compressobj(1)
# Loose-object header: "<type> <size>\0" before the payload.
203 yield z.compress('%s %d\0' % (type, len(content)))
204 yield z.compress(content)
208 def _decode_looseobj(buf):
210 s = zlib.decompress(buf)
217 assert(type in _typemap)
218 assert(sz == len(content))
219 return (type, content)
222 def _decode_packobj(buf):
# Object type lives in bits 4-6 of the first byte.
225 type = _typermap[(c & 0x70) >> 4]
# Accumulate 7-bit varint size groups.
232 sz |= (c & 0x7f) << shift
236 return (type, zlib.decompress(buf[i+1:]))
# Methods of the PackIdx base class (the 'class PackIdx' header itself is
# not visible in this listing).  Subclasses provide _idx_from_hash helpers
# (_ofs_from_idx/_idx_to_hash) and the fanout table.
243 def find_offset(self, hash):
244 """Get the offset of an object inside the index file."""
245 idx = self._idx_from_hash(hash)
# NOTE(review): the 'if idx != None:' guard and trailing 'return None'
# were dropped from this listing.
247 return self._ofs_from_idx(idx)
250 def exists(self, hash):
251 """Return nonempty if the object exists in this index."""
252 return hash and (self._idx_from_hash(hash) != None) and True or None
# __len__: total object count is the last fanout entry.
255 return int(self.fanout[255])
257 def _idx_from_hash(self, hash):
258 global _total_searches, _total_steps
260 assert(len(hash) == 20)
# Fanout narrows the binary search to hashes sharing the first byte b1.
262 start = self.fanout[b1-1] # range -1..254
263 end = self.fanout[b1] # range 0..255
265 _total_steps += 1 # lookup table is a step
# Binary search between start and end (loop header dropped from listing).
268 mid = start + (end-start)/2
269 v = self._idx_to_hash(mid)
# V1 .idx layout: 256-entry fanout, then 24-byte records (4-byte offset +
# 20-byte sha).  See the Git pack-format documentation.
279 class PackIdxV1(PackIdx):
280 """Object representation of a Git pack index (version 1) file."""
281 def __init__(self, filename, f):
283 self.idxnames = [self.name]
284 self.map = mmap_read(f)
285 self.fanout = list(struct.unpack('!256I',
286 str(buffer(self.map, 0, 256*4))))
287 self.fanout.append(0) # entry "-1"
288 nsha = self.fanout[255]
289 self.shatable = buffer(self.map, 256*4, nsha*24)
291 def _ofs_from_idx(self, idx):
292 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
294 def _idx_to_hash(self, idx):
295 return str(self.shatable[idx*24+4 : idx*24+24])
# __iter__ (header dropped from listing): yields each 20-byte sha in order.
298 for i in xrange(self.fanout[255]):
299 yield buffer(self.map, 256*4 + 24*i + 4, 20)
# V2 .idx layout: magic+version, fanout, sha table, crc table, 4-byte
# offset table, then 8-byte offsets for entries with the high bit set.
302 class PackIdxV2(PackIdx):
303 """Object representation of a Git pack index (version 2) file."""
304 def __init__(self, filename, f):
306 self.idxnames = [self.name]
307 self.map = mmap_read(f)
# '\377tOc' magic plus version 2 (network byte order).
308 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
309 self.fanout = list(struct.unpack('!256I',
310 str(buffer(self.map, 8, 256*4))))
311 self.fanout.append(0) # entry "-1"
312 nsha = self.fanout[255]
313 self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
314 self.ofstable = buffer(self.map,
315 8 + 256*4 + nsha*20 + nsha*4,
317 self.ofs64table = buffer(self.map,
318 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
320 def _ofs_from_idx(self, idx):
321 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
# High bit set => value is an index into the 64-bit offset table.
# NOTE(review): the 'if ofs & 0x80000000:' guard and final 'return ofs'
# were dropped from this listing.
323 idx64 = ofs & 0x7fffffff
324 ofs = struct.unpack('!Q',
325 str(buffer(self.ofs64table, idx64*8, 8)))[0]
328 def _idx_to_hash(self, idx):
329 return str(self.shatable[idx*20:(idx+1)*20])
# __iter__ (header dropped from listing): yields each 20-byte sha in order.
332 for i in xrange(self.fanout[255]):
333 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
# Module-level aliases for the C-accelerated helpers used by the bloom
# filter and midx code below.
336 extract_bits = _helpers.extract_bits
338 bloom_contains = _helpers.bloom_contains
339 bloom_add = _helpers.bloom_add
# ShaBloom: mmap-backed bloom filter over object shas ('class ShaBloom'
# header line was dropped from this listing).
# NOTE(review): the docstring below appears copy-pasted from PackMidx; this
# class actually wraps a .bloom filter file, not .midx files.
343 """Wrapper which contains data from multiple index files.
344 Multiple index (.midx) files constitute a wrapper around index (.idx) files
345 and make it possible for bup to expand Git's indexing capabilities to vast
348 def __init__(self, filename, f=None, readwrite=False):
350 assert(filename.endswith('.bloom'))
# readwrite branch: map the file writable so bloom_add can mutate it.
352 self.rwfile = f or open(filename, 'r+b')
353 self.map = mmap_readwrite(self.rwfile, close=False)
# read-only branch (the 'else:' line was dropped from this listing).
356 self.map = mmap_read(f or open(filename, 'rb'))
357 if str(self.map[0:4]) != 'BLOM':
358 log('Warning: skipping: invalid BLOM header in %r\n' % filename)
359 return self._init_failed()
360 ver = struct.unpack('!I', self.map[4:8])[0]
361 if ver < BLOOM_VERSION:
362 log('Warning: ignoring old-style (v%d) bloom %r\n'
364 return self._init_failed()
365 if ver > BLOOM_VERSION:
366 log('Warning: ignoring too-new (v%d) bloom %r\n'
368 return self._init_failed()
# Header: 2^bits bytes of filter, k hash functions, current entry count.
370 self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
371 idxnamestr = str(self.map[16 + 2**self.bits:])
373 self.idxnames = idxnamestr.split('\0')
377 def _init_failed(self):
385 self.bits = self.entries = 0
# valid() (header dropped): nonzero only for a successfully-opened filter.
388 return self.map and self.bits
# close/flush path (method header dropped): persist entry count and names.
396 debug2("bloom: closing with %d entries\n" % self.entries)
397 self.map[12:16] = struct.pack('!I', self.entries)
400 self.rwfile.seek(16 + 2**self.bits)
402 self.rwfile.write('\0'.join(self.idxnames))
405 def pfalse_positive(self, additional=0):
406 n = self.entries + additional
# Standard bloom false-positive estimate, as a percentage.
409 return 100*(1-math.exp(-k*float(n)/m))**k
411 def add_idx(self, ix):
412 """Add the object to the filter, return current pfalse_positive."""
413 if not self.map: raise Exception, "Cannot add to closed bloom"
414 self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
415 self.idxnames.append(os.path.basename(ix.name))
417 def exists(self, sha):
418 """Return nonempty if the object probably exists in the bloom filter."""
419 global _total_searches, _total_steps
421 if not self.map: return None
422 found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
423 _total_steps += steps
# create(): classmethod constructor ('@classmethod' decorator line dropped).
427 def create(cls, name, f=None, readwrite=False, expected=100000, k=None):
428 """Create and return a bloom filter for `expected` entries."""
429 bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
# Prefer k=5 (smaller, better pfalse) when the table is small enough.
430 k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
431 if bits > MAX_BLOOM_BITS[k]:
432 log('bloom: warning, max bits exceeded, non-optimal\n')
433 bits = MAX_BLOOM_BITS[k]
434 debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
435 f = f or open(name, 'w+b')
437 f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
438 assert(f.tell() == 16)
439 f.write('\0'*2**bits)
441 return cls(name, f=f, readwrite=readwrite)
# PackMidx: read-only wrapper over a .midx multi-pack index ('class
# PackMidx' header line was dropped from this listing).
448 """Wrapper which contains data from multiple index files.
449 Multiple index (.midx) files constitute a wrapper around index (.idx) files
450 and make it possible for bup to expand Git's indexing capabilities to vast
453 def __init__(self, filename):
# force_keep protects unreadable/new-format files from cleanup in refresh().
455 self.force_keep = False
456 assert(filename.endswith('.midx'))
457 self.map = mmap_read(open(filename))
458 if str(self.map[0:4]) != 'MIDX':
459 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
460 self.force_keep = True
461 return self._init_failed()
462 ver = struct.unpack('!I', self.map[4:8])[0]
463 if ver < MIDX_VERSION:
464 log('Warning: ignoring old-style (v%d) midx %r\n'
466 self.force_keep = False # old stuff is boring
467 return self._init_failed()
468 if ver > MIDX_VERSION:
469 log('Warning: ignoring too-new (v%d) midx %r\n'
471 self.force_keep = True # new stuff is exciting
472 return self._init_failed()
# Layout: bits, fanout of 2^bits entries, sha table, then idx name list.
474 self.bits = _helpers.firstword(self.map[8:12])
475 self.entries = 2**self.bits
476 self.fanout = buffer(self.map, 12, self.entries*4)
477 shaofs = 12 + self.entries*4
478 nsha = self._fanget(self.entries-1)
479 self.shatable = buffer(self.map, shaofs, nsha*20)
480 self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
482 def _init_failed(self):
# Minimal dummy state so later method calls don't crash.
485 self.fanout = buffer('\0\0\0\0')
486 self.shatable = buffer('\0'*20)
489 def _fanget(self, i):
491 s = self.fanout[start:start+4]
492 return _helpers.firstword(s)
# _get(i) (header dropped): the i-th 20-byte sha.
495 return str(self.shatable[i*20:(i+1)*20])
497 def exists(self, hash):
498 """Return nonempty if the object exists in the index files."""
499 global _total_searches, _total_steps
# Use the top self.bits bits of the hash to pick a fanout bucket, then
# interpolation-search within [start, end).
502 el = extract_bits(want, self.bits)
504 start = self._fanget(el-1)
505 startv = el << (32-self.bits)
509 end = self._fanget(el)
510 endv = (el+1) << (32-self.bits)
511 _total_steps += 1 # lookup table is a step
512 hashv = _helpers.firstword(hash)
513 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
516 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
# Interpolation step: guess position proportionally within the value range.
517 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
518 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
520 #print ' %08x' % self._num(v)
523 startv = _helpers.firstword(v)
526 endv = _helpers.firstword(v)
# __iter__ (header dropped): yield every sha in table order.
532 for i in xrange(self._fanget(self.entries-1)):
533 yield buffer(self.shatable, i*20, 20)
# __len__ (header dropped): total object count from the last fanout slot.
536 return int(self._fanget(self.entries-1))
# PackIdxList methods (the 'class PackIdxList' header and the module-global
# _mpi_count declaration are not visible in this listing).
541 def __init__(self, dir):
543 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
548 self.do_bloom = False
# __del__ (header dropped): decrement the singleton guard.
555 assert(_mpi_count == 0)
# __iter__ (header dropped): merged iteration over all contained packs.
558 return iter(idxmerge(self.packs))
# __len__ (header dropped).
561 return sum(len(pack) for pack in self.packs)
563 def exists(self, hash):
564 """Return nonempty if the object exists in the index files."""
565 global _total_searches
567 if hash in self.also:
# Bloom pre-filter: a negative answer is definitive; a positive one
# forces a real lookup in the packs below.
569 if self.do_bloom and self.bloom is not None:
570 _total_searches -= 1 # will be incremented by bloom
571 if self.bloom.exists(hash):
572 self.do_bloom = False
575 for i in xrange(len(self.packs)):
577 _total_searches -= 1 # will be incremented by sub-pack
579 # reorder so most recently used packs are searched first
580 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
585 def refresh(self, skip_midx = False):
586 """Refresh the index list.
587 This method verifies if .midx files were superseded (e.g. all of its
588 contents are in another, bigger .midx file) and removes the superseded
591 If skip_midx is True, all work on .midx files will be skipped and .midx
592 files will be removed from the list.
594 The module-global variable 'ignore_midx' can force this function to
595 always act as if skip_midx was True.
597 self.bloom = None # Always reopen the bloom as it may have been replaced
598 self.do_bloom = False
599 skip_midx = skip_midx or ignore_midx
600 d = dict((p.name, p) for p in self.packs
601 if not skip_midx or not isinstance(p, PackMidx))
602 if os.path.exists(self.dir):
# midx path: map every idx name covered by an already-open midx.
605 for ix in self.packs:
606 if isinstance(ix, PackMidx):
607 for name in ix.idxnames:
608 d[os.path.join(self.dir, name)] = ix
609 for full in glob.glob(os.path.join(self.dir,'*.midx')):
612 (mxd, mxf) = os.path.split(mx.name)
614 for n in mx.idxnames:
615 if not os.path.exists(os.path.join(mxd, n)):
616 log(('warning: index %s missing\n' +
617 ' used by %s\n') % (n, mxf))
# Prefer larger midx files; drop ones fully covered elsewhere.
624 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
627 for sub in ix.idxnames:
628 found = d.get(os.path.join(self.dir, sub))
629 if not found or isinstance(found, PackIdx):
630 # doesn't exist, or exists but not in a midx
635 for name in ix.idxnames:
636 d[os.path.join(self.dir, name)] = ix
637 elif not ix.force_keep:
638 debug1('midx: removing redundant: %s\n'
639 % os.path.basename(ix.name))
641 for full in glob.glob(os.path.join(self.dir,'*.idx')):
649 bfull = os.path.join(self.dir, 'bup.bloom')
650 if self.bloom is None and os.path.exists(bfull):
651 self.bloom = ShaBloom(bfull)
652 self.packs = list(set(d.values()))
653 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
# Only trust the bloom if it covers at least everything the packs do.
654 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
658 debug1('PackIdxList: using %d index%s.\n'
659 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
661 def packname_containing(self, hash):
662 # figure out which pack contains a given hash.
663 # FIXME: if the midx file format would just *store* this information,
664 # we could calculate it a lot more efficiently. But it's not needed
665 # often, so let's do it like this.
666 for f in glob.glob(os.path.join(self.dir,'*.idx')):
667 full = os.path.join(self.dir, f)
# add() (header dropped): record an extra known-present hash in self.also.
677 """Insert an additional object in the list."""
681 def calc_hash(type, content):
682 """Calculate some content's hash in the Git fashion."""
# Git object id = sha1 of "<type> <len>\0" + content.  NOTE(review): the
# Sha1() construction and digest return lines were dropped from this listing.
683 header = '%s %d\0' % (type, len(content))
689 def _shalist_sort_key(ent):
690 (mode, name, id) = ent
691 if stat.S_ISDIR(int(mode, 8)):
# open_idx: dispatch on filename/header to the right PackIdx* class.
697 def open_idx(filename):
698 if filename.endswith('.idx'):
699 f = open(filename, 'rb')
# NOTE(review): the 'header = f.read(8)' line was dropped from this listing.
701 if header[0:4] == '\377tOc':
702 version = struct.unpack('!I', header[4:8])[0]
704 return PackIdxV2(filename, f)
706 raise GitError('%s: expected idx file version 2, got %d'
707 % (filename, version))
# V1 files have no magic; anything sorting below the V2 magic is assumed V1.
708 elif len(header) == 8 and header[0:4] < '\377tOc':
709 return PackIdxV1(filename, f)
711 raise GitError('%s: unrecognized idx file header' % filename)
712 elif filename.endswith('.midx'):
713 return PackMidx(filename)
715 raise GitError('idx filenames must end with .idx or .midx')
718 def idxmerge(idxlist, final_progress=True):
719 """Generate a list of all the objects reachable in a PackIdxList."""
720 def pfunc(count, total):
721 progress('Reading indexes: %.2f%% (%d/%d)\r'
722 % (count*100.0/total, count, total))
723 def pfinal(count, total):
725 log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
726 return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default objcache factory: index list over the local pack directory."""
    return PackIdxList(repo('objects/pack'))
# PackWriter methods (the 'class PackWriter' header line was dropped from
# this listing).  Writes objects into a local .pack + .idx pair.
733 """Writes Git objects inside a pack file."""
734 def __init__(self, objcache_maker=_make_objcache):
740 self.objcache_maker = objcache_maker
# _open() (header dropped): create the temp pack file and its header.
748 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
749 self.file = os.fdopen(fd, 'w+b')
750 assert(name.endswith('.pack'))
751 self.filename = name[:-5]
# 'PACK', version 2, object count 0 (patched at close time).
752 self.file.write('PACK\0\0\0\2\0\0\0\0')
# Per-first-byte buckets for the fanout table written by _write_pack_idx_v2.
753 self.idx = list(list() for i in xrange(256))
755 # the 'sha' parameter is used in client.py's _raw_write(), but not needed
756 # in this basic version.
757 def _raw_write(self, datalist, sha):
760 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
761 # the file never has a *partial* blob. So let's make sure it's
762 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
763 # to our hashsplit algorithm.) f.write() does its own buffering,
764 # but that's okay because we'll flush it in _end().
765 oneblob = ''.join(datalist)
# Python 2 three-argument raise: re-raise as GitError keeping the traceback.
769 raise GitError, e, sys.exc_info()[2]
771 crc = zlib.crc32(oneblob) & 0xffffffff
772 self._update_idx(sha, crc, nw)
777 def _update_idx(self, sha, crc, size):
780 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
782 def _write(self, sha, type, content):
786 sha = calc_hash(type, content)
787 size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
790 def breakpoint(self):
791 """Clear byte and object counts and return the last processed id."""
793 self.outbytes = self.count = 0
796 def _require_objcache(self):
797 if self.objcache is None and self.objcache_maker:
798 self.objcache = self.objcache_maker()
799 if self.objcache is None:
801 "PackWriter not opened or can't check exists w/o objcache")
803 def exists(self, id):
804 """Return non-empty if an object is found in the object cache."""
805 self._require_objcache()
806 return self.objcache.exists(id)
808 def maybe_write(self, type, content):
809 """Write an object to the pack file if not present and return its id."""
810 self._require_objcache()
811 sha = calc_hash(type, content)
812 if not self.exists(sha):
813 self._write(sha, type, content)
814 self.objcache.add(sha)
817 def new_blob(self, blob):
818 """Create a blob object in the pack with the supplied content."""
819 return self.maybe_write('blob', blob)
821 def new_tree(self, shalist):
822 """Create a tree object in the pack."""
823 shalist = sorted(shalist, key = _shalist_sort_key)
825 for (mode,name,bin) in shalist:
# Git tree entries never have a leading zero in the mode.
828 assert(mode[0] != '0')
830 assert(len(bin) == 20)
831 l.append('%s %s\0%s' % (mode,name,bin))
832 return self.maybe_write('tree', ''.join(l))
834 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
836 if tree: l.append('tree %s' % tree.encode('hex'))
837 if parent: l.append('parent %s' % parent.encode('hex'))
838 if author: l.append('author %s %s' % (author, _git_date(adate)))
839 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
842 return self.maybe_write('commit', '\n'.join(l))
844 def new_commit(self, parent, tree, date, msg):
845 """Create a commit object in the pack."""
846 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
847 commit = self._new_commit(tree, parent,
848 userline, date, userline, date,
# abort() (header dropped): discard the in-progress pack.
853 """Remove the pack file from disk."""
859 os.unlink(self.filename + '.pack')
861 def _end(self, run_midx=True):
863 if not f: return None
869 # update object count
871 cp = struct.pack('!i', self.count)
875 # calculate the pack sha1sum
878 for b in chunkyreader(f):
880 packbin = sum.digest()
884 idx_f = open(self.filename + '.idx', 'wb')
885 obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
# Final name embeds the sorted-object-list sha, like git's pack naming.
888 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
889 if os.path.exists(self.filename + '.map'):
890 os.unlink(self.filename + '.map')
891 os.rename(self.filename + '.pack', nameprefix + '.pack')
892 os.rename(self.filename + '.idx', nameprefix + '.idx')
895 auto_midx(repo('objects/pack'))
898 def close(self, run_midx=True):
899 """Close the pack file and move it to its definitive path."""
900 return self._end(run_midx=run_midx)
902 def _write_pack_idx_v2(self, file, idx, packbin):
# V2 idx: magic, version, fanout, shas, crcs, offsets, 64-bit offsets.
909 write('\377tOc\0\0\0\2')
914 write(struct.pack('!i', n))
915 part.sort(key=lambda x: x[0])
917 obj_list_sum = Sha1()
921 obj_list_sum.update(entry[0])
924 write(struct.pack('!I', entry[1]))
# Offsets >= 2GiB go in the trailing 64-bit table; store its index here.
928 if entry[2] & 0x80000000:
929 write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
930 ofs64_list.append(struct.pack('!Q', entry[2]))
932 write(struct.pack('!i', entry[2]))
933 for ofs64 in ofs64_list:
937 file.write(sum.digest())
938 return obj_list_sum.hexdigest()
# _git_date() (def line dropped from this listing): git-style "epoch tz".
942 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
# _gitenv() (def line dropped): point GIT_DIR at the bup repository; used
# as a subprocess preexec_fn throughout this module.
946 os.environ['GIT_DIR'] = os.path.abspath(repo())
949 def list_refs(refname = None):
950 """Generate a list of tuples in the form (refname,hash).
951 If a ref name is specified, list only this particular ref.
953 argv = ['git', 'show-ref', '--']
956 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
957 out = p.stdout.read().strip()
958 rv = p.wait() # not fatal
962 for d in out.split('\n'):
963 (sha, name) = d.split(' ', 1)
964 yield (name, sha.decode('hex'))
967 def read_ref(refname):
968 """Get the commit id of the most recent commit made on a given ref."""
969 l = list(list_refs(refname))
977 def rev_list(ref, count=None):
978 """Generate a list of reachable commits in reverse chronological order.
980 This generator walks through commits, from child to parent, that are
981 reachable via the specified ref and yields a series of tuples of the form
984 If count is a non-zero integer, limit the number of commits to "count"
# Guard against ref names being parsed as git options.
987 assert(not ref.startswith('-'))
990 opts += ['-n', str(atoi(count))]
991 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
992 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
996 if s.startswith('commit '):
997 commit = s[7:].decode('hex')
1000 yield (date, commit)
1001 rv = p.wait() # not fatal
1003 raise GitError, 'git rev-list returned error %d' % rv
1006 def rev_get_date(ref):
1007 """Get the date of the latest commit on the specified ref."""
1008 for (date, commit) in rev_list(ref, count=1):
1010 raise GitError, 'no such commit %r' % ref
1013 def rev_parse(committish):
1014 """Resolve the full hash for 'committish', if it exists.
1016 Should be roughly equivalent to 'git rev-parse'.
1018 Returns the hex value of the hash if it is found, None if 'committish' does
1019 not correspond to anything.
1021 head = read_ref(committish)
1023 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1026 pL = PackIdxList(repo('objects/pack'))
# Looks like a full 40-char hex id: check the local packs directly.
1028 if len(committish) == 40:
1030 hash = committish.decode('hex')
1040 def update_ref(refname, newval, oldval):
1041 """Change the commit pointed to by a branch."""
1044 assert(refname.startswith('refs/heads/'))
1045 p = subprocess.Popen(['git', 'update-ref', refname,
1046 newval.encode('hex'), oldval.encode('hex')],
1047 preexec_fn = _gitenv)
1048 _git_wait('git update-ref', p)
1051 def guess_repo(path=None):
1052 """Set the path value in the global variable "repodir".
1053 This makes bup look for an existing bup repository, but not fail if a
1054 repository doesn't exist. Usually, if you are interacting with a bup
1055 repository, you would not be calling this function but using
1056 check_repo_or_die().
# Precedence: explicit path, then $BUP_DIR, then ~/.bup.
1062 repodir = os.environ.get('BUP_DIR')
1064 repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the parent directory does not exist or if the
    target exists and is not a directory.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # BUG FIX: the original message used '%d' on a string path, which
        # would raise TypeError instead of reporting the real problem.
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
1086 def check_repo_or_die(path=None):
1087 """Make sure a bup repository exists, and abort if not.
1088 If the path to a particular repository was not specified, this function
1089 initializes the default repository automatically.
1092 if not os.path.isdir(repo('objects/pack/.')):
1093 if repodir == home_repodir:
1096 log('error: %r is not a bup/git repository\n' % repo())
# treeparse() (def line dropped from this listing): iterate a raw git tree.
1101 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1103 while ofs < len(buf):
# Each entry: "<mode> <name>\0" followed by a raw 20-byte sha.
1104 z = buf[ofs:].find('\0')
1106 spl = buf[ofs:ofs+z].split(' ', 1)
1107 assert(len(spl) == 2)
1108 sha = buf[ofs+z+1:ofs+z+1+20]
1110 yield (spl[0], spl[1], sha)
# ver() (def line dropped from this listing): cached git version check.
1115 """Get Git's version and ensure a usable version is installed.
1117 The returned version is formatted as an ordered tuple with each position
1118 representing a digit in the version tag. For example, the following tuple
1119 would represent version 1.6.6.9:
1121 ('1', '6', '6', '9')
1125 p = subprocess.Popen(['git', '--version'],
1126 stdout=subprocess.PIPE)
1127 gvs = p.stdout.read()
1128 _git_wait('git --version', p)
# NOTE(review): the '.' in the pattern is unescaped, so it matches any
# character; harmless here but r'(\S+\.\S+)' was probably intended.
1129 m = re.match(r'git version (\S+.\S+)', gvs)
1131 raise GitError('git --version weird output: %r' % gvs)
1132 _ver = tuple(m.group(1).split('.'))
1133 needed = ('1','5', '3', '1')
1135 raise GitError('git version %s or higher is required; you have %s'
1136 % ('.'.join(needed), '.'.join(_ver)))
1140 def _git_wait(cmd, p):
# NOTE(review): the 'rv = p.wait()' line was dropped from this listing.
1143 raise GitError('%s returned %d' % (cmd, rv))
1146 def _git_capture(argv):
1147 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1149 _git_wait(repr(argv), p)
# Iterator wrapper that can be aborted mid-stream (used by CatPipe to keep
# the cat-file pipe in a sane state if a consumer stops early).
1153 class _AbortableIter:
1154 def __init__(self, it, onabort = None):
1156 self.onabort = onabort
1164 return self.it.next()
1165 except StopIteration, e:
1173 """Abort iteration and call the abortion callback, if needed."""
# CatPipe methods (the 'class CatPipe' header line was dropped from this
# listing).  Streams object contents via a long-lived 'git cat-file --batch'
# subprocess when git is new enough, else one process per object.
1185 """Link to 'git cat-file' that is used to retrieve blob data."""
1188 wanted = ('1','5','6')
1191 log('warning: git version < %s; bup will be slow.\n'
# Old git: fall back to spawning cat-file per request.
1194 self.get = self._slow_get
1196 self.p = self.inprogress = None
1197 self.get = self._fast_get
# _abort() (header dropped): tear down the batch subprocess.
1201 self.p.stdout.close()
1202 self.p.stdin.close()
1204 self.inprogress = None
# _restart() (header dropped): (re)spawn the batch subprocess.
1208 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1209 stdin=subprocess.PIPE,
1210 stdout=subprocess.PIPE,
1213 preexec_fn = _gitenv)
1215 def _fast_get(self, id):
1216 if not self.p or self.p.poll() != None:
1219 assert(self.p.poll() == None)
1221 log('_fast_get: opening %r while %r is open'
1222 % (id, self.inprogress))
# Only one object may be streamed from the pipe at a time.
1223 assert(not self.inprogress)
# Reject ids that could corrupt the line-oriented protocol.
1224 assert(id.find('\n') < 0)
1225 assert(id.find('\r') < 0)
1226 assert(not id.startswith('-'))
1227 self.inprogress = id
1228 self.p.stdin.write('%s\n' % id)
1229 self.p.stdin.flush()
1230 hdr = self.p.stdout.readline()
1231 if hdr.endswith(' missing\n'):
1232 self.inprogress = None
1233 raise KeyError('blob %r is missing' % id)
# Batch header format: "<sha> <type> <size>".
1234 spl = hdr.split(' ')
1235 if len(spl) != 3 or len(spl[0]) != 40:
1236 raise GitError('expected blob, got %r' % spl)
1237 (hex, type, size) = spl
1239 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1240 onabort = self._abort)
# cat-file emits a trailing newline after each object's payload.
1245 assert(self.p.stdout.readline() == '\n')
1246 self.inprogress = None
1247 except Exception, e:
1251 def _slow_get(self, id):
1252 assert(id.find('\n') < 0)
1253 assert(id.find('\r') < 0)
1254 assert(id[0] != '-')
1255 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1258 p = subprocess.Popen(['git', 'cat-file', type, id],
1259 stdout=subprocess.PIPE,
1260 preexec_fn = _gitenv)
1261 for blob in chunkyreader(p.stdout):
1263 _git_wait('git cat-file', p)
# _join(): recursively flatten an object stream into raw blob data.
1265 def _join(self, it):
1270 elif type == 'tree':
1271 treefile = ''.join(it)
1272 for (mode, name, sha) in treeparse(treefile):
1273 for blob in self.join(sha.encode('hex')):
1275 elif type == 'commit':
# A commit's first line is 'tree <hex>'; recurse into that tree.
1276 treeline = ''.join(it).split('\n')[0]
1277 assert(treeline.startswith('tree '))
1278 for blob in self.join(treeline[5:]):
1281 raise GitError('invalid object type %r: expected blob/tree/commit'
# join() (header dropped): public wrapper around _join().
1285 """Generate a list of the content of all blobs that can be reached
1286 from an object. The hash given in 'id' must point to a blob, a tree
1287 or a commit. The content of all blobs that can be seen from trees or
1288 commits will be added to the list.
1291 for d in self._join(self.get(id)):
1293 except StopIteration:
# tags() (def line dropped from this listing): map hash -> [tag names].
1297 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1299 for (n,c) in list_refs():
1300 if n.startswith('refs/tags/'):
1305 tags[c].append(name) # more than one tag can point at 'c'