1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
11 """Discussion of bloom constants for bup:
13 There are four basic things to consider when building a bloom filter:
14 The size, in bits, of the filter
15 The capacity, in entries, of the filter
16 The probability of a false positive that is tolerable
17 The number of bits readily available to use for addressing filter bits
19 There is one major tunable that is not directly related to the above:
20 k: the number of bits set in the filter per entry
22 Here's a wall of numbers showing the relationship between k; the ratio (m/n) between
23 the filter size in bits and the number of entries in the filter; and pfalse_positive (%):
25 m/n|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
26 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
27 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
28 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
29 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
30 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
31 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
32 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
33 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
34 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
35 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
36 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
37 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
38 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
39 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
40 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
41 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
42 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
43 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
44 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
45 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
46 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
47 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
48 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
49 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
50 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
52 Here's a table showing available repository size for a given pfalse_positive
53 and three values of k (assuming we only use the 160-bit SHA1 for addressing the
54 filter and 8192 bytes per object):
56 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
57 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
58 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
59 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
61 This eliminates pretty neatly any k>6 as long as we use the raw SHA for addressing the filter.
64 Filter size scales linearly with repository size for a given k and pfalse.
66 Here's a table of filter sizes for a 1 TiB repository:
68 pfalse| k=3 | k=4 | k=5 | k=6
69 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
70 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
71 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
74 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
75 per lookup will be worse than the two required for midx.
76 * We want the pfalse_positive to be low enough that the cost of sometimes
77 faulting on the midx doesn't overcome the benefit of the bloom filter.
78 * We have readily available 160 bits for addressing the filter.
79 * We want to be able to have a single bloom address entire repositories of reasonable size.
82 Based on these parameters, a combination of k=4 and k=5 provides the behavior
83 that bup needs. As such, I've implemented bloom addressing, adding and
84 checking functions in C for these two values. Because k=5 requires less space
85 and gives better overall pfalse_positive performance, it is preferred if a
86 table with k=5 can represent the repository.
88 None of this tells us what max_pfalse_positive to choose.
90 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
93 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
94 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
95 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
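# A worked example of the numbers above (illustrative only; the values below
# are not used anywhere else in bup).  The standard bloom false-positive
# estimate for n entries in an m-bit filter with k hash functions is
# (1 - e**(-k*n/m))**k, which is the same formula ShaBloom.pfalse_positive()
# below returns as a percentage:
#
#   k = 5
#   m = 2**32        # bits: MAX_BLOOM_BITS[5] == 29 byte-address bits,
#                    # i.e. a 2**29-byte (512 MiB) filter
#   n = 2**27        # entries, so m/n == 32
#   100 * (1 - math.exp(-k * float(n) / m)) ** k   # ~0.00633%
#
# which matches the m/n=32, k=5 cell of the table above.  In the second
# table, the 'cap' column is simply obj * 8192 bytes, e.g.
# 139333497228 * 8192 bytes ~= 1038.11 TiB.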
99 home_repodir = os.path.expanduser('~/.bup')
102 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
103 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
109 class GitError(Exception):
114 """Get the path to the git repository or one of its subdirectories."""
117 raise GitError('You should call check_repo_or_die()')
119 # If there's a .git subdirectory, then the actual repo is in there.
120 gd = os.path.join(repodir, '.git')
121 if os.path.exists(gd):
124 return os.path.join(repodir, sub)
127 def auto_midx(objdir):
128 args = [path.exe(), 'midx', '--auto', '--dir', objdir]
130 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
132 # make sure 'args' gets printed to help with debugging
133 add_error('%r: exception: %s' % (args, e))
136 add_error('%r: returned %d' % (args, rv))
138 args = [path.exe(), 'bloom', '--dir', objdir]
140 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
142 # make sure 'args' gets printed to help with debugging
143 add_error('%r: exception: %s' % (args, e))
146 add_error('%r: returned %d' % (args, rv))
149 def mangle_name(name, mode, gitmode):
150 """Mangle a file name to present an abstract name for segmented files.
151 Mangled file names will have the ".bup" extension added to them. If a
152 file's name already ends with ".bup", a ".bupl" extension is added to
153 disambiguate normal files from segmented ones.
155 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
157 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
158 return name + '.bupl'
163 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
164 def demangle_name(name):
165 """Remove name mangling from a file name, if necessary.
167 The return value is a tuple (demangled_filename,mode), where mode is one of
170 * BUP_NORMAL : files that should be read as-is from the repository
171 * BUP_CHUNKED : files that were chunked and need to be assembled
173 For more information on the name mangling algorithm, see mangle_name()
175 if name.endswith('.bupl'):
176 return (name[:-5], BUP_NORMAL)
177 elif name.endswith('.bup'):
178 return (name[:-4], BUP_CHUNKED)
180 return (name, BUP_NORMAL)
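# Illustrative round trip of the two functions above (the file names and
# modes here are made up):
#
#   mangle_name('photo.jpg', 0100644, 040000)    # regular file stored as a
#                                                # tree -> 'photo.jpg.bup'
#   mangle_name('notes.bup', 0100644, 0100644)   # -> 'notes.bup.bupl'
#   demangle_name('photo.jpg.bup')   # -> ('photo.jpg', BUP_CHUNKED)
#   demangle_name('notes.bup.bupl')  # -> ('notes.bup', BUP_NORMAL)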
183 def _encode_packobj(type, content):
186 szbits = (sz & 0x0f) | (_typemap[type]<<4)
189 if sz: szbits |= 0x80
195 z = zlib.compressobj(1)
197 yield z.compress(content)
201 def _encode_looseobj(type, content):
202 z = zlib.compressobj(1)
203 yield z.compress('%s %d\0' % (type, len(content)))
204 yield z.compress(content)
208 def _decode_looseobj(buf):
210 s = zlib.decompress(buf)
217 assert(type in _typemap)
218 assert(sz == len(content))
219 return (type, content)
222 def _decode_packobj(buf):
225 type = _typermap[(c & 0x70) >> 4]
232 sz |= (c & 0x7f) << shift
236 return (type, zlib.decompress(buf[i+1:]))
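# Sketch of the header that _encode_packobj()/_decode_packobj() agree on
# (it mirrors Git's pack object header; the concrete bytes below are just a
# worked example):
#
#   byte 0: bit 7 = "more size bytes follow", bits 4-6 = object type,
#           bits 0-3 = low 4 bits of the size
#   byte n: bit 7 = "more follows", bits 0-6 = the next 7 size bits
#           (least-significant group first)
#
#   e.g. a 1000-byte blob (type 3):
#     1000 & 0xf = 8, remaining bits = 1000 >> 4 = 62
#     byte 0 = 0x80 | (3 << 4) | 8 = 0xb8
#     byte 1 = 62 = 0x3e            (high bit clear, so the size ends here)
#   header = '\xb8\x3e', followed by the zlib-compressed content.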
243 def find_offset(self, hash):
244 """Get the offset of an object inside the index file."""
245 idx = self._idx_from_hash(hash)
247 return self._ofs_from_idx(idx)
250 def exists(self, hash, want_source=False):
251 """Return nonempty if the object exists in this index."""
252 if hash and (self._idx_from_hash(hash) != None):
253 return want_source and self.name or True
257 return int(self.fanout[255])
259 def _idx_from_hash(self, hash):
260 global _total_searches, _total_steps
262 assert(len(hash) == 20)
264 start = self.fanout[b1-1] # range -1..254
265 end = self.fanout[b1] # range 0..255
267 _total_steps += 1 # lookup table is a step
270 mid = start + (end-start)/2
271 v = self._idx_to_hash(mid)
281 class PackIdxV1(PackIdx):
282 """Object representation of a Git pack index (version 1) file."""
283 def __init__(self, filename, f):
285 self.idxnames = [self.name]
286 self.map = mmap_read(f)
287 self.fanout = list(struct.unpack('!256I',
288 str(buffer(self.map, 0, 256*4))))
289 self.fanout.append(0) # entry "-1"
290 nsha = self.fanout[255]
292 self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
294 def _ofs_from_idx(self, idx):
295 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
297 def _idx_to_hash(self, idx):
298 return str(self.shatable[idx*24+4 : idx*24+24])
301 for i in xrange(self.fanout[255]):
302 yield buffer(self.map, 256*4 + 24*i + 4, 20)
305 class PackIdxV2(PackIdx):
306 """Object representation of a Git pack index (version 2) file."""
307 def __init__(self, filename, f):
309 self.idxnames = [self.name]
310 self.map = mmap_read(f)
311 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
312 self.fanout = list(struct.unpack('!256I',
313 str(buffer(self.map, 8, 256*4))))
314 self.fanout.append(0) # entry "-1"
315 nsha = self.fanout[255]
316 self.sha_ofs = 8 + 256*4
317 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
318 self.ofstable = buffer(self.map,
319 self.sha_ofs + nsha*20 + nsha*4,
321 self.ofs64table = buffer(self.map,
322 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
324 def _ofs_from_idx(self, idx):
325 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
327 idx64 = ofs & 0x7fffffff
328 ofs = struct.unpack('!Q',
329 str(buffer(self.ofs64table, idx64*8, 8)))[0]
332 def _idx_to_hash(self, idx):
333 return str(self.shatable[idx*20:(idx+1)*20])
336 for i in xrange(self.fanout[255]):
337 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
340 extract_bits = _helpers.extract_bits
342 bloom_contains = _helpers.bloom_contains
343 bloom_add = _helpers.bloom_add
347 """Wrapper which contains data from multiple index files.
349 def __init__(self, filename, f=None, readwrite=False):
353 assert(filename.endswith('.bloom'))
355 self.rwfile = f or open(filename, 'r+b')
356 self.map = mmap_readwrite(self.rwfile, close=False)
359 self.map = mmap_read(f or open(filename, 'rb'))
360 if str(self.map[0:4]) != 'BLOM':
361 log('Warning: skipping: invalid BLOM header in %r\n' % filename)
362 return self._init_failed()
363 ver = struct.unpack('!I', self.map[4:8])[0]
364 if ver < BLOOM_VERSION:
365 log('Warning: ignoring old-style (v%d) bloom %r\n'
367 return self._init_failed()
368 if ver > BLOOM_VERSION:
369 log('Warning: ignoring too-new (v%d) bloom %r\n'
371 return self._init_failed()
373 self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
374 idxnamestr = str(self.map[16 + 2**self.bits:])
376 self.idxnames = idxnamestr.split('\0')
380 def _init_failed(self):
388 self.bits = self.entries = 0
391 return self.map and self.bits
399 debug2("bloom: closing with %d entries\n" % self.entries)
400 self.map[12:16] = struct.pack('!I', self.entries)
403 self.rwfile.seek(16 + 2**self.bits)
405 self.rwfile.write('\0'.join(self.idxnames))
408 def pfalse_positive(self, additional=0):
409 n = self.entries + additional
412 return 100*(1-math.exp(-k*float(n)/m))**k
414 def add_idx(self, ix):
415 """Add the object to the filter, return current pfalse_positive."""
416 if not self.map: raise Exception, "Cannot add to closed bloom"
417 self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
418 self.idxnames.append(os.path.basename(ix.name))
420 def exists(self, sha):
421 """Return nonempty if the object probably exists in the bloom filter."""
422 global _total_searches, _total_steps
424 if not self.map: return None
425 found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
426 _total_steps += steps
430 def create(cls, name, f=None, readwrite=False, expected=100000, k=None):
431 """Create and return a bloom filter for `expected` entries."""
432 bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
433 k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
434 if bits > MAX_BLOOM_BITS[k]:
435 log('bloom: warning, max bits exceeded, non-optimal\n')
436 bits = MAX_BLOOM_BITS[k]
437 debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
438 f = f or open(name, 'w+b')
440 f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
441 assert(f.tell() == 16)
442 # NOTE: On some systems this will not extend+zerofill, but it does on
443 # darwin, linux, bsd and solaris.
444 f.truncate(16+2**bits)
446 return cls(name, f=f, readwrite=readwrite)
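# Typical ShaBloom usage sketch (the path and the idx list are illustrative,
# not names this module defines):
#
#   b = ShaBloom.create(path, readwrite=True, expected=100000)
#   for ix in idx_files:      # opened PackIdxV1/PackIdxV2 objects
#       b.add_idx(ix)         # sets k bits per 20-byte sha in ix.shatable
#   b.close()                 # writes back the entry count and idx names
#   ...
#   b = ShaBloom(path)
#   if not b.exists(sha):     # a miss is definitive; a hit may be a false
#       ...                   # positive with probability ~pfalse_positive()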
453 """Wrapper which contains data from multiple index files.
454 Multiple index (.midx) files constitute a wrapper around index (.idx) files
455 and make it possible for bup to expand Git's indexing capabilities to vast amounts of files.
458 def __init__(self, filename):
460 self.force_keep = False
461 assert(filename.endswith('.midx'))
462 self.map = mmap_read(open(filename))
463 if str(self.map[0:4]) != 'MIDX':
464 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
465 self.force_keep = True
466 return self._init_failed()
467 ver = struct.unpack('!I', self.map[4:8])[0]
468 if ver < MIDX_VERSION:
469 log('Warning: ignoring old-style (v%d) midx %r\n'
471 self.force_keep = False # old stuff is boring
472 return self._init_failed()
473 if ver > MIDX_VERSION:
474 log('Warning: ignoring too-new (v%d) midx %r\n'
476 self.force_keep = True # new stuff is exciting
477 return self._init_failed()
479 self.bits = _helpers.firstword(self.map[8:12])
480 self.entries = 2**self.bits
481 self.fanout = buffer(self.map, 12, self.entries*4)
482 self.sha_ofs = 12 + self.entries*4
483 self.nsha = nsha = self._fanget(self.entries-1)
484 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
485 self.whichlist = buffer(self.map, self.sha_ofs + nsha*20, nsha*4)
486 self.idxname_ofs = self.sha_ofs + 24*nsha
487 self.idxnames = str(self.map[self.idxname_ofs:]).split('\0')
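# On-disk .midx layout, as parsed by __init__ above (integers big-endian):
#   bytes 0-3   'MIDX' magic
#   bytes 4-7   version (MIDX_VERSION)
#   bytes 8-11  'bits': log2 of the fanout table length
#   then        2**bits fanout entries, 4 bytes each
#   then        nsha sorted 20-byte sha1s
#   then        nsha 4-byte indexes into the idx name list ('whichlist')
#   then        the source .idx file names, '\0'-separated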
489 def _init_failed(self):
492 self.fanout = buffer('\0\0\0\0')
493 self.shatable = buffer('\0'*20)
496 def _fanget(self, i):
498 s = self.fanout[start:start+4]
499 return _helpers.firstword(s)
502 return str(self.shatable[i*20:(i+1)*20])
504 def _get_idx_i(self, i):
505 return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
507 def _get_idxname(self, i):
508 return self.idxnames[self._get_idx_i(i)]
510 def exists(self, hash, want_source=False):
511 """Return nonempty if the object exists in the index files."""
512 global _total_searches, _total_steps
515 el = extract_bits(want, self.bits)
517 start = self._fanget(el-1)
518 startv = el << (32-self.bits)
522 end = self._fanget(el)
523 endv = (el+1) << (32-self.bits)
524 _total_steps += 1 # lookup table is a step
525 hashv = _helpers.firstword(hash)
526 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
529 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
530 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
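# (Interpolation rather than bisection: 'mid' is a linear estimate of where
#  'hashv' should fall between startv and endv.  Since sha1 prefixes are
#  close to uniformly distributed, this usually narrows the range in far
#  fewer steps than a plain binary search would.)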
531 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
533 #print ' %08x' % self._num(v)
536 startv = _helpers.firstword(v)
539 endv = _helpers.firstword(v)
541 return want_source and self._get_idxname(mid) or True
545 for i in xrange(self._fanget(self.entries-1)):
546 yield buffer(self.shatable, i*20, 20)
549 return int(self._fanget(self.entries-1))
554 def __init__(self, dir):
556 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
561 self.do_bloom = False
568 assert(_mpi_count == 0)
571 return iter(idxmerge(self.packs))
574 return sum(len(pack) for pack in self.packs)
576 def exists(self, hash, want_source=False):
577 """Return nonempty if the object exists in the index files."""
578 global _total_searches
580 if hash in self.also:
582 if self.do_bloom and self.bloom is not None:
583 _total_searches -= 1 # will be incremented by bloom
584 if self.bloom.exists(hash):
585 self.do_bloom = False
588 for i in xrange(len(self.packs)):
590 _total_searches -= 1 # will be incremented by sub-pack
591 ix = p.exists(hash, want_source=want_source)
593 # reorder so most recently used packs are searched first
594 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
599 def refresh(self, skip_midx = False):
600 """Refresh the index list.
601 This method verifies if .midx files were superseded (e.g. all of their
602 contents are in another, bigger .midx file) and removes the superseded files.
605 If skip_midx is True, all work on .midx files will be skipped and .midx
606 files will be removed from the list.
608 The module-global variable 'ignore_midx' can force this function to
609 always act as if skip_midx was True.
611 self.bloom = None # Always reopen the bloom as it may have been replaced
612 self.do_bloom = False
613 skip_midx = skip_midx or ignore_midx
614 d = dict((p.name, p) for p in self.packs
615 if not skip_midx or not isinstance(p, PackMidx))
616 if os.path.exists(self.dir):
619 for ix in self.packs:
620 if isinstance(ix, PackMidx):
621 for name in ix.idxnames:
622 d[os.path.join(self.dir, name)] = ix
623 for full in glob.glob(os.path.join(self.dir,'*.midx')):
626 (mxd, mxf) = os.path.split(mx.name)
628 for n in mx.idxnames:
629 if not os.path.exists(os.path.join(mxd, n)):
630 log(('warning: index %s missing\n' +
631 ' used by %s\n') % (n, mxf))
638 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
641 for sub in ix.idxnames:
642 found = d.get(os.path.join(self.dir, sub))
643 if not found or isinstance(found, PackIdx):
644 # doesn't exist, or exists but not in a midx
649 for name in ix.idxnames:
650 d[os.path.join(self.dir, name)] = ix
651 elif not ix.force_keep:
652 debug1('midx: removing redundant: %s\n'
653 % os.path.basename(ix.name))
655 for full in glob.glob(os.path.join(self.dir,'*.idx')):
663 bfull = os.path.join(self.dir, 'bup.bloom')
664 if self.bloom is None and os.path.exists(bfull):
665 self.bloom = ShaBloom(bfull)
666 self.packs = list(set(d.values()))
667 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
668 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
672 debug1('PackIdxList: using %d index%s.\n'
673 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
676 """Insert an additional object in the list."""
680 def calc_hash(type, content):
681 """Calculate some content's hash in the Git fashion."""
682 header = '%s %d\0' % (type, len(content))
688 def _shalist_sort_key(ent):
689 (mode, name, id) = ent
690 if stat.S_ISDIR(int(mode, 8)):
696 def open_idx(filename):
697 if filename.endswith('.idx'):
698 f = open(filename, 'rb')
700 if header[0:4] == '\377tOc':
701 version = struct.unpack('!I', header[4:8])[0]
703 return PackIdxV2(filename, f)
705 raise GitError('%s: expected idx file version 2, got %d'
706 % (filename, version))
707 elif len(header) == 8 and header[0:4] < '\377tOc':
708 return PackIdxV1(filename, f)
710 raise GitError('%s: unrecognized idx file header' % filename)
711 elif filename.endswith('.midx'):
712 return PackMidx(filename)
714 raise GitError('idx filenames must end with .idx or .midx')
717 def idxmerge(idxlist, final_progress=True):
718 """Generate a list of all the objects reachable in a PackIdxList."""
719 def pfunc(count, total):
720 progress('Reading indexes: %.2f%% (%d/%d)\r'
721 % (count*100.0/total, count, total))
722 def pfinal(count, total):
724 log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
725 return merge_iter(idxlist, 10024, pfunc, pfinal)
728 def _make_objcache():
729 return PackIdxList(repo('objects/pack'))
732 """Writes Git objects insid a pack file."""
733 def __init__(self, objcache_maker=_make_objcache):
739 self.objcache_maker = objcache_maker
747 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
748 self.file = os.fdopen(fd, 'w+b')
749 assert(name.endswith('.pack'))
750 self.filename = name[:-5]
751 self.file.write('PACK\0\0\0\2\0\0\0\0')
752 self.idx = list(list() for i in xrange(256))
754 # the 'sha' parameter is used in client.py's _raw_write(), but not needed
755 # in this basic version.
756 def _raw_write(self, datalist, sha):
759 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
760 # the file never has a *partial* blob. So let's make sure it's
761 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
762 # to our hashsplit algorithm.) f.write() does its own buffering,
763 # but that's okay because we'll flush it in _end().
764 oneblob = ''.join(datalist)
768 raise GitError, e, sys.exc_info()[2]
770 crc = zlib.crc32(oneblob) & 0xffffffff
771 self._update_idx(sha, crc, nw)
776 def _update_idx(self, sha, crc, size):
779 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
781 def _write(self, sha, type, content):
785 sha = calc_hash(type, content)
786 size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
789 def breakpoint(self):
790 """Clear byte and object counts and return the last processed id."""
792 self.outbytes = self.count = 0
795 def _require_objcache(self):
796 if self.objcache is None and self.objcache_maker:
797 self.objcache = self.objcache_maker()
798 if self.objcache is None:
800 "PackWriter not opened or can't check exists w/o objcache")
802 def exists(self, id, want_source=False):
803 """Return non-empty if an object is found in the object cache."""
804 self._require_objcache()
805 return self.objcache.exists(id, want_source=want_source)
807 def maybe_write(self, type, content):
808 """Write an object to the pack file if not present and return its id."""
809 self._require_objcache()
810 sha = calc_hash(type, content)
811 if not self.exists(sha):
812 self._write(sha, type, content)
813 self.objcache.add(sha)
816 def new_blob(self, blob):
817 """Create a blob object in the pack with the supplied content."""
818 return self.maybe_write('blob', blob)
820 def new_tree(self, shalist):
821 """Create a tree object in the pack."""
822 shalist = sorted(shalist, key = _shalist_sort_key)
824 for (mode,name,bin) in shalist:
827 assert(mode[0] != '0')
829 assert(len(bin) == 20)
830 l.append('%s %s\0%s' % (mode,name,bin))
831 return self.maybe_write('tree', ''.join(l))
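# Each tree entry written above uses Git's canonical tree format:
#   '<mode> <name>\0' + 20 raw sha1 bytes
# For example (the name is only an illustration):
#   '100644 README\0' + <20-byte binary sha of the README blob>
# and the entries must be sorted with _shalist_sort_key() (directories sort
# as though their name ended in '/') for the resulting tree sha to match
# what Git itself would compute.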
833 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
835 if tree: l.append('tree %s' % tree.encode('hex'))
836 if parent: l.append('parent %s' % parent.encode('hex'))
837 if author: l.append('author %s %s' % (author, _git_date(adate)))
838 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
841 return self.maybe_write('commit', '\n'.join(l))
843 def new_commit(self, parent, tree, date, msg):
844 """Create a commit object in the pack."""
845 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
846 commit = self._new_commit(tree, parent,
847 userline, date, userline, date,
852 """Remove the pack file from disk."""
858 os.unlink(self.filename + '.pack')
860 def _end(self, run_midx=True):
862 if not f: return None
868 # update object count
870 cp = struct.pack('!i', self.count)
874 # calculate the pack sha1sum
877 for b in chunkyreader(f):
879 packbin = sum.digest()
883 idx_f = open(self.filename + '.idx', 'wb')
884 obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
887 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
888 if os.path.exists(self.filename + '.map'):
889 os.unlink(self.filename + '.map')
890 os.rename(self.filename + '.pack', nameprefix + '.pack')
891 os.rename(self.filename + '.idx', nameprefix + '.idx')
894 auto_midx(repo('objects/pack'))
897 def close(self, run_midx=True):
898 """Close the pack file and move it to its definitive path."""
899 return self._end(run_midx=run_midx)
901 def _write_pack_idx_v2(self, file, idx, packbin):
908 write('\377tOc\0\0\0\2')
913 write(struct.pack('!i', n))
914 part.sort(key=lambda x: x[0])
916 obj_list_sum = Sha1()
920 obj_list_sum.update(entry[0])
923 write(struct.pack('!I', entry[1]))
927 if entry[2] & 0x80000000:
928 write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
929 ofs64_list.append(struct.pack('!Q', entry[2]))
931 write(struct.pack('!i', entry[2]))
932 for ofs64 in ofs64_list:
936 file.write(sum.digest())
937 return obj_list_sum.hexdigest()
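# The index written above follows Git's pack .idx version 2 layout:
#   '\377tOc' magic, 4-byte version (2)
#   256-entry fanout table of cumulative object counts
#   the sorted 20-byte sha1s
#   one 4-byte CRC32 per object
#   one 4-byte offset per object (high bit set means "index into the
#     8-byte offset table that follows", for packs larger than 2 GiB)
#   trailer: the pack's sha1, then the sha1 of the index itself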
941 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
945 os.environ['GIT_DIR'] = os.path.abspath(repo())
948 def list_refs(refname = None):
949 """Generate a list of tuples in the form (refname,hash).
950 If a ref name is specified, list only this particular ref.
952 argv = ['git', 'show-ref', '--']
955 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
956 out = p.stdout.read().strip()
957 rv = p.wait() # not fatal
961 for d in out.split('\n'):
962 (sha, name) = d.split(' ', 1)
963 yield (name, sha.decode('hex'))
966 def read_ref(refname):
967 """Get the commit id of the most recent commit made on a given ref."""
968 l = list(list_refs(refname))
976 def rev_list(ref, count=None):
977 """Generate a list of reachable commits in reverse chronological order.
979 This generator walks through commits, from child to parent, that are
980 reachable via the specified ref and yields a series of tuples of the form (date, commit).
983 If count is a non-zero integer, limit the number of commits to "count" objects.
986 assert(not ref.startswith('-'))
989 opts += ['-n', str(atoi(count))]
990 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
991 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
995 if s.startswith('commit '):
996 commit = s[7:].decode('hex')
1000 rv = p.wait() # not fatal
1002 raise GitError, 'git rev-list returned error %d' % rv
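# Usage sketch (the ref name is only an example):
#
#   for date, sha in rev_list('refs/heads/master', count=10):
#       # 'date' is the commit timestamp parsed from --pretty=format:%ct,
#       # 'sha' is the 20-byte binary commit id
#       ...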
1005 def rev_get_date(ref):
1006 """Get the date of the latest commit on the specified ref."""
1007 for (date, commit) in rev_list(ref, count=1):
1009 raise GitError, 'no such commit %r' % ref
1012 def rev_parse(committish):
1013 """Resolve the full hash for 'committish', if it exists.
1015 Should be roughly equivalent to 'git rev-parse'.
1017 Returns the hex value of the hash if it is found, None if 'committish' does
1018 not correspond to anything.
1020 head = read_ref(committish)
1022 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1025 pL = PackIdxList(repo('objects/pack'))
1027 if len(committish) == 40:
1029 hash = committish.decode('hex')
1039 def update_ref(refname, newval, oldval):
1040 """Change the commit pointed to by a branch."""
1043 assert(refname.startswith('refs/heads/'))
1044 p = subprocess.Popen(['git', 'update-ref', refname,
1045 newval.encode('hex'), oldval.encode('hex')],
1046 preexec_fn = _gitenv)
1047 _git_wait('git update-ref', p)
1050 def guess_repo(path=None):
1051 """Set the path value in the global variable "repodir".
1052 This makes bup look for an existing bup repository, but not fail if a
1053 repository doesn't exist. Usually, if you are interacting with a bup
1054 repository, you would not be calling this function but using
1055 check_repo_or_die().
1061 repodir = os.environ.get('BUP_DIR')
1063 repodir = os.path.expanduser('~/.bup')
1066 def init_repo(path=None):
1067 """Create the Git bare repository for bup in a given path."""
1069 d = repo() # appends a / to the path
1070 parent = os.path.dirname(os.path.dirname(d))
1071 if parent and not os.path.exists(parent):
1072 raise GitError('parent directory "%s" does not exist\n' % parent)
1073 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1074 raise GitError('"%s" exists but is not a directory\n' % d)
1075 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1076 preexec_fn = _gitenv)
1077 _git_wait('git init', p)
1078 # Force the index version configuration in order to ensure bup works
1079 # regardless of the version of the installed Git binary.
1080 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1081 stdout=sys.stderr, preexec_fn = _gitenv)
1082 _git_wait('git config', p)
1085 def check_repo_or_die(path=None):
1086 """Make sure a bup repository exists, and abort if not.
1087 If the path to a particular repository was not specified, this function
1088 initializes the default repository automatically.
1091 if not os.path.isdir(repo('objects/pack/.')):
1092 if repodir == home_repodir:
1095 log('error: %r is not a bup/git repository\n' % repo())
1100 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1102 while ofs < len(buf):
1103 z = buf[ofs:].find('\0')
1105 spl = buf[ofs:ofs+z].split(' ', 1)
1106 assert(len(spl) == 2)
1107 sha = buf[ofs+z+1:ofs+z+1+20]
1109 yield (spl[0], spl[1], sha)
1114 """Get Git's version and ensure a usable version is installed.
1116 The returned version is formatted as an ordered tuple with each position
1117 representing a digit in the version tag. For example, the following tuple
1118 would represent version 1.6.6.9:
1120 ('1', '6', '6', '9')
1124 p = subprocess.Popen(['git', '--version'],
1125 stdout=subprocess.PIPE)
1126 gvs = p.stdout.read()
1127 _git_wait('git --version', p)
1128 m = re.match(r'git version (\S+.\S+)', gvs)
1130 raise GitError('git --version weird output: %r' % gvs)
1131 _ver = tuple(m.group(1).split('.'))
1132 needed = ('1','5', '3', '1')
1134 raise GitError('git version %s or higher is required; you have %s'
1135 % ('.'.join(needed), '.'.join(_ver)))
1139 def _git_wait(cmd, p):
1142 raise GitError('%s returned %d' % (cmd, rv))
1145 def _git_capture(argv):
1146 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1148 _git_wait(repr(argv), p)
1152 class _AbortableIter:
1153 def __init__(self, it, onabort = None):
1155 self.onabort = onabort
1163 return self.it.next()
1164 except StopIteration, e:
1172 """Abort iteration and call the abortion callback, if needed."""
1184 """Link to 'git cat-file' that is used to retrieve blob data."""
1187 wanted = ('1','5','6')
1190 log('warning: git version < %s; bup will be slow.\n'
1193 self.get = self._slow_get
1195 self.p = self.inprogress = None
1196 self.get = self._fast_get
1200 self.p.stdout.close()
1201 self.p.stdin.close()
1203 self.inprogress = None
1207 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1208 stdin=subprocess.PIPE,
1209 stdout=subprocess.PIPE,
1212 preexec_fn = _gitenv)
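# 'git cat-file --batch' protocol, as consumed by _fast_get() below: we write
# one object name per line; for each, git answers with a header line
# '<sha> <type> <size>\n', then exactly <size> bytes of content, then a
# newline.  Unknown objects get '<name> missing\n' instead of a header.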
1214 def _fast_get(self, id):
1215 if not self.p or self.p.poll() != None:
1218 assert(self.p.poll() == None)
1220 log('_fast_get: opening %r while %r is open'
1221 % (id, self.inprogress))
1222 assert(not self.inprogress)
1223 assert(id.find('\n') < 0)
1224 assert(id.find('\r') < 0)
1225 assert(not id.startswith('-'))
1226 self.inprogress = id
1227 self.p.stdin.write('%s\n' % id)
1228 self.p.stdin.flush()
1229 hdr = self.p.stdout.readline()
1230 if hdr.endswith(' missing\n'):
1231 self.inprogress = None
1232 raise KeyError('blob %r is missing' % id)
1233 spl = hdr.split(' ')
1234 if len(spl) != 3 or len(spl[0]) != 40:
1235 raise GitError('expected blob, got %r' % spl)
1236 (hex, type, size) = spl
1238 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1239 onabort = self._abort)
1244 assert(self.p.stdout.readline() == '\n')
1245 self.inprogress = None
1246 except Exception, e:
1250 def _slow_get(self, id):
1251 assert(id.find('\n') < 0)
1252 assert(id.find('\r') < 0)
1253 assert(id[0] != '-')
1254 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1257 p = subprocess.Popen(['git', 'cat-file', type, id],
1258 stdout=subprocess.PIPE,
1259 preexec_fn = _gitenv)
1260 for blob in chunkyreader(p.stdout):
1262 _git_wait('git cat-file', p)
1264 def _join(self, it):
1269 elif type == 'tree':
1270 treefile = ''.join(it)
1271 for (mode, name, sha) in treeparse(treefile):
1272 for blob in self.join(sha.encode('hex')):
1274 elif type == 'commit':
1275 treeline = ''.join(it).split('\n')[0]
1276 assert(treeline.startswith('tree '))
1277 for blob in self.join(treeline[5:]):
1280 raise GitError('invalid object type %r: expected blob/tree/commit'
1284 """Generate a list of the content of all blobs that can be reached
1285 from an object. The hash given in 'id' must point to a blob, a tree
1286 or a commit. The content of all blobs that can be seen from trees or
1287 commits will be added to the list.
1290 for d in self._join(self.get(id)):
1292 except StopIteration:
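# CatPipe usage sketch (the ref and output file are illustrative):
#
#   cp = CatPipe()
#   for blob in cp.join('refs/heads/master'):
#       out.write(blob)   # streams every blob reachable from that commit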
1296 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1298 for (n,c) in list_refs():
1299 if n.startswith('refs/tags/'):
1304 tags[c].append(name) # more than one tag can point at 'c'